def main():
    args = parse_args()
    cfg.set_args(args.gpu_ids)
    cudnn.fastest = True
    cudnn.benchmark = True
    cudnn.deterministic = False
    cudnn.enabled = True

    tester = Tester(args.backbone)
    tester._make_batch_generator()

    for epoch in range(args.model_epoch[0], args.model_epoch[1]):
        tester._make_model(epoch)

        preds = []
        with torch.no_grad():
            for itr, input_img in enumerate(tqdm(tester.batch_generator)):
                # forward
                coord_out = tester.model(input_img)

                if cfg.flip_test:
                    flipped_input_img = flip(input_img, dims=3)
                    flipped_coord_out = tester.model(flipped_input_img)
                    flipped_coord_out[:, :, 0] = cfg.output_shape[1] - flipped_coord_out[:, :, 0] - 1
                    for pair in tester.flip_pairs:
                        flipped_coord_out[:, pair[0], :], flipped_coord_out[:, pair[1], :] = \
                            flipped_coord_out[:, pair[1], :].clone(), flipped_coord_out[:, pair[0], :].clone()
                    coord_out = (coord_out + flipped_coord_out) / 2.

                vis = False
                if vis:
                    filename = str(itr)
                    tmpimg = input_img[0].cpu().numpy()
                    tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
                    tmpimg = tmpimg.astype(np.uint8)
                    tmpimg = tmpimg[::-1, :, :]
                    tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
                    tmpkps = np.zeros((3, tester.joint_num))
                    tmpkps[:2, :] = coord_out[0, :, :2].cpu().numpy().transpose(1, 0) / cfg.output_shape[0] * cfg.input_shape[0]
                    tmpkps[2, :] = 1
                    tmpimg = vis_keypoints(tmpimg, tmpkps, tester.skeleton)
                    cv2.imwrite(filename + '_output.jpg', tmpimg)

                coord_out = coord_out.cpu().numpy()
                preds.append(coord_out)

        # evaluate
        preds = np.concatenate(preds, axis=0)
        tester._evaluate(preds, cfg.result_dir)
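# Note: `flip(input_img, dims=3)` above mirrors the batch along its width axis
# for the flip test. The helper itself is not shown in this section; a minimal
# sketch of the assumed behavior:
import torch

def flip(tensor, dims):
    # Mirror a (N, C, H, W) batch along the given axis (dims=3 -> width).
    return torch.flip(tensor, dims=[dims])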
def evaluate(self, preds, result_dir):
    print('Evaluation start...')
    gts = self.data
    sample_num = len(preds)
    joint_num = self.original_joint_num

    pred_2d_save = {}
    pred_3d_save = {}
    for n in range(sample_num):
        gt = gts[n]
        f = gt['f']
        c = gt['c']
        bbox = gt['bbox']
        gt_3d_root = gt['root_cam']
        img_name = gt['img_path'].split('/')
        img_name = img_name[-2] + '_' + img_name[-1].split('.')[0]  # e.g., TS1_img_0001

        # restore coordinates to original space
        pred_2d_kpt = preds[n].copy()
        # only consider eval_joint
        pred_2d_kpt = np.take(pred_2d_kpt, self.eval_joint, axis=0)
        pred_2d_kpt[:, 0] = pred_2d_kpt[:, 0] / cfg.output_shape[1] * bbox[2] + bbox[0]
        pred_2d_kpt[:, 1] = pred_2d_kpt[:, 1] / cfg.output_shape[0] * bbox[3] + bbox[1]
        pred_2d_kpt[:, 2] = (pred_2d_kpt[:, 2] / cfg.depth_dim * 2 - 1) * (cfg.bbox_3d_shape[0] / 2) + gt_3d_root[2]

        # 2d kpt save
        if img_name in pred_2d_save:
            pred_2d_save[img_name].append(pred_2d_kpt[:, :2])
        else:
            pred_2d_save[img_name] = [pred_2d_kpt[:, :2]]

        vis = False
        if vis:
            cvimg = cv2.imread(gt['img_path'], cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
            filename = str(random.randrange(1, 500))
            tmpimg = cvimg.copy().astype(np.uint8)
            tmpkps = np.zeros((3, joint_num))
            tmpkps[0, :], tmpkps[1, :] = pred_2d_kpt[:, 0], pred_2d_kpt[:, 1]
            tmpkps[2, :] = 1
            tmpimg = vis_keypoints(tmpimg, tmpkps, self.skeleton)
            cv2.imwrite(filename + '_output.jpg', tmpimg)

        # back project to camera coordinate system
        pred_3d_kpt = pixel2cam(pred_2d_kpt, f, c)

        # 3d kpt save
        if img_name in pred_3d_save:
            pred_3d_save[img_name].append(pred_3d_kpt)
        else:
            pred_3d_save[img_name] = [pred_3d_kpt]

    output_path = osp.join(result_dir, 'preds_2d_kpt_mupots.mat')
    sio.savemat(output_path, pred_2d_save)
    print("Testing result is saved at " + output_path)
    output_path = osp.join(result_dir, 'preds_3d_kpt_mupots.mat')
    sio.savemat(output_path, pred_3d_save)
    print("Testing result is saved at " + output_path)
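# Note: `pixel2cam` is called throughout this section but never defined here.
# A minimal sketch under the usual pinhole-camera assumption (x, y in pixels,
# z already metric; f is the focal length, c the principal point):
import numpy as np

def pixel2cam(pixel_coord, f, c):
    x = (pixel_coord[:, 0] - c[0]) / f[0] * pixel_coord[:, 2]
    y = (pixel_coord[:, 1] - c[1]) / f[1] * pixel_coord[:, 2]
    z = pixel_coord[:, 2]
    # Some call sites below unpack three per-axis arrays instead; for those,
    # `return x, y, z` is the assumed variant.
    return np.stack((x, y, z), axis=1)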
def evaluate(self, preds, result_dir):
    print('Evaluation start...')
    gts = self.data
    assert len(gts) == len(preds)
    sample_num = len(gts)
    joint_num = self.joint_num

    pred_save = []
    for n in range(sample_num):
        gt = gts[n]
        image_id = gt['img_id']
        f = gt['f']
        c = gt['c']
        bbox = gt['bbox']
        gt_3d_root = gt['root_cam']
        gt_3d_kpt = gt['joint_cam']
        gt_vis = gt['joint_vis']

        # restore coordinates to original space
        pre_2d_kpt = preds[n].copy()
        pre_2d_kpt[:, 0], pre_2d_kpt[:, 1], pre_2d_kpt[:, 2] = warp_coord_to_original(pre_2d_kpt, bbox, gt_3d_root)

        vis = False
        if vis:
            cvimg = cv2.imread(gt['img_path'], cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
            filename = str(random.randrange(1, 500))
            tmpimg = cvimg.copy().astype(np.uint8)
            tmpkps = np.zeros((3, joint_num))
            tmpkps[0, :], tmpkps[1, :] = pre_2d_kpt[:, 0], pre_2d_kpt[:, 1]
            tmpkps[2, :] = 1
            tmpimg = vis_keypoints(tmpimg, tmpkps, self.skeleton)
            cv2.imwrite(filename + '_output.jpg', tmpimg)

        # back project to camera coordinate system
        pred_3d_kpt = np.zeros((joint_num, 3))
        pred_3d_kpt[:, 0], pred_3d_kpt[:, 1], pred_3d_kpt[:, 2] = pixel2cam(pre_2d_kpt, f, c)

        # root joint alignment
        pred_3d_kpt = pred_3d_kpt - pred_3d_kpt[self.root_idx]
        gt_3d_kpt = gt_3d_kpt - gt_3d_kpt[self.root_idx]

        # rigid alignment for PA MPJPE (protocol #1)
        pred_3d_kpt_align = rigid_align(pred_3d_kpt, gt_3d_kpt)

        # exclude thorax
        pred_3d_kpt = np.take(pred_3d_kpt, self.eval_joint, axis=0)
        pred_3d_kpt_align = np.take(pred_3d_kpt_align, self.eval_joint, axis=0)

        # prediction save
        pred_save.append({
            'image_id': image_id,
            'joint_cam': pred_3d_kpt.tolist(),
            'joint_cam_aligned': pred_3d_kpt_align.tolist(),
            'bbox': bbox.tolist(),
            'root_cam': gt_3d_root.tolist()
        })  # joint_cams are root-relative

    output_path = osp.join(result_dir, 'bbox_root_pose_human36m_output.json')
    with open(output_path, 'w') as f:
        json.dump(pred_save, f)
    print("Test result is saved at " + output_path)
    calculate_score(output_path, self.annot_dir, self.subject_list)
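# Note: `rigid_align`, used for PA MPJPE, is also external. A compact
# Procrustes-style sketch, assuming it fits a similarity transform
# (scale, rotation, translation) from the prediction A onto the ground truth B:
import numpy as np

def rigid_align(A, B):
    # A, B: (N, 3) joint arrays; returns A aligned onto B.
    mu_A, mu_B = A.mean(0), B.mean(0)
    A0, B0 = A - mu_A, B - mu_B
    U, S, Vt = np.linalg.svd(A0.T @ B0)  # optimal rotation via SVD of the covariance
    R = Vt.T @ U.T
    if np.linalg.det(R) < 0:  # avoid reflections
        Vt[-1] *= -1
        S[-1] *= -1
        R = Vt.T @ U.T
    s = S.sum() / (A0 ** 2).sum()  # optimal scale
    return s * A0 @ R.T + mu_B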
def show_pose(input_patch, img_patch, coord_out, skeleton, save_path, frame):
    tmpimg = input_patch[0].cpu().numpy()
    tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
    tmpimg = tmpimg.astype(np.uint8)
    tmpimg = tmpimg[::-1, :, :]
    tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
    tmpkps = np.zeros((3, 18))
    tmpkps[:2, :] = coord_out[0, :, :2].transpose(1, 0) / cfg.output_shape[0] * cfg.input_shape[0]
    tmpkps[2, :] = 1
    tmpimg = vis_keypoints(tmpimg, tmpkps, skeleton)
    tmpimg = cv2.resize(tmpimg, (img_patch.shape[1], img_patch.shape[0]))
    file_name = save_path + '\\{0}.png'.format(str(frame).zfill(2))
    cv2.imwrite(file_name, tmpimg)
def train(loader, ds_rd, model, criterion, optimizer, epoch, n_iter=-1, logger=None, opts=None, visualizer=None):
    '''
    Iterate through one epoch and return rst {'losses', 'accs'}, each a list,
    which can be used outside for updating.
    :param loader:
    :param model:
    :param criterion:
    :param optimizer:
    :param epoch: for printing info
    :param n_iter: the number of iterations wanted, -1 for all iters
    :param opts: keeps some additional controls
    :param visualizer: for visualization
    :return:
    '''
    batch_time = ut.AverageMeter()
    data_time = ut.AverageMeter()
    losses = ut.AverageMeter()
    acc = ut.AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    li_loss = []
    li_acc = []
    for i, inp_dct in enumerate(loader):
        # get items; break if n_iter is set and i has reached it
        if i >= n_iter and n_iter > 0:
            break
        input = inp_dct['pch']
        target = inp_dct['hms']  # 14 x 64 x 1??
        target_weight = inp_dct['joints_vis']  # weight: visible or not

        # measure data loading time
        data_time.update(time.time() - end)

        # compute output
        outputs = model(input)  # no need to cuda it?
        target = target.cuda(non_blocking=True)
        target_weight = target_weight.cuda(non_blocking=True)

        if isinstance(outputs, list):  # multi-stage version returns a list
            loss = criterion(outputs[0], target, target_weight)
            for output in outputs[1:]:
                loss += criterion(output, target, target_weight)
        else:
            output = outputs
            loss = criterion(output, target, target_weight)

        # compute gradient and do update step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure accuracy and record loss
        losses.update(loss.item(), input.size(0))
        _, avg_acc, cnt, pred = accuracy(
            output.detach().cpu().numpy(),
            target.detach().cpu().numpy()
        )  # on heatmaps directly, normalized by 1/10 of the heatmap dim (PCK@0.5); cnt: n_smp
        acc.update(avg_acc, cnt)  # keep average acc

        if visualizer and 0 == i % opts.update_html_freq:
            # update current result, get vis dict
            n_jt = ds_rd.joint_num_ori
            mod0 = opts.mod_src[0]
            mean = ds_rd.means[mod0]
            std = ds_rd.stds[mod0]
            img_patch_vis = ut.ts2cv2(input[0], mean, std)  # to CV BGR; mean/std control channels, detach inside
            # pseudo colormap
            cm = getattr(cv2, ds_rd.dct_clrMap[mod0])
            img_patch_vis = cv2.applyColorMap(img_patch_vis, cm)[..., ::-1]  # RGB
            # get pred
            pred2d_patch = np.ones((n_jt, 3))  # 3rd column for vis
            pred2d_patch[:, :2] = pred[0] / opts.out_shp[0] * opts.sz_pch[1]
            img_skel = vis.vis_keypoints(img_patch_vis, pred2d_patch, ds_rd.skels_idx)
            hm_gt = target[0].cpu().detach().numpy().sum(axis=0)  # H x W
            hm_gt = ut.normImg(hm_gt)
            hm_pred = output[0].detach().cpu().numpy().sum(axis=0)
            hm_pred = ut.normImg(hm_pred)
            img_cb = vis.hconcat_resize([img_skel, hm_gt, hm_pred])
            vis_dict = {'img_cb': img_cb}
            visualizer.display_current_results(vis_dict, epoch, False)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % opts.print_freq == 0:
            msg = 'Epoch: [{0}][{1}/{2}]\t' \
                  'Time {batch_time.val:.3f}s ({batch_time.avg:.3f}s)\t' \
                  'Speed {speed:.1f} samples/s\t' \
                  'Data {data_time.val:.3f}s ({data_time.avg:.3f}s)\t' \
                  'Loss {loss.val:.5f} ({loss.avg:.5f})\t' \
                  'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
                      epoch, i, len(loader),
                      batch_time=batch_time,
                      speed=input.size(0) / batch_time.val,
                      data_time=data_time,
                      loss=losses,
                      acc=acc)
            logger.info(msg)
        li_loss.append(losses.val)  # the current loss
        li_acc.append(acc.val)

    return {'losses': li_loss, 'accs': li_acc}
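# Note: `ut.AverageMeter` is the usual running-average utility; a minimal
# sketch of the behavior the training loop above assumes:
class AverageMeter:
    def __init__(self):
        self.val = 0.0   # most recent value
        self.sum = 0.0   # weighted sum of values
        self.count = 0   # total weight (e.g., batch size)
        self.avg = 0.0   # running average

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count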
def evaluate(self, preds, result_dir):
    print('Evaluation start...')
    gts = self.data
    assert len(gts) == len(preds)
    sample_num = len(gts)

    pred_save = []
    error = np.zeros((sample_num, self.joint_num - 1))  # joint error
    error_action = [[] for _ in range(len(self.action_name))]  # error for each sequence
    for n in range(sample_num):
        gt = gts[n]
        image_id = gt['img_id']
        f = gt['f']
        c = gt['c']
        bbox = gt['bbox']
        gt_3d_root = gt['root_cam']
        gt_3d_kpt = gt['joint_cam']
        gt_vis = gt['joint_vis']

        # restore coordinates to original space
        pred_2d_kpt = preds[n].copy()
        pred_2d_kpt[:, 0] = pred_2d_kpt[:, 0] / cfg.output_shape[1] * bbox[2] + bbox[0]
        pred_2d_kpt[:, 1] = pred_2d_kpt[:, 1] / cfg.output_shape[0] * bbox[3] + bbox[1]
        pred_2d_kpt[:, 2] = (pred_2d_kpt[:, 2] / cfg.depth_dim * 2 - 1) * (cfg.bbox_3d_shape[0] / 2) + gt_3d_root[2]

        vis = False
        if vis:
            cvimg = cv2.imread(gt['img_path'], cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
            filename = str(random.randrange(1, 500))
            tmpimg = cvimg.copy().astype(np.uint8)
            tmpkps = np.zeros((3, self.joint_num))
            tmpkps[0, :], tmpkps[1, :] = pred_2d_kpt[:, 0], pred_2d_kpt[:, 1]
            tmpkps[2, :] = 1
            tmpimg = vis_keypoints(tmpimg, tmpkps, self.skeleton)
            cv2.imwrite(filename + '_output.jpg', tmpimg)

        # back project to camera coordinate system
        pred_3d_kpt = pixel2cam(pred_2d_kpt, f, c)

        # root joint alignment
        pred_3d_kpt = pred_3d_kpt - pred_3d_kpt[self.root_idx]
        gt_3d_kpt = gt_3d_kpt - gt_3d_kpt[self.root_idx]

        if self.protocol == 1:
            # rigid alignment for PA MPJPE (protocol #1)
            pred_3d_kpt = rigid_align(pred_3d_kpt, gt_3d_kpt)

        # exclude thorax
        pred_3d_kpt = np.take(pred_3d_kpt, self.eval_joint, axis=0)
        gt_3d_kpt = np.take(gt_3d_kpt, self.eval_joint, axis=0)

        # error calculation
        error[n] = np.sqrt(np.sum((pred_3d_kpt - gt_3d_kpt) ** 2, 1))
        img_name = gt['img_path']
        action_idx = int(img_name[img_name.find('act') + 4:img_name.find('act') + 6]) - 2
        error_action[action_idx].append(error[n].copy())

        # prediction save
        pred_save.append({
            'image_id': image_id,
            'joint_cam': pred_3d_kpt.tolist(),
            'bbox': bbox.tolist(),
            'root_cam': gt_3d_root.tolist()
        })  # joint_cam is a root-relative coordinate

    # total error
    tot_err = np.mean(error)
    metric = 'PA MPJPE' if self.protocol == 1 else 'MPJPE'
    eval_summary = 'Protocol ' + str(self.protocol) + ' error (' + metric + ') >> tot: %.2f\n' % tot_err

    # error for each action
    for i in range(len(error_action)):
        err = np.mean(np.array(error_action[i]))
        eval_summary += (self.action_name[i] + ': %.2f ' % err)
    print(eval_summary)

    # prediction save
    output_path = osp.join(result_dir, 'bbox_root_pose_human36m_output.json')
    with open(output_path, 'w') as f:
        json.dump(pred_save, f)
    print("Test result is saved at " + output_path)
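# Note: the z decode above maps a discretized heatmap depth bin back to metric,
# root-centered depth. A quick sanity check with illustrative values
# (depth_dim=64 and a 2000 mm bbox_3d_shape are assumptions, not read from
# this config):
depth_dim = 64          # number of discrete depth bins
bbox_depth_mm = 2000.0  # cfg.bbox_3d_shape[0]: depth span of the 3D bbox, mm
root_z = 5000.0         # absolute root depth (ground truth or RootNet), mm
z_bin = 32.0            # a predicted bin at the middle of the range
z_mm = (z_bin / depth_dim * 2 - 1) * (bbox_depth_mm / 2) + root_z
print(z_mm)  # 5000.0 -> the bin-range center sits exactly at the root depth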
pose_3d[:, 0] = pose_3d[:, 0] / cfg.output_shape[1] * cfg.input_shape[1]
pose_3d[:, 1] = pose_3d[:, 1] / cfg.output_shape[0] * cfg.input_shape[0]
pose_3d_xy1 = np.concatenate((pose_3d[:, :2], np.ones_like(pose_3d[:, :1])), 1)
img2bb_trans_001 = np.concatenate((img2bb_trans, np.array([0, 0, 1]).reshape(1, 3)))
pose_3d[:, :2] = np.dot(np.linalg.inv(img2bb_trans_001), pose_3d_xy1.transpose(1, 0)).transpose(1, 0)[:, :2]
output_pose_2d_list.append(pose_3d[:, :2].copy())

# root-relative discretized depth -> absolute continuous depth
pose_3d[:, 2] = (pose_3d[:, 2] / cfg.depth_dim * 2 - 1) * (cfg.bbox_3d_shape[0] / 2) + root_depth_list[n]
pose_3d = pixel2cam(pose_3d, focal, princpt)
output_pose_3d_list.append(pose_3d.copy())

# visualize 2d poses
vis_img = original_img.copy()
for n in range(person_num):
    vis_kps = np.zeros((3, joint_num))
    vis_kps[0, :] = output_pose_2d_list[n][:, 0]
    vis_kps[1, :] = output_pose_2d_list[n][:, 1]
    vis_kps[2, :] = 1
    vis_img = vis_keypoints(vis_img, vis_kps, skeleton)
cv2.imwrite('output_pose_2d.jpg', vis_img)

# visualize 3d poses
vis_kps = np.array(output_pose_3d_list)
vis_3d_multiple_skeleton(vis_kps, np.ones_like(vis_kps), skeleton, 'output_pose_3d (x,y,z: camera-centered. mm.)')
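# Note: to undo the crop, the 2x3 patch affine `img2bb_trans` is padded with
# [0, 0, 1] into a 3x3 matrix and inverted, as above. A tiny self-contained
# check (the translation values here are made up):
import numpy as np

img2bb_trans = np.array([[1.0, 0.0, -100.0],   # hypothetical crop affine:
                         [0.0, 1.0, -50.0]])   # identity scale, (100, 50) offset
img2bb_trans_001 = np.concatenate((img2bb_trans, np.array([[0.0, 0.0, 1.0]])))
pt_patch = np.array([10.0, 20.0, 1.0])  # a point in patch coordinates
pt_img = np.linalg.inv(img2bb_trans_001) @ pt_patch
print(pt_img[:2])  # [110. 70.] -> back in original-image coordinates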
def evaluate(self, preds, result_dir):
    print()
    print('Evaluation start...')
    gts = self.load_data()
    assert len(gts) == len(preds)
    sample_num = len(gts)
    joint_num = self.joint_num

    p1_error = np.zeros((sample_num, joint_num, 3))  # PA MPJPE (protocol #1 metric)
    p2_error = np.zeros((sample_num, joint_num, 3))  # MPJPE (protocol #2 metric)
    p1_error_action = [[] for _ in range(len(self.action_idx))]  # PA MPJPE for each action
    p2_error_action = [[] for _ in range(len(self.action_idx))]  # MPJPE for each action
    pred_to_save = []
    for n in range(sample_num):
        gt = gts[n]
        f = gt['f']
        c = gt['c']
        bbox = gt['bbox']
        gt_3d_center = gt['center_cam']
        gt_3d_kpt = gt['joint_cam']
        gt_vis = gt['joint_vis'].copy()

        # restore coordinates to original space
        pre_2d_kpt = preds[n].copy()
        pre_2d_kpt[:, 0], pre_2d_kpt[:, 1], pre_2d_kpt[:, 2] = warp_coord_to_original(pre_2d_kpt, bbox, gt_3d_center)

        vis = False
        if vis:
            cvimg = cv2.imread(gt['img_path'], cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
            filename = str(random.randrange(1, 500))
            tmpimg = cvimg.copy().astype(np.uint8)
            tmpkps = np.zeros((3, joint_num))
            tmpkps[0, :], tmpkps[1, :] = pre_2d_kpt[:, 0], pre_2d_kpt[:, 1]
            tmpkps[2, :] = 1
            tmpimg = vis_keypoints(tmpimg, tmpkps, self.skeleton)
            cv2.imwrite(osp.join(cfg.vis_dir, filename + '_output.jpg'), tmpimg)

        # back project to camera coordinate system
        pre_3d_kpt = np.zeros((joint_num, 3))
        pre_3d_kpt[:, 0], pre_3d_kpt[:, 1], pre_3d_kpt[:, 2] = pixel2cam(pre_2d_kpt, f, c)

        vis = False
        if vis:
            vis_3d_skeleton(pre_3d_kpt, gt_vis, self.skeleton, filename)

        # root joint alignment
        pre_3d_kpt = pre_3d_kpt - pre_3d_kpt[self.root_idx]
        gt_3d_kpt = gt_3d_kpt - gt_3d_kpt[self.root_idx]

        # rigid alignment for PA MPJPE (protocol #1)
        pre_3d_kpt_align = rigid_align(pre_3d_kpt, gt_3d_kpt)

        # prediction save
        pred_to_save.append({
            'pred': pre_3d_kpt,
            'align_pred': pre_3d_kpt_align,
            'gt': gt_3d_kpt
        })

        # error save
        p1_error[n] = np.power(pre_3d_kpt_align - gt_3d_kpt, 2)  # PA MPJPE (protocol #1)
        p2_error[n] = np.power(pre_3d_kpt - gt_3d_kpt, 2)  # MPJPE (protocol #2)

        img_name = gt['img_path']
        action_idx = int(img_name[img_name.find('act') + 4:img_name.find('act') + 6]) - 2
        p1_error_action[action_idx].append(p1_error[n].copy())
        p2_error_action[action_idx].append(p2_error[n].copy())

    # total error calculation
    p1_error = np.take(p1_error, self.eval_joint, axis=1)
    p2_error = np.take(p2_error, self.eval_joint, axis=1)
    p1_error = np.mean(np.power(np.sum(p1_error, axis=2), 0.5))
    p2_error = np.mean(np.power(np.sum(p2_error, axis=2), 0.5))
    p1_eval_summary = 'Protocol #1 error (PA MPJPE) >> %.2f' % p1_error
    p2_eval_summary = 'Protocol #2 error (MPJPE) >> %.2f' % p2_error
    print()
    print(p1_eval_summary)
    print(p2_eval_summary)

    # error calculation for each action
    p1_action_eval_summary = 'Protocol #1 error (PA MPJPE) for each action: \n'
    for i in range(len(p1_error_action)):
        err = np.array(p1_error_action[i])
        err = np.take(err, self.eval_joint, axis=1)
        err = np.mean(np.power(np.sum(err, axis=2), 0.5))
        action_name = self.action_name[i]
        p1_action_eval_summary += (action_name + ': %.2f\n' % err)
    p2_action_eval_summary = 'Protocol #2 error (MPJPE) for each action: \n'
    for i in range(len(p2_error_action)):
        err = np.array(p2_error_action[i])
        err = np.take(err, self.eval_joint, axis=1)
        err = np.mean(np.power(np.sum(err, axis=2), 0.5))
        action_name = self.action_name[i]
        p2_action_eval_summary += (action_name + ': %.2f\n' % err)
    print()
    print(p1_action_eval_summary)
    print(p2_action_eval_summary)

    # result save
    f_pred_3d_kpt = open(osp.join(result_dir, 'pred_3d_kpt.txt'), 'w')
    f_pred_3d_kpt_align = open(osp.join(result_dir, 'pred_3d_kpt_align.txt'), 'w')
    f_gt_3d_kpt = open(osp.join(result_dir, 'gt_3d_kpt.txt'), 'w')
    for i in range(len(pred_to_save)):
        for j in range(joint_num):
            for k in range(3):
                f_pred_3d_kpt.write('%.3f ' % pred_to_save[i]['pred'][j][k])
                f_pred_3d_kpt_align.write('%.3f ' % pred_to_save[i]['align_pred'][j][k])
                f_gt_3d_kpt.write('%.3f ' % pred_to_save[i]['gt'][j][k])
        f_pred_3d_kpt.write('\n')
        f_pred_3d_kpt_align.write('\n')
        f_gt_3d_kpt.write('\n')
    f_pred_3d_kpt.close()
    f_pred_3d_kpt_align.close()
    f_gt_3d_kpt.close()

    f_eval_result = open(osp.join(result_dir, 'eval_result.txt'), 'w')
    f_eval_result.write(p1_eval_summary)
    f_eval_result.write('\n')
    f_eval_result.write(p2_eval_summary)
    f_eval_result.write('\n')
    f_eval_result.write(p1_action_eval_summary)
    f_eval_result.write('\n')
    f_eval_result.write(p2_action_eval_summary)
    f_eval_result.write('\n')
    f_eval_result.close()
def visualize(self, preds, filename, img_path, inv_trans):
    joint_valid = [1.0] * 21 + [1.0] * 21  # change 1.0 to 0 if that hand is not present; the right hand comes first in the output
    focal = [1500, 1500]  # x-axis, y-axis
    princpt = [256 / 2, 256 / 2]
    root_joint_idx = {'right': 20, 'left': 41}
    preds_joint_coord, preds_rel_root_depth, preds_hand_type = preds['joint_coord'], preds['rel_root_depth'], preds['hand_type']
    print(preds_hand_type)
    # img = load_image(img_path)
    # inv_trans = augmentation(img,
    #                          np.array(bbox),
    #                          preds_joint_coord,
    #                          joint_valid,
    #                          preds_hand_type,
    #                          'test',
    #                          None)
    pred_joint_coord_img = preds_joint_coord[0].copy()
    pred_joint_coord_img[:, 0] = pred_joint_coord_img[:, 0] / cfg.output_hm_shape[2] * cfg.input_img_shape[1]
    pred_joint_coord_img[:, 1] = pred_joint_coord_img[:, 1] / cfg.output_hm_shape[1] * cfg.input_img_shape[0]
    for j in range(21 * 2):
        pred_joint_coord_img[j, :2] = trans_point2d(pred_joint_coord_img[j, :2], inv_trans)
    pred_joint_coord_img[:, 2] = (pred_joint_coord_img[:, 2] / cfg.output_hm_shape[0] * 2 - 1) * (cfg.bbox_3d_size / 2)

    # if preds_hand_type[0][0] == 0.9 and preds_hand_type[0][1] == 0.9:  # change threshold to execute this part if both hands are present
    #     pred_rel_root_depth = (preds_rel_root_depth[0]/cfg.output_root_hm_shape * 2 - 1) * (cfg.bbox_3d_size_root/2)
    #     pred_left_root_img = pred_joint_coord_img[root_joint_idx['left']].copy()
    #     pred_left_root_img[2] += pred_rel_root_depth
    #     pred_left_root_cam = pixel2cam(pred_left_root_img[None,:], focal, princpt)[0]
    #     pred_right_root_img = pred_joint_coord_img[root_joint_idx['right']].copy()
    #     pred_right_root_cam = pixel2cam(pred_right_root_img[None,:], focal, princpt)[0]
    #     pred_rel_root = pred_left_root_cam - pred_right_root_cam
    #     pred_joint_coord_cam = pixel2cam(pred_joint_coord_img, focal, princpt)
    #     joint_type = {'right': np.arange(0,21), 'left': np.arange(21,21*2)}
    #     for h in ('right', 'left'):
    #         pred_joint_coord_cam[joint_type[h]] = pred_joint_coord_cam[joint_type[h]] - pred_joint_coord_cam[root_joint_idx[h],None,:]
    #     print("^^^^^^^^^^^^^^^^^^^^^^^^^^^")
    #     print(pred_joint_coord_cam.shape)

    print(img_path)
    cvimg = cv2.imread(img_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
    _img = cvimg[:, :, ::-1].transpose(2, 0, 1)
    vis_kps = pred_joint_coord_img.copy()
    vis_valid = joint_valid.copy()
    filename = img_path.replace(".jpg", "2D.jpg").replace("main/custom_data", "custom_output")
    vis_keypoints(_img, vis_kps, joint_valid, self.skeleton, filename)
    filename = img_path.replace(".jpg", "3D.jpg").replace("main/custom_data", "custom_output")
    vis_3d_keypoints(pred_joint_coord_img, joint_valid, self.skeleton, filename)
    print("Finished Processing Image!!!!" + "\n")
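# Note: `trans_point2d` applies a 2x3 affine (here the inverse crop transform)
# to a single point; a minimal sketch of the assumed behavior:
import numpy as np

def trans_point2d(pt_2d, trans):
    src_pt = np.array([pt_2d[0], pt_2d[1], 1.0])  # homogeneous (x, y, 1)
    dst_pt = np.dot(trans, src_pt)                # trans: (2, 3) affine matrix
    return dst_pt[0:2]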
# handedness
joint_valid = np.zeros((joint_num * 2), dtype=np.float32)
right_exist = False
if hand_type[0] > 0.5:
    right_exist = True
    joint_valid[joint_type['right']] = 1
left_exist = False
if hand_type[1] > 0.5:
    left_exist = True
    joint_valid[joint_type['left']] = 1

print('Right hand exist: ' + str(right_exist) + ' Left hand exist: ' + str(left_exist))

# visualize joint coord in 2D space
filename = 'result_2d.jpg'
vis_img = original_img.copy()[:, :, ::-1].transpose(2, 0, 1)
vis_img = vis_keypoints(vis_img, joint_coord, joint_valid, skeleton, filename, save_path='.')

# visualize joint coord in 3D space
# The 3D coordinate here consists of the x,y pixel positions and a root-relative depth z.
# To express x, y, and z in a real unit (e.g., mm), you need to know the camera intrinsics and the root depth.
# The root depth can be obtained from RootNet (https://github.com/mks0601/3DMPPE_ROOTNET_RELEASE)
filename = 'result_3d'
vis_3d_keypoints(joint_coord, joint_valid, skeleton, filename)
def main():
    video_list = ['Cant stop the feeling - Justin Timberlake - Easy Dance for Kids',
                  'Dance like yo daddy',
                  'Danny Ocean - Baby I Wont',
                  'Si una vez - If I Once',
                  'Vaiven - MegaMix']
    for video in video_list:
        video_dir = 'dance_videos\\' + video + '.mp4'
        beat_dir = video_dir.strip('mp4') + 'npy'

        cudnn.fastest = True
        cudnn.benchmark = True
        cudnn.deterministic = False
        cudnn.enabled = True

        time_0 = time.time()
        tester = Tester(24)  # loading 3D pose estimation model
        tester._make_model()
        time_1 = time.time()
        print('loading integral pose model elapse:', round(time_1 - time_0, 2), 's')

        # loading yolo detector
        detector = YOLOv3(
            model_def="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\config\\yolov3.cfg",
            class_path="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\data\\coco.names",
            weights_path="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\weights\\yolov3.weights",
            classes=('person',),
            max_batch_size=16,
            device=torch.device('cuda:{}'.format(cfg.gpu_ids[0])))
        print('loading yolo elapse:', round(time.time() - time_1, 2), 's')

        skeleton = ((0, 7), (7, 8), (8, 9), (9, 10), (8, 11), (11, 12), (12, 13),
                    (8, 14), (14, 15), (15, 16), (0, 1), (1, 2), (2, 3), (0, 4),
                    (4, 5), (5, 6))
        # fig = plt.figure(figsize=(10,10))
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std)
        ])

        if not os.path.exists(video_dir.strip('mp4') + 'wav'):
            videoclip = VideoFileClip(video_dir)
            audioclip = videoclip.audio
            audioclip.write_audiofile(video_dir.strip('mp4') + 'wav')
        video = cv2.VideoCapture(video_dir)
        if not os.path.exists(beat_dir):
            time_2 = time.time()
            videoclip = VideoFileClip(video_dir)
            audioclip = videoclip.audio
            beat_activation = RNNBeatProcessor()(video_dir.strip('mp4') + 'wav')
            processor = DBNBeatTrackingProcessor(fps=100)
            beats = processor(beat_activation)
            frames_at_beat = (beats / audioclip.duration * video.get(cv2.CAP_PROP_FRAME_COUNT)).astype(int)
            print('extracting beat sequence elapse:', round(time.time() - time_2, 2), 's')
            np.save(beat_dir, frames_at_beat)
        frames_at_beat = np.load(beat_dir).tolist()

        ##########################################
        dance_primitives_dir = '.\\danceprimitives_trial'
        if not os.path.exists(dance_primitives_dir):
            os.mkdir(dance_primitives_dir)
        motion_index = len(os.listdir(dance_primitives_dir))
        for i in range(len(frames_at_beat) - 1):
            motion_dir = os.path.join(dance_primitives_dir, '{0}'.format(str(motion_index).zfill(5)))
            if not os.path.exists(motion_dir):
                os.mkdir(motion_dir)
            start = frames_at_beat[i]
            end = frames_at_beat[i + 1]
            dance_primitive = np.empty((0, 17 * 3))  # for motion control
            # dance_primitive_norm = np.empty((0, 17*3))  # for motion clustering
            video.set(1, start)
            jump_flag = 0
            frame = 0
            with torch.no_grad():
                time_start = time.time()
                while True:
                    current_frame = video.get(cv2.CAP_PROP_POS_FRAMES)
                    ret_val, raw_image = video.read()
                    if current_frame == end:
                        break
                    # using yolo to get human bounding box
                    input_img = raw_image.copy()
                    detections = detector.predict_single(input_img)
                    if detections is None or detections.size()[0] == 0:
                        jump_flag = 1
                        break
                    last_conf = 0
                    for det_idx, (x1_pred, y1_pred, x2_pred, y2_pred, conf, cls_conf, cls_pred) in enumerate(detections):
                        if conf.item() > last_conf:
                            # enlarge the bounding box to cover the full body, for a more accurate pose
                            x1 = max(int(round(x1_pred.item())) - 40, 0)
                            x2 = min(int(round(x2_pred.item())) + 40, input_img.shape[1] - 1)
                            y1 = max(int(round(y1_pred.item())) - 20, 0)
                            y2 = min(int(round(y2_pred.item())) + 20, input_img.shape[0] - 1)
                            last_conf = conf.item()
                    img_patch = (input_img[y1:y2, x1:x2, ::-1]).copy().astype(np.float32)

                    # using ResPoseNet to get the 3D human pose
                    input_patch = cv2.resize(img_patch, (cfg.input_shape))
                    input_patch = transform(input_patch).unsqueeze(0)
                    coord_out = tester.model(input_patch).cpu().numpy()  # dimension: 1 x 18 x 3, where '3' refers to x, z, y in sequence
                    # show_pose(input_patch, img_patch, coord_out, skeleton, motion_dir, frame)
                    coord_out_resize = coord_out * np.array([img_patch.shape[1] / cfg.input_shape[1],
                                                             img_patch.shape[0] / cfg.input_shape[0],
                                                             1])  # transform to original scale
                    coord_out = coord_out_resize[:, :-1, :]  # drop the key point for the thorax
                    # coord_out_norm = (coord_out - np.mean(coord_out, axis=1)) / np.std(coord_out, axis=1)
                    dance_primitive = np.vstack((dance_primitive, np.reshape(coord_out[0], -1)))
                    # dance_primitive_norm = np.vstack((dance_primitive_norm, np.reshape(coord_out_norm[0], -1)))
                    frame += 1
                print('Processing Time Elapse:', round(time.time() - time_start, 2), 's')
            if jump_flag == 1:
                continue
            # norm_sample = np.empty((0, 17*3))
            # num_sample = 10
            # print(dance_primitive_norm.shape[0])
            # sample_step = (dance_primitive_norm.shape[0]-1)/(num_sample-1)
            # for i in range(num_sample):
            #     norm_sample = np.vstack((norm_sample, dance_primitive_norm[round(i * sample_step)]))
            # print(norm_sample.shape)
            print(dance_primitive.shape)
            # np.save(os.path.join(motion_dir, 'dance_motion_normlized_' + str(motion_index)), norm_sample)
            np.save(os.path.join(motion_dir, 'dance_motion_' + str(motion_index)), dance_primitive)
            motion_index += 1

        ###########################################
        sys.exit()  # everything below this point is unreachable dead code kept from an earlier version

        video.set(1, interval[0])
        frame = 0
        next_beat = 0
        last_beat = 0
        num_beat = 0
        num_frame_between_beats = []
        with torch.no_grad():
            while True:
                time_start = time.time()
                current_frame = video.get(cv2.CAP_PROP_POS_FRAMES)
                ret_val, raw_image = video.read()
                if current_frame == interval[1]:
                    break
                input_img = raw_image.copy()
                # using yolo to get human bounding box
                detections = detector.predict_single(input_img)
                # if not detections.cpu().numpy().all():
                #     detections = (0,0,input_img.shape[1],input_img.shape[0],1,1)
                #     print('not detected')
                if detections is None:
                    detections = np.array([[0, 0, input_img.shape[1], input_img.shape[0], 1, 1, 1]])
                    print('not detected')
                elif detections.size()[0] == 0:
                    detections = np.array([[0, 0, input_img.shape[1], input_img.shape[0], 1, 1, 1]])
                    print('not detected')
                last_conf = 0
                last_last_conf = 0
                for det_idx, (x1_pred, y1_pred, x2_pred, y2_pred, conf, cls_conf, cls_pred) in enumerate(detections):
                    if conf.item() > last_conf:
                        # enlarge the bounding box to cover the full body, for a more accurate pose
                        x1 = int(round(x1_pred.item())) - 40
                        x2 = int(round(x2_pred.item())) + 40
                        y1 = int(round(y1_pred.item())) - 20
                        y2 = int(round(y2_pred.item())) + 20
                        last_last_conf = last_conf
                        last_conf = conf.item()
                print(last_conf, last_last_conf)
                if last_last_conf != 0:
                    sys.exit()
                # print(x1, x2, y1, y2, last_conf)
                img_patch = (input_img[y1:y2, x1:x2, ::-1]).copy().astype(np.float32)
                input_patch = cv2.resize(img_patch, (cfg.input_shape))
                input_patch = transform(input_patch).unsqueeze(0)
                coord_out = tester.model(input_patch)
                print('Running model time:', round(time.time() - time_start, 2), 's')

                motion['frame'][frame] = {}
                if frame + interval[0] in frames_at_beat:
                    motion['frame'][frame]['next_beat'] = 0
                    motion['frame'][frame]['last_beat'] = 0
                    # frames_at_beat.remove(frame)
                    next_beat = frames_at_beat.index(frame + interval[0]) + 1
                    last_beat = frames_at_beat.index(frame + interval[0])
                    num_beat += 1
                    num_frame_between_beats.append(frames_at_beat[next_beat] - frames_at_beat[last_beat])
                    print('Record key frame with beat:', current_frame)
                else:
                    motion['frame'][frame]['next_beat'] = frames_at_beat[next_beat] - (frame + interval[0])
                    motion['frame'][frame]['last_beat'] = (frame + interval[0]) - frames_at_beat[last_beat]

                coord_out = coord_out.cpu().numpy()
                coord_out_resize = coord_out * np.array([img_patch.shape[1] / cfg.input_shape[1],
                                                         img_patch.shape[0] / cfg.input_shape[0],
                                                         1])
                for idx in range(coord_out_resize.shape[1] - 1):
                    motion['frame'][frame][idx] = (coord_out_resize[0][idx][0].item(),
                                                   coord_out_resize[0][idx][2].item(),
                                                   coord_out_resize[0][idx][1].item())

                vis = True
                vis_3d = False
                if vis:
                    tmpimg = input_patch[0].cpu().numpy()
                    tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
                    tmpimg = tmpimg.astype(np.uint8)
                    tmpimg = tmpimg[::-1, :, :]
                    tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
                    tmpkps = np.zeros((3, 18))
                    tmpkps[:2, :] = coord_out[0, :, :2].transpose(1, 0) / cfg.output_shape[0] * cfg.input_shape[0]
                    tmpkps[2, :] = 1
                    tmpimg = vis_keypoints(tmpimg, tmpkps, skeleton)
                    tmpimg = cv2.resize(tmpimg, (img_patch.shape[1], img_patch.shape[0]))
                    file_name = pose_save_dir + '\\{0}.png'.format(str(frame).zfill(4))
                    cv2.imwrite(file_name, tmpimg)
                if vis_3d:
                    # coord_out = coord_out.cpu().numpy()
                    # coord_out = coord_out * np.array([img_patch.shape[1]/cfg.input_shape[1], img_patch.shape[0]/cfg.input_shape[0], 1])
                    pred = coord_out_resize.squeeze()  # remove the first batch dimension
                    ax = plt.subplot('121', projection='3d')
                    plt.axis('off')
                    show3D_pose(pred, ax, skeleton, radius=40)
                    file_name = pose_save_dir + '\\{0}.png'.format(str(frame).zfill(4))
                    plt.savefig(file_name)
                    # cv2.imwrite(file_name, tmpimg)
                frame += 1
                print('Processing Frame:', round(time.time() - time_start, 2), 's')

        motion['feature']['fpb'] = np.mean(num_frame_between_beats)
        if REDU:
            motion_base[len(motion_base) - 1] = motion
        else:
            motion_base[len(motion_base)] = motion
        # with open(motion_base_dir, 'w') as f:
        #     json.dump(motion_base, f)
        print('done with', num_beat + 1, 'beats! (This should be even for a normal dance)')
        print('num_frame between beats:')
        print(num_frame_between_beats)
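# Note on the beat-to-frame mapping used in both scripts:
# `beats / duration * frame_count` is simply `beats * fps`. A tiny check with
# made-up numbers:
import numpy as np

beats = np.array([0.5, 1.0, 1.5])  # beat times in seconds (madmom DBN output)
duration = 10.0                    # audio clip length, s
frame_count = 300.0                # total video frames -> fps = 30
frames_at_beat = (beats / duration * frame_count).astype(int)
print(frames_at_beat)  # [15 30 45], i.e. beats * 30 fps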
def __getitem__(self, index):
    if self.multiple_db:
        db_idx = index // max([len(db) for db in self.db])
        joint_num = self.joint_num[db_idx]
        skeleton = self.skeleton[db_idx]
        lr_skeleton = self.lr_skeleton[0]
        flip_pairs = self.flip_pairs[db_idx]
        joints_have_depth = self.joints_have_depth[db_idx]
        ref_joints_name = self.joints_name[0]
        joints_name = self.joints_name[db_idx]
        item_idx = index % max([len(db) for db in self.db]) % len(self.db[db_idx])
        data = copy.deepcopy(self.db[db_idx][item_idx])
    else:
        joint_num = self.joint_num
        skeleton = self.skeleton
        lr_skeleton = self.lr_skeleton
        flip_pairs = self.flip_pairs
        joints_have_depth = self.joints_have_depth
        data = copy.deepcopy(self.db[index])

    bbox = data['bbox']
    joint_img = data['joint_img']
    joint_vis = data['joint_vis']

    # 1. load image
    cvimg = cv2.imread(data['img_path'], cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
    if not isinstance(cvimg, np.ndarray):
        raise IOError("Fail to read %s" % data['img_path'])
    img_height, img_width, img_channels = cvimg.shape

    # 2. get augmentation params
    if self.do_augment:
        scale, rot, do_flip, color_scale = get_aug_config()
    else:
        scale, rot, do_flip, color_scale = 1.0, 0, False, [1.0, 1.0, 1.0]

    # 3. crop patch from img and perform data augmentation (flip, scale, rot, color scale)
    img_patch, trans = generate_patch_image(cvimg, bbox, do_flip, scale, rot)
    for i in range(img_channels):
        img_patch[:, :, i] = np.clip(img_patch[:, :, i] * color_scale[i], 0, 255)

    # 4. generate patch joint ground truth:
    # flip joints and apply affine transform on joints
    if do_flip:
        joint_img[:, 0] = img_width - joint_img[:, 0] - 1
        for pair in flip_pairs:
            joint_img[pair[0], :], joint_img[pair[1], :] = joint_img[pair[1], :], joint_img[pair[0], :].copy()
            joint_vis[pair[0], :], joint_vis[pair[1], :] = joint_vis[pair[1], :], joint_vis[pair[0], :].copy()

    for i in range(len(joint_img)):
        joint_img[i, 0:2] = trans_point2d(joint_img[i, 0:2], trans)
        joint_img[i, 2] /= (cfg.bbox_3d_shape[0] / 2. * scale)  # expect depth lies in -bbox_3d_shape[0]/2 ~ bbox_3d_shape[0]/2 -> -1.0 ~ 1.0
        joint_img[i, 2] = (joint_img[i, 2] + 1.0) / 2.  # 0~1 normalize
        joint_vis[i] *= (
            (joint_img[i, 0] >= 0) &
            (joint_img[i, 0] < cfg.input_shape[1]) &
            (joint_img[i, 1] >= 0) &
            (joint_img[i, 1] < cfg.input_shape[0]) &
            (joint_img[i, 2] >= 0) &
            (joint_img[i, 2] < 1)
        )

    vis = False
    if vis:
        filename = str(random.randrange(1, 500))
        tmpimg = img_patch.copy().astype(np.uint8)
        tmpkps = np.zeros((3, joint_num))
        tmpkps[:2, :] = joint_img[:, :2].transpose(1, 0)
        tmpkps[2, :] = joint_vis[:, 0]
        tmpimg = vis_keypoints(tmpimg, tmpkps, skeleton)
        cv2.imwrite(osp.join(cfg.vis_dir, filename + '_gt.jpg'), tmpimg)

    vis = False
    if vis:
        vis_3d_skeleton(joint_img, joint_vis, skeleton, filename)

    # change coordinates to output space
    joint_img[:, 0] = joint_img[:, 0] / cfg.input_shape[1] * cfg.output_shape[1]
    joint_img[:, 1] = joint_img[:, 1] / cfg.input_shape[0] * cfg.output_shape[0]
    joint_img[:, 2] = joint_img[:, 2] * cfg.depth_dim

    # change joint coord, vis to the reference dataset; the 0th db is the reference dataset
    if self.multiple_db:
        joint_img = transform_joint_to_other_db(joint_img, joints_name, ref_joints_name)
        joint_vis = transform_joint_to_other_db(joint_vis, joints_name, ref_joints_name)

    if self.is_train:
        img_patch = self.transform(img_patch)
        joint_img = joint_img.astype(np.float32)
        joint_vis = (joint_vis > 0).astype(np.float32)
        joints_have_depth = np.array([joints_have_depth]).astype(np.float32)
        return img_patch, joint_img, joint_vis, joints_have_depth
    else:
        img_patch = self.transform(img_patch)
        return img_patch
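# Note: `transform_joint_to_other_db` maps per-joint arrays between skeleton
# definitions by joint name; a minimal sketch of the assumed behavior (joints
# missing from the source stay zero):
import numpy as np

def transform_joint_to_other_db(src_joint, src_name, dst_name):
    dst_joint = np.zeros((len(dst_name),) + src_joint.shape[1:], dtype=src_joint.dtype)
    for src_idx, name in enumerate(src_name):
        if name in dst_name:
            dst_joint[dst_name.index(name)] = src_joint[src_idx]
    return dst_joint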
def evaluate(self, preds):
    print()
    print('Evaluation start...')
    gts = self.datalist
    preds_joint_coord, preds_rel_root_depth, preds_hand_type, inv_trans = preds['joint_coord'], preds['rel_root_depth'], preds['hand_type'], preds['inv_trans']
    assert len(gts) == len(preds_joint_coord)
    sample_num = len(gts)

    mpjpe = [[] for _ in range(self.joint_num)]  # treat right and left hands identically
    acc_hand_cls = 0
    for n in range(sample_num):
        data = gts[n]
        bbox, cam_param, joint, gt_hand_type = data['bbox'], data['cam_param'], data['joint'], data['hand_type']
        focal = cam_param['focal']
        princpt = cam_param['princpt']
        gt_joint_coord = joint['cam_coord']
        joint_valid = joint['valid']

        # restore coordinates to original space
        pred_joint_coord_img = preds_joint_coord[n].copy()
        pred_joint_coord_img[:, 0] = pred_joint_coord_img[:, 0] / cfg.output_hm_shape[2] * cfg.input_img_shape[1]
        pred_joint_coord_img[:, 1] = pred_joint_coord_img[:, 1] / cfg.output_hm_shape[1] * cfg.input_img_shape[0]
        for j in range(self.joint_num * 2):
            pred_joint_coord_img[j, :2] = trans_point2d(pred_joint_coord_img[j, :2], inv_trans[n])
        pred_joint_coord_img[:, 2] = (pred_joint_coord_img[:, 2] / cfg.output_hm_shape[0] * 2 - 1) * (cfg.bbox_3d_size / 2)
        pred_joint_coord_img[:, 2] = pred_joint_coord_img[:, 2] + data['abs_depth']

        # back project to camera coordinate system
        pred_joint_coord_cam = pixel2cam(pred_joint_coord_img, focal, princpt)

        # root joint alignment
        pred_joint_coord_cam[self.joint_type['right']] = pred_joint_coord_cam[self.joint_type['right']] - pred_joint_coord_cam[self.root_joint_idx['right'], None, :]
        pred_joint_coord_cam[self.joint_type['left']] = pred_joint_coord_cam[self.joint_type['left']] - pred_joint_coord_cam[self.root_joint_idx['left'], None, :]
        gt_joint_coord[self.joint_type['right']] = gt_joint_coord[self.joint_type['right']] - gt_joint_coord[self.root_joint_idx['right'], None, :]
        gt_joint_coord[self.joint_type['left']] = gt_joint_coord[self.joint_type['left']] - gt_joint_coord[self.root_joint_idx['left'], None, :]

        # select right or left hand using the ground-truth hand type
        pred_joint_coord_cam = pred_joint_coord_cam[self.joint_type[gt_hand_type]]
        gt_joint_coord = gt_joint_coord[self.joint_type[gt_hand_type]]
        joint_valid = joint_valid[self.joint_type[gt_hand_type]]

        # mpjpe save
        for j in range(self.joint_num):
            if joint_valid[j]:
                mpjpe[j].append(np.sqrt(np.sum((pred_joint_coord_cam[j] - gt_joint_coord[j]) ** 2)))

        if gt_hand_type == 'right' and preds_hand_type[n][0] > 0.5 and preds_hand_type[n][1] < 0.5:
            acc_hand_cls += 1
        elif gt_hand_type == 'left' and preds_hand_type[n][0] < 0.5 and preds_hand_type[n][1] > 0.5:
            acc_hand_cls += 1

        vis = False
        if vis:
            img_path = data['img_path']
            cvimg = cv2.imread(img_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
            _img = cvimg[:, :, ::-1].transpose(2, 0, 1)
            vis_kps = pred_joint_coord_img.copy()
            vis_valid = joint_valid.copy()
            filename = 'out_' + str(n) + '.jpg'
            vis_keypoints(_img, vis_kps, vis_valid, self.skeleton[:self.joint_num], filename)

        vis = False
        if vis:
            filename = 'out_' + str(n) + '_3d.png'
            vis_3d_keypoints(pred_joint_coord_cam, joint_valid, self.skeleton[:self.joint_num], filename)

    print('Handedness accuracy: ' + str(acc_hand_cls / sample_num))

    eval_summary = 'MPJPE for each joint: \n'
    for j in range(self.joint_num):
        mpjpe[j] = np.mean(np.stack(mpjpe[j]))
        joint_name = self.skeleton[j]['name']
        eval_summary += (joint_name + ': %.2f, ' % mpjpe[j])
    print(eval_summary)
    print('MPJPE: %.2f' % (np.mean(mpjpe)))
def main():
    video_dir = 'dance_videos\\Danny Ocean - Baby I Wont.mp4'
    beat_dir = video_dir.strip('mp4') + 'npy'
    interval = [32, 36]  # in seconds
    REDU = True
    motion_base_dir = 'MyNao\\motion_base\\motion_base.json'
    if not os.path.exists(motion_base_dir):
        motion_base = {}
        with open(motion_base_dir, 'w') as f:
            json.dump(motion_base, f)
    with open(motion_base_dir, 'r') as f:
        motion_base = json.load(f)
    if REDU:
        pose_save_dir = 'MyNao\\motion_glance\\' + str(len(motion_base) - 1)
    else:
        pose_save_dir = 'MyNao\\motion_glance\\' + str(len(motion_base))
    if not os.path.exists(pose_save_dir):
        os.mkdir(pose_save_dir)

    motion = {}
    motion['feature'] = {}
    motion['feature']['bps'] = [None]
    motion['feature']['symmetric'] = False
    motion['feature']['repeat'] = True
    motion['frame'] = {}

    # args = parse_args()
    # cfg.set_args(args.gpu_ids)
    cudnn.fastest = True
    cudnn.benchmark = True
    cudnn.deterministic = False
    cudnn.enabled = True

    time_0 = time.time()
    tester = Tester(24)  # loading 3D pose estimation model
    tester._make_model()
    time_1 = time.time()
    print('loading integral pose model elapse:', round(time_1 - time_0, 2), 's')

    # loading yolo detector
    detector = YOLOv3(
        model_def="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\config\\yolov3.cfg",
        class_path="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\data\\coco.names",
        weights_path="3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\weights\\yolov3.weights",
        classes=('person',),
        max_batch_size=16,
        device=torch.device('cuda:{}'.format(cfg.gpu_ids[0])))
    print('loading yolo elapse:', round(time.time() - time_1, 2), 's')

    skeleton = ((0, 7), (7, 8), (8, 9), (9, 10), (8, 11), (11, 12), (12, 13),
                (8, 14), (14, 15), (15, 16), (0, 1), (1, 2), (2, 3), (0, 4),
                (4, 5), (5, 6))
    fig = plt.figure(figsize=(10, 10))
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std)
    ])

    # load model
    if not os.path.exists(video_dir.strip('mp4') + 'wav'):
        videoclip = VideoFileClip(video_dir)
        audioclip = videoclip.audio
        audioclip.write_audiofile(video_dir.strip('mp4') + 'wav')
    video = cv2.VideoCapture(video_dir)
    if not os.path.exists(beat_dir):
        time_2 = time.time()
        videoclip = VideoFileClip(video_dir)
        audioclip = videoclip.audio
        beat_activation = RNNBeatProcessor()(video_dir.strip('mp4') + 'wav')
        processor = DBNBeatTrackingProcessor(fps=100)
        beats = processor(beat_activation)
        frames_at_beat = (beats / audioclip.duration * video.get(cv2.CAP_PROP_FRAME_COUNT)).astype(int)
        print('extracting beat sequence elapse:', round(time.time() - time_2, 2), 's')
        np.save(beat_dir, frames_at_beat)
    frames_at_beat = np.load(beat_dir).tolist()

    # snap the interval endpoints (given in seconds) to the next beat frame
    for beat in frames_at_beat:
        if interval[0] * video.get(cv2.CAP_PROP_FPS) > beat:
            continue
        else:
            interval[0] = beat
            break
    for beat in frames_at_beat:
        if interval[1] * video.get(cv2.CAP_PROP_FPS) > beat:
            continue
        else:
            interval[1] = beat
            break

    video.set(1, interval[0])
    frame = 0
    next_beat = 0
    last_beat = 0
    num_beat = 0
    num_frame_between_beats = []
    with torch.no_grad():
        while True:
            time_start = time.time()
            current_frame = video.get(cv2.CAP_PROP_POS_FRAMES)
            ret_val, raw_image = video.read()
            if current_frame == interval[1]:
                break
            input_img = raw_image.copy()
            # using yolo to get human bounding box
            detections = detector.predict_single(input_img)
            # if not detections.cpu().numpy().all():
            #     detections = (0,0,input_img.shape[1],input_img.shape[0],1,1)
            #     print('not detected')
            if detections is None:
                detections = np.array([[0, 0, input_img.shape[1], input_img.shape[0], 1, 1, 1]])
                print('not detected')
            elif detections.size()[0] == 0:
                detections = np.array([[0, 0, input_img.shape[1], input_img.shape[0], 1, 1, 1]])
                print('not detected')
            last_conf = 0
            last_last_conf = 0
            for i, (x1_pred, y1_pred, x2_pred, y2_pred, conf, cls_conf, cls_pred) in enumerate(detections):
                if conf.item() > last_conf:
                    # enlarge the bounding box to cover the full body, for a more accurate pose
                    x1 = int(round(x1_pred.item())) - 40
                    x2 = int(round(x2_pred.item())) + 40
                    y1 = int(round(y1_pred.item())) - 20
                    y2 = int(round(y2_pred.item())) + 20
                    last_last_conf = last_conf
                    last_conf = conf.item()
            print(last_conf, last_last_conf)
            if last_last_conf != 0:
                sys.exit()
            # print(x1, x2, y1, y2, last_conf)
            img_patch = (input_img[y1:y2, x1:x2, ::-1]).copy().astype(np.float32)
            input_patch = cv2.resize(img_patch, (cfg.input_shape))
            input_patch = transform(input_patch).unsqueeze(0)
            coord_out = tester.model(input_patch)
            print('Running model time:', round(time.time() - time_start, 2), 's')

            motion['frame'][frame] = {}
            if frame + interval[0] in frames_at_beat:
                motion['frame'][frame]['next_beat'] = 0
                motion['frame'][frame]['last_beat'] = 0
                # frames_at_beat.remove(frame)
                next_beat = frames_at_beat.index(frame + interval[0]) + 1
                last_beat = frames_at_beat.index(frame + interval[0])
                num_beat += 1
                num_frame_between_beats.append(frames_at_beat[next_beat] - frames_at_beat[last_beat])
                print('Record key frame with beat:', current_frame)
            else:
                motion['frame'][frame]['next_beat'] = frames_at_beat[next_beat] - (frame + interval[0])
                motion['frame'][frame]['last_beat'] = (frame + interval[0]) - frames_at_beat[last_beat]

            coord_out = coord_out.cpu().numpy()
            coord_out_resize = coord_out * np.array([img_patch.shape[1] / cfg.input_shape[1],
                                                     img_patch.shape[0] / cfg.input_shape[0],
                                                     1])
            for idx in range(coord_out_resize.shape[1] - 1):
                motion['frame'][frame][idx] = (coord_out_resize[0][idx][0].item(),
                                               coord_out_resize[0][idx][2].item(),
                                               coord_out_resize[0][idx][1].item())

            vis = True
            vis_3d = False
            if vis:
                tmpimg = input_patch[0].cpu().numpy()
                tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
                tmpimg = tmpimg.astype(np.uint8)
                tmpimg = tmpimg[::-1, :, :]
                tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
                tmpkps = np.zeros((3, 18))
                tmpkps[:2, :] = coord_out[0, :, :2].transpose(1, 0) / cfg.output_shape[0] * cfg.input_shape[0]
                tmpkps[2, :] = 1
                tmpimg = vis_keypoints(tmpimg, tmpkps, skeleton)
                tmpimg = cv2.resize(tmpimg, (img_patch.shape[1], img_patch.shape[0]))
                file_name = pose_save_dir + '\\{0}.png'.format(str(frame).zfill(4))
                cv2.imwrite(file_name, tmpimg)
            if vis_3d:
                # coord_out = coord_out.cpu().numpy()
                # coord_out = coord_out * np.array([img_patch.shape[1]/cfg.input_shape[1], img_patch.shape[0]/cfg.input_shape[0], 1])
                pred = coord_out_resize.squeeze()  # remove the first batch dimension
                ax = plt.subplot('121', projection='3d')
                plt.axis('off')
                show3D_pose(pred, ax, skeleton, radius=40)
                file_name = pose_save_dir + '\\{0}.png'.format(str(frame).zfill(4))
                plt.savefig(file_name)
                # cv2.imwrite(file_name, tmpimg)
            frame += 1
            print('Processing Frame:', round(time.time() - time_start, 2), 's')

    motion['feature']['fpb'] = np.mean(num_frame_between_beats)
    if REDU:
        motion_base[len(motion_base) - 1] = motion
    else:
        motion_base[len(motion_base)] = motion
    # with open(motion_base_dir, 'w') as f:
    #     json.dump(motion_base, f)
    print('done with', num_beat + 1, 'beats! (This should be even for a normal dance)')
    print('num_frame between beats:')
    print(num_frame_between_beats)
def evaluate(self, preds):
    print()
    print('Evaluation start...')
    gts = self.datalist
    preds_joint_coord, preds_rel_root_depth, preds_hand_type, inv_trans = preds['joint_coord'], preds['rel_root_depth'], preds['hand_type'], preds['inv_trans']
    assert len(gts) == len(preds_joint_coord)
    sample_num = len(gts)

    mpjpe_sh = [[] for _ in range(self.joint_num * 2)]
    mpjpe_ih = [[] for _ in range(self.joint_num * 2)]
    mrrpe = []
    acc_hand_cls = 0
    hand_cls_cnt = 0
    for n in range(sample_num):
        data = gts[n]
        bbox, cam_param, joint, gt_hand_type, hand_type_valid = data['bbox'], data['cam_param'], data['joint'], data['hand_type'], data['hand_type_valid']
        focal = cam_param['focal']
        princpt = cam_param['princpt']
        gt_joint_coord = joint['cam_coord']
        joint_valid = joint['valid']

        # restore xy coordinates to original image space
        pred_joint_coord_img = preds_joint_coord[n].copy()
        pred_joint_coord_img[:, 0] = pred_joint_coord_img[:, 0] / cfg.output_hm_shape[2] * cfg.input_img_shape[1]
        pred_joint_coord_img[:, 1] = pred_joint_coord_img[:, 1] / cfg.output_hm_shape[1] * cfg.input_img_shape[0]
        for j in range(self.joint_num * 2):
            pred_joint_coord_img[j, :2] = trans_point2d(pred_joint_coord_img[j, :2], inv_trans[n])
        # restore depth to original camera space
        pred_joint_coord_img[:, 2] = (pred_joint_coord_img[:, 2] / cfg.output_hm_shape[0] * 2 - 1) * (cfg.bbox_3d_size / 2)

        # mrrpe
        if gt_hand_type == 'interacting' and joint_valid[self.root_joint_idx['left']] and joint_valid[self.root_joint_idx['right']]:
            pred_rel_root_depth = (preds_rel_root_depth[n] / cfg.output_root_hm_shape * 2 - 1) * (cfg.bbox_3d_size_root / 2)

            pred_left_root_img = pred_joint_coord_img[self.root_joint_idx['left']].copy()
            pred_left_root_img[2] += data['abs_depth']['right'] + pred_rel_root_depth
            pred_left_root_cam = pixel2cam(pred_left_root_img[None, :], focal, princpt)[0]

            pred_right_root_img = pred_joint_coord_img[self.root_joint_idx['right']].copy()
            pred_right_root_img[2] += data['abs_depth']['right']
            pred_right_root_cam = pixel2cam(pred_right_root_img[None, :], focal, princpt)[0]

            pred_rel_root = pred_left_root_cam - pred_right_root_cam
            gt_rel_root = gt_joint_coord[self.root_joint_idx['left']] - gt_joint_coord[self.root_joint_idx['right']]
            mrrpe.append(float(np.sqrt(np.sum((pred_rel_root - gt_rel_root) ** 2))))

        # add root joint depth
        pred_joint_coord_img[self.joint_type['right'], 2] += data['abs_depth']['right']
        pred_joint_coord_img[self.joint_type['left'], 2] += data['abs_depth']['left']

        # back project to camera coordinate system
        pred_joint_coord_cam = pixel2cam(pred_joint_coord_img, focal, princpt)

        # root joint alignment
        for h in ('right', 'left'):
            pred_joint_coord_cam[self.joint_type[h]] = pred_joint_coord_cam[self.joint_type[h]] - pred_joint_coord_cam[self.root_joint_idx[h], None, :]
            gt_joint_coord[self.joint_type[h]] = gt_joint_coord[self.joint_type[h]] - gt_joint_coord[self.root_joint_idx[h], None, :]

        # mpjpe
        for j in range(self.joint_num * 2):
            if joint_valid[j]:
                if gt_hand_type == 'right' or gt_hand_type == 'left':
                    mpjpe_sh[j].append(np.sqrt(np.sum((pred_joint_coord_cam[j] - gt_joint_coord[j]) ** 2)))
                else:
                    mpjpe_ih[j].append(np.sqrt(np.sum((pred_joint_coord_cam[j] - gt_joint_coord[j]) ** 2)))

        # handedness accuracy
        if hand_type_valid:
            if gt_hand_type == 'right' and preds_hand_type[n][0] > 0.5 and preds_hand_type[n][1] < 0.5:
                acc_hand_cls += 1
            elif gt_hand_type == 'left' and preds_hand_type[n][0] < 0.5 and preds_hand_type[n][1] > 0.5:
                acc_hand_cls += 1
            elif gt_hand_type == 'interacting' and preds_hand_type[n][0] > 0.5 and preds_hand_type[n][1] > 0.5:
                acc_hand_cls += 1
            hand_cls_cnt += 1

        vis = False
        if vis:
            img_path = data['img_path']
            cvimg = cv2.imread(img_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
            _img = cvimg[:, :, ::-1].transpose(2, 0, 1)
            vis_kps = pred_joint_coord_img.copy()
            vis_valid = joint_valid.copy()
            capture = str(data['capture'])
            cam = str(data['cam'])
            frame = str(data['frame'])
            filename = 'out_' + str(n) + '_' + gt_hand_type + '.jpg'
            vis_keypoints(_img, vis_kps, vis_valid, self.skeleton, filename)

        vis = False
        if vis:
            filename = 'out_' + str(n) + '_3d.jpg'
            vis_3d_keypoints(pred_joint_coord_cam, joint_valid, self.skeleton, filename)

    if hand_cls_cnt > 0:
        print('Handedness accuracy: ' + str(acc_hand_cls / hand_cls_cnt))
    if len(mrrpe) > 0:
        print('MRRPE: ' + str(sum(mrrpe) / len(mrrpe)))
    print()

    tot_err = []
    eval_summary = 'MPJPE for each joint: \n'
    for j in range(self.joint_num * 2):
        tot_err_j = np.mean(np.concatenate((np.stack(mpjpe_sh[j]), np.stack(mpjpe_ih[j]))))
        joint_name = self.skeleton[j]['name']
        eval_summary += (joint_name + ': %.2f, ' % tot_err_j)
        tot_err.append(tot_err_j)
    print(eval_summary)
    print('MPJPE for all hand sequences: %.2f' % (np.mean(tot_err)))
    print()

    eval_summary = 'MPJPE for each joint: \n'
    for j in range(self.joint_num * 2):
        mpjpe_sh[j] = np.mean(np.stack(mpjpe_sh[j]))
        joint_name = self.skeleton[j]['name']
        eval_summary += (joint_name + ': %.2f, ' % mpjpe_sh[j])
    print(eval_summary)
    print('MPJPE for single hand sequences: %.2f' % (np.mean(mpjpe_sh)))
    print()

    eval_summary = 'MPJPE for each joint: \n'
    for j in range(self.joint_num * 2):
        mpjpe_ih[j] = np.mean(np.stack(mpjpe_ih[j]))
        joint_name = self.skeleton[j]['name']
        eval_summary += (joint_name + ': %.2f, ' % mpjpe_ih[j])
    print(eval_summary)
    print('MPJPE for interacting hand sequences: %.2f' % (np.mean(mpjpe_ih)))
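# Note: the MRRPE computed above is the error of the predicted left-hand root
# position relative to the right-hand root; per sample it reduces to a
# Euclidean distance between relative root vectors (sketch):
import numpy as np

def mrrpe_single(pred_left_root, pred_right_root, gt_left_root, gt_right_root):
    # All inputs: 3D camera-space root coordinates in mm.
    pred_rel = pred_left_root - pred_right_root
    gt_rel = gt_left_root - gt_right_root
    return float(np.sqrt(np.sum((pred_rel - gt_rel) ** 2)))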
def main():
    args = parse_args()
    cfg.set_args(args.gpu_ids)
    cudnn.fastest = True
    cudnn.benchmark = True
    cudnn.deterministic = False
    cudnn.enabled = True

    tester = Tester(24)
    # The pretrained model is assumed to use the Human3.6M keypoints and skeleton;
    # change these if a different model is used.
    tester.joint_num = 18
    tester.skeleton = ((0, 7), (7, 8), (8, 9), (9, 10), (8, 11), (11, 12), (12, 13),
                       (8, 14), (14, 15), (15, 16), (0, 1), (1, 2), (2, 3), (0, 4),
                       (4, 5), (5, 6))
    # tester._make_batch_generator()

    # load 3D pose estimation model
    tester._make_model()

    # loading yolo detector
    """detector = YOLOv3(
        model_def="/data1/cx/project/3DMPPE_POSENET_RELEASE/common/detectors/yolo/config/yolov3.cfg",
        class_path="/data1/cx/project/3DMPPE_POSENET_RELEASE/common/detectors/yolo/data/coco.names",
        weights_path="/data1/cx/project/3DMPPE_POSENET_RELEASE/common/detectors/yolo/weights/yolov3.weights",
        classes=('person',),
        max_batch_size=16,
        device=torch.device('cuda:{}'.format(cfg.gpu_ids[0])))"""

    preds = []
    with torch.no_grad():
        itr = 1  # iteration
        imgpath = 'D:\\Robot Dance\\AI-Project-Portfolio\\3DMPPE_POSENET_RELEASE\\image\\test.jpg'
        # imgpath = '/data1/cx/project/3DMPPE_POSENET_RELEASE/image/test.jpg'
        bbox = (0, 0, 256, 256)
        input_img = prepare_input(imgpath, bbox)

        # forward
        coord_out = tester.model(input_img)
        print('coord out shape:', coord_out.shape)

        vis = True
        if vis:
            filename = str(itr)
            tmpimg = input_img[0].cpu().numpy()
            tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
            tmpimg = tmpimg.astype(np.uint8)
            tmpimg = tmpimg[::-1, :, :]
            tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
            tmpkps = np.zeros((3, tester.joint_num))
            tmpkps[:2, :] = coord_out[0, :, :2].cpu().numpy().transpose(1, 0) / cfg.output_shape[0] * cfg.input_shape[0]
            tmpkps[2, :] = 1
            tmpimg = vis_keypoints(tmpimg, tmpkps, tester.skeleton)
            cv2.imwrite(filename + '_output.jpg', tmpimg)
            # cv2.imshow('img', tmpimg)
            # cv2.waitKey(0)
            # cv2.destroyAllWindows()

        coord_out = coord_out.cpu().numpy()

        vis_3d = True
        if vis_3d:
            pred = coord_out.squeeze()  # remove the first batch dimension
            plot_3dkeypoints(pred, tester.skeleton)
        preds.append(coord_out)

    # evaluate
    preds = np.concatenate(preds, axis=0)
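# Note: `prepare_input` is called above but not defined anywhere in this
# section. A hypothetical sketch of what it presumably does (crop the bbox,
# resize to cfg.input_shape, normalize with the config mean/std, and add a
# batch dimension); the helper name and bbox convention are assumptions:
import cv2
import numpy as np
import torchvision.transforms as transforms

def prepare_input(imgpath, bbox):
    img = cv2.imread(imgpath, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
    x, y, w, h = bbox
    patch = img[y:y + h, x:x + w, ::-1].copy().astype(np.float32)  # BGR -> RGB crop
    patch = cv2.resize(patch, (cfg.input_shape[1], cfg.input_shape[0]))
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std)
    ])
    return transform(patch).unsqueeze(0)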
def main():
    args = parse_args()
    cfg.set_args(args.gpu_ids)
    cudnn.fastest = True
    cudnn.benchmark = True
    cudnn.deterministic = False
    cudnn.enabled = True

    time_0 = time.time()
    tester = Tester(24)  # loading 3D pose estimation model
    tester._make_model()
    time_1 = time.time()
    print('loading integral pose model elapse:', round(time_1 - time_0, 2), 's')

    # loading yolo detector
    detector = YOLOv3(
        model_def="D:\\Robot Dance\\AI-Project-Portfolio\\3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\config\\yolov3.cfg",
        class_path="D:\\Robot Dance\\AI-Project-Portfolio\\3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\data\\coco.names",
        weights_path="D:\\Robot Dance\\AI-Project-Portfolio\\3DMPPE_POSENET_RELEASE\\common\\detectors\\yolo\\weights\\yolov3.weights",
        classes=('person',),
        max_batch_size=16,
        device=torch.device('cuda:{}'.format(cfg.gpu_ids[0])))
    print('loading yolo elapse:', round(time.time() - time_1, 2), 's')

    skeleton = ((0, 7), (7, 8), (8, 9), (9, 10), (8, 11), (11, 12), (12, 13),
                (8, 14), (14, 15), (15, 16), (0, 1), (1, 2), (2, 3), (0, 4),
                (4, 5), (5, 6))
    fig = plt.figure(figsize=(10, 10))
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std)
    ])

    # load model
    preds = []
    if args.demo == 'image':
        image_path = 'D:\\Robot Dance\\AI-Project-Portfolio\\3DMPPE_POSENET_RELEASE\\image\\test_sample_20200711.jpg'
        raw_image = cv2.imread(image_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
        input_img = raw_image.copy()
        # using yolo to get human bounding box
        detections = detector.predict_single(input_img)
        # if not detections.cpu().numpy():
        #     detections = (0,0,input_img.shape[1],input_img.shape[0],1,1)
        for i, (x1, y1, x2, y2, conf, cls_conf, cls_pred) in enumerate(detections):
            x1 = int(round(x1.item()))
            x2 = int(round(x2.item()))
            y1 = int(round(y1.item()))
            y2 = int(round(y2.item()))
        img_patch = (input_img[y1:y2, x1:x2, ::-1]).copy().astype(np.float32)
        input_patch = cv2.resize(img_patch, (cfg.input_shape))
        with torch.no_grad():
            # forward
            input_patch = transform(input_patch).unsqueeze(0)
            coord_out = tester.model(input_patch)

        vis = True
        itr = 3
        if vis:
            filename = str(itr)
            tmpimg = input_patch[0].cpu().numpy()
            tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
            tmpimg = tmpimg.astype(np.uint8)
            tmpimg = tmpimg[::-1, :, :]
            tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
            tmpkps = np.zeros((3, 18))
            tmpkps[:2, :] = coord_out[0, :, :2].cpu().numpy().transpose(1, 0) / cfg.output_shape[0] * cfg.input_shape[0]
            tmpkps[2, :] = 1
            tmpimg = vis_keypoints(tmpimg, tmpkps, skeleton)
            tmpimg = cv2.resize(tmpimg, (img_patch.shape[1], img_patch.shape[0]))
            cv2.imwrite(filename + '_output.jpg', tmpimg)
            # print(tmpkps)

        coord_out = coord_out.cpu().numpy()
        coord_out = coord_out * np.array([img_patch.shape[1] / cfg.input_shape[1],
                                          img_patch.shape[0] / cfg.input_shape[0],
                                          1])
        # coord_out = (coord_out - np.mean(coord_out, axis=1)) / np.std(coord_out, axis=1)
        coord_out = coord_out[:, :-1, :]
        print(coord_out)
        print(coord_out.shape)
        np.save('coord_out_0711.npy', coord_out)

        vis_3d = True
        if vis_3d:
            pred = coord_out.squeeze()  # remove the first batch dimension
            plot_3dkeypoints(pred, skeleton)

        motion_frame = {}
        for idx in range(coord_out.shape[1] - 1):
            motion_frame[idx] = (coord_out[0][idx][0].item(),
                                 coord_out[0][idx][2].item(),
                                 coord_out[0][idx][1].item())
        with open('motionFrame.json', 'w') as f:
            json.dump(motion_frame, f)

    elif args.demo == 'video':
        video = cv2.VideoCapture('D:\\Robot Dance\\AI-Project-Portfolio\\dance_videos\\test_sample.mp4')
        ret_val, image = video.read()
        motion = {}
        frame = 0
        with torch.no_grad():
            while True:
                time_start = time.time()
                ret_val, raw_image = video.read()
                if not ret_val:
                    break
                input_img = raw_image.copy()
                # using yolo to get human bounding box
                detections = detector.predict_single(input_img)
                # if not detections.cpu().numpy().all():
                #     detections = (0,0,input_img.shape[1],input_img.shape[0],1,1)
                #     print('not detected')
                if detections is None:
                    detections = np.array([[0, 0, input_img.shape[1], input_img.shape[0], 1, 1, 1]])
                    print('not detected')
                elif detections.size()[0] == 0:
                    detections = np.array([[0, 0, input_img.shape[1], input_img.shape[0], 1, 1, 1]])
                    print('not detected')
                for i, (x1, y1, x2, y2, conf, cls_conf, cls_pred) in enumerate(detections):
                    x1 = int(round(x1.item()))
                    x2 = int(round(x2.item()))
                    y1 = int(round(y1.item()))
                    y2 = int(round(y2.item()))
                img_patch = (input_img[y1:y2, x1:x2, ::-1]).copy().astype(np.float32)
                input_patch = cv2.resize(img_patch, (cfg.input_shape))
                input_patch = transform(input_patch).unsqueeze(0)
                coord_out = tester.model(input_patch)
                print('Running model time:', round(time.time() - time_start, 2), 's')

                motion[frame] = {}
                for idx in range(coord_out.shape[1] - 1):
                    motion[frame][idx] = (coord_out[0][idx][0].item(),
                                          coord_out[0][idx][2].item(),
                                          coord_out[0][idx][1].item())

                vis = True
                vis_3d = False
                if vis:
                    tmpimg = input_patch[0].cpu().numpy()
                    tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
                    tmpimg = tmpimg.astype(np.uint8)
                    tmpimg = tmpimg[::-1, :, :]
                    tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
                    tmpkps = np.zeros((3, 18))
                    tmpkps[:2, :] = coord_out[0, :, :2].cpu().numpy().transpose(1, 0) / cfg.output_shape[0] * cfg.input_shape[0]
                    tmpkps[2, :] = 1
                    tmpimg = vis_keypoints(tmpimg, tmpkps, skeleton)
                    tmpimg = cv2.resize(tmpimg, (img_patch.shape[1], img_patch.shape[0]))
                    file_name = '../demo_result/{0}.png'.format(str(frame).zfill(4))
                    cv2.imwrite(file_name, tmpimg)
                    frame += 1
                if vis_3d:
                    coord_out = coord_out.cpu().numpy()
                    coord_out = coord_out * np.array([img_patch.shape[1] / cfg.input_shape[1],
                                                      img_patch.shape[0] / cfg.input_shape[0],
                                                      1])
                    pred = coord_out.squeeze()  # remove the first batch dimension
                    ax = plt.subplot('121', projection='3d')
                    plt.axis('off')
                    show3D_pose(pred, ax, skeleton, radius=40)
                    # plot_3dkeypoints(pred, skeleton)
                    file_name = '../test_sample_result/{0}.png'.format(str(frame).zfill(4))
                    plt.savefig(file_name)
                    # cv2.imwrite(file_name, tmpimg)
                    frame += 1
                print('Processing Frame:', round(time.time() - time_start, 2), 's')

        with open('motionTest.json', 'w') as f:
            json.dump(motion, f)
def main():
    args = parse_args()
    cfg.set_args(args.gpu_ids)
    cudnn.fastest = True
    cudnn.benchmark = True
    cudnn.deterministic = False
    cudnn.enabled = True

    tester = Tester(args.test_epoch)
    tester._make_batch_generator()
    tester._make_model()

    preds = []
    tmpp = 0
    with torch.no_grad():
        for itr, input_img in enumerate(tqdm(tester.batch_generator)):
            # forward
            coord_out = tester.model(input_img)

            if cfg.flip_test:
                flipped_input_img = flip(input_img, dims=3)
                flipped_coord_out = tester.model(flipped_input_img)
                flipped_coord_out[:, :, 0] = cfg.output_shape[1] - flipped_coord_out[:, :, 0] - 1
                for pair in tester.flip_pairs:
                    flipped_coord_out[:, pair[0], :], flipped_coord_out[:, pair[1], :] = \
                        flipped_coord_out[:, pair[1], :].clone(), flipped_coord_out[:, pair[0], :].clone()
                coord_out = (coord_out + flipped_coord_out) / 2.

            vis = True
            if vis:
                filename = str(itr)
                tmpimg = input_img[0].cpu().numpy()
                tmpimg = tmpimg * np.array(cfg.pixel_std).reshape(3, 1, 1) + np.array(cfg.pixel_mean).reshape(3, 1, 1)
                tmpimg = tmpimg.astype(np.uint8)
                tmpimg = tmpimg[::-1, :, :]
                tmpimg = np.transpose(tmpimg, (1, 2, 0)).copy()
                tmpkps = np.zeros((3, tester.joint_num))
                tmpkps[:2, :] = coord_out[0, :, :2].cpu().numpy().transpose(1, 0) / cfg.output_shape[0] * cfg.input_shape[0]
                tmpkps[2, :] = 1
                tmpimg = vis_keypoints(tmpimg, tmpkps, tester.skeleton)
                # print(tmpkps)
                cv2.imwrite(filename + '_output.jpg', tmpimg)

            coord_out = coord_out.cpu().numpy()
            preds.append(coord_out)
            tmpp += 1
            if tmpp == 5:
                break

    # evaluate
    preds = np.concatenate(preds, axis=0)
    # print(preds.shape)
    # It takes the testset from common/base.py.
    # It then calls evaluate() of that dataset:
    # data/MSCOCO/MSCOCO.py calls the keypoint visualization,
    # and common/utils/vis.py draws the plots.
    tester._evaluate(preds, cfg.result_dir)