def predict(self, imgs, camera_parameter, template_name='Shelf', show=False, plt_id=0):
    # First the 2D predictions are stored in info_dict.
    # imgs is a list of 3 tensors: 3 images from 3 different viewpoints (one batch = one image).
    info_dict = self._infer_single2d(imgs)
    # info_dict is keyed by camera id; each entry is a nested dictionary, similar to a JSON document.
    # As the name suggests, MemDataset keeps the 2D dataset in memory.
    self.dataset = MemDataset(info_dict=info_dict, camera_parameter=camera_parameter,
                              template_name=template_name)
    # The 3D reconstruction starts here.
    return self._estimate3d(0, show=show, plt_id=plt_id)
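A minimal usage sketch for this entry point. The file names, the pickle, and the frame loading are hypothetical; it only assumes what the surrounding code already shows, i.e. one BGR frame per camera and a camera_parameter dict with per-camera arrays such as 'K', 'RT', and optional 'distCoef'.

# Minimal usage sketch (hypothetical setup): feed one BGR frame per camera to predict().
import cv2
import pickle

with open('camera_parameter.pickle', 'rb') as f:  # hypothetical calibration file
    camera_parameter = pickle.load(f)

frames = [cv2.imread(p) for p in ['cam0.png', 'cam1.png', 'cam2.png']]  # one view per camera
estimator = MultiEstimator(cfg=model_cfg)  # cfg object as used throughout this file
poses3d = estimator.predict(imgs=frames, camera_parameter=camera_parameter,
                            template_name='Unified', show=False)
print(len(poses3d), 'people reconstructed')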
def export(model, loader, is_info_dicts=False, show=False):
    pose_list = list()
    for img_id, imgs in enumerate(tqdm(loader)):
        # poses3d = model.estimate3d ( img_id=img_id, show=False )
        # This branch is taken when the preprocessing pipeline is used:
        # info_dicts holds the images plus, for every person, the heatmaps
        # and cropped images saved during preprocessing.
        if is_info_dicts:
            info_dicts = numpify(imgs)
            model.dataset = MemDataset(info_dict=info_dicts, camera_parameter=camera_parameter,
                                       template_name='Unified')
            # _estimate3d is used here because the 2D results are already stored,
            # so only the 3D estimation remains; predict() runs the full pipeline.
            poses3d = model._estimate3d(0, show=show)
        else:
            # imgs is a list of 3 tensors; each img_batch has shape (288, 360, 3).
            this_imgs = list()
            for img_batch in imgs:
                # squeeze() drops the leading batch dimension of size 1.
                this_imgs.append(img_batch.squeeze().numpy())
            poses3d = model.predict(imgs=this_imgs, camera_parameter=camera_parameter,
                                    template_name='Unified', show=show, plt_id=img_id)
        pose_list.append(poses3d)
    return pose_list
def export(model, loader, is_info_dicts=False, show=False):
    pose_list = list()
    for img_id, imgs in enumerate(tqdm(loader)):
        # poses3d = model.estimate3d ( img_id=img_id, show=False )
        if is_info_dicts:
            info_dicts = numpify(imgs)
            model.dataset = MemDataset(info_dict=info_dicts, camera_parameter=camera_parameter,
                                       template_name='Unified')
            poses3d = model._estimate3d(0, show=show)
        else:
            this_imgs = list()
            # Undistort each view here, before 2D estimation.
            for iimg, img_batch in enumerate(imgs):
                frame = img_batch.squeeze().numpy()
                if 'distCoef' in camera_parameter.keys():
                    mtx = camera_parameter['K'][iimg]
                    dist = camera_parameter['distCoef'][iimg]
                    frame = cv2.undistort(frame, mtx, dist, None)
                this_imgs.append(frame)
            poses3d = model.predict(imgs=this_imgs, camera_parameter=camera_parameter,
                                    template_name='Unified', show=show, plt_id=img_id)
        pose_list.append(poses3d)
    return pose_list
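When the same cameras are undistorted on every frame, the per-frame cv2.undistort call can be replaced by a remap with maps precomputed once per camera. A sketch of that optimization, assuming the same camera_parameter layout as above:

# Optional optimization sketch: precompute undistortion maps once per camera,
# then remap each frame (assumes the camera_parameter layout used above).
maps = []
for i in range(len(camera_parameter['K'])):
    K = camera_parameter['K'][i]
    dist = camera_parameter['distCoef'][i]
    h, w = 288, 360  # frame size used by this loader
    map1, map2 = cv2.initUndistortRectifyMap(K, dist, None, K, (w, h), cv2.CV_32FC1)
    maps.append((map1, map2))

# inside the frame loop:
# frame = cv2.remap(frame, maps[iimg][0], maps[iimg][1], cv2.INTER_LINEAR)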
def export(model, loader, is_info_dicts=False, show=False):
    pose3d_list, pose2d_list, hand_bbox_list, hand_pose_list = list(), list(), list(), list()
    print('num of frames: {}'.format(len(loader)))
    with torch.no_grad():
        t_start = time.time()
        for img_id, imgs in enumerate(tqdm(loader)):
            if is_info_dicts:
                info_dicts = numpify(imgs)
                model.dataset = MemDataset(info_dict=info_dicts, camera_parameter=camera_parameter,
                                           template_name='Unified')
                # NOTE: this branch never assigns pose2d/hand_bbox/hand_pose,
                # so the appends below would fail if it were taken.
                poses3d = model._estimate3d(0, show=show)
            else:
                this_imgs = list()  # len(this_imgs) = 3
                for img_batch in imgs:
                    this_imgs.append(img_batch.squeeze().numpy())  # this_imgs[0].shape = (288, 360, 3)
                poses3d, pose2d, hand_bbox, hand_pose = model.predict(
                    imgs=this_imgs, camera_parameter=camera_parameter,
                    template_name='Unified', show=show, plt_id=img_id)
            pose3d_list.append(poses3d)
            pose2d_list.append(pose2d)
            hand_bbox_list.append(hand_bbox)
            hand_pose_list.append(hand_pose)
        t_end = time.time()
        print('overall avg time: {}'.format((t_end - t_start) / len(loader)))
    return pose3d_list, pose2d_list, hand_bbox_list, hand_pose_list
def predict(self, imgs, camera_parameter, template_name='Shelf', show=False, plt_id=0):
    # 2D human pose estimation for all views
    info_dict = self._infer_single2d(imgs)
    self.dataset = MemDataset(info_dict=info_dict, camera_parameter=camera_parameter,
                              template_name=template_name)
    # 3D human pose reconstruction
    pose3d = self._estimate3d(0, show=show, plt_id=plt_id)
    # hand bbox detection for all views
    pose2d, hand_bbox = self._detect_hand(imgs, info_dict)
    # 3D hand pose estimation
    cam_id = 1
    hand_pose = self._estimate_hand_pose(imgs, hand_bbox, camera_parameter, cam_id)
    return pose3d, pose2d, hand_bbox, hand_pose
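A short usage sketch for this hand-enabled variant, reusing the hypothetical frames/camera_parameter setup from the earlier sketch. The shape comments restate only what _detect_hand and _estimate_hand_pose below actually produce:

# Usage sketch for the hand-enabled predict (same hypothetical setup as above).
pose3d, pose2d, hand_bbox, hand_pose = estimator.predict(
    imgs=frames, camera_parameter=camera_parameter, template_name='Unified')
# pose2d[cam][person] is a flat COCO keypoint vector; hand_bbox[cam] holds up to two
# [x, y, w, h] boxes per person (one per wrist joint); hand_pose entries are wrist-centered
# 21-joint hands rotated by the transpose of cam 1's rotation matrix.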
def export(info_dicts, model, match, img_names):
    pose3d_list = list()
    N = len(info_dicts)
    print('num of frames: {}'.format(N))
    with torch.no_grad():
        t_start = time.time()
        for i, info in enumerate(info_dicts):
            if i % 1000 == 0:
                print('processing ({:5d}/{:5d})'.format(i, N))
            # import other info ('image_data', 'cropped_img')
            info_dict = dict()
            for cam_id in range(len(info)):
                idx = match['idx{}'.format(cam_id + 1)][i]
                img_name = img_names[cam_id][idx]
                assert idx == int(os.path.basename(img_name).split('_')[1])
                img = cv2.imread(img_name)
                info[cam_id]['image_data'] = cv2.cvtColor(img.copy(), cv2.COLOR_BGR2RGB)
                for person_id in range(len(info[cam_id][0])):
                    # NOTE: here 'bbox' is in (x1, y1, x2, y2) corner format,
                    # unlike the (x, y, w, h) format used by _infer_single2d.
                    bb = np.array(info[cam_id][0][person_id]['bbox'], dtype=int)
                    cropped_img = img[bb[1]:bb[3], bb[0]:bb[2]]
                    info[cam_id][0][person_id]['cropped_img'] = cv2.cvtColor(
                        cropped_img.copy(), cv2.COLOR_BGR2RGB)
                    info[cam_id][0][person_id]['pose2d'] = info[cam_id][0][person_id]['pose2d'][:17].reshape(-1)
                    info[cam_id][0][person_id]['heatmap_data'] = []
                    info[cam_id][0][person_id]['heatmap_bbox'] = []
                info_dict[cam_id] = info[cam_id]
            satisfied = (np.array([len(vinfo[0]) for vinfo in info]) > 0).sum()
            if satisfied > 1:
                model.dataset = MemDataset(info_dict=info_dict, camera_parameter=camera_parameter,
                                           template_name='Unified')
                poses3d = model._estimate3d(0)
            else:
                poses3d = -1  # was `pose3d = -1`: the name appended below must match
            pose3d_list.append(poses3d)
            info_dicts[i] = -1
        t_end = time.time()
        print('overall avg time: {}'.format((t_end - t_start) / N))
    return pose3d_list
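Since the two export paths use different bbox conventions, here is a small pair of conversion helpers; the function names are hypothetical and only illustrate the two formats seen above:

# Hypothetical helpers converting between the two bbox conventions used in this file:
# (x, y, w, h) with a top-left corner, and (x1, y1, x2, y2) corners.
def xywh_to_xyxy(b):
    x, y, w, h = b
    return [x, y, x + w, y + h]

def xyxy_to_xywh(b):
    x1, y1, x2, y2 = b
    return [x1, y1, x2 - x1, y2 - y1]

# e.g. xywh_to_xyxy([10, 20, 30, 40]) == [10, 20, 40, 60]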
class MultiEstimator(object):
    def __init__(self, cfg, debug=False):
        self.est2d = Estimator_2d(DEBUGGING=debug)
        # Use this extractor when CamStyle is the ReID backbone:
        # self.extractor = FeatureExtractor ()
        # This variant uses OSNet instead:
        self.extractor = FeatureExtractor(
            model_name='osnet_x1_0',
            model_path='backend/reid/models/osnet_x1_0_market_256x128_amsgrad_ep150_stp60_lr0.0015_b64_fb10_softmax_labelsmooth_flip.pth',
            device='cuda')
        self.cfg = cfg
        self.dataset = None

    # This method runs the whole pipeline.
    def predict(self, imgs, camera_parameter, template_name='Shelf', show=False, plt_id=0):
        # First the 2D predictions are stored in info_dict.
        # imgs is a list of 3 tensors: 3 images from 3 different viewpoints.
        info_dict = self._infer_single2d(imgs)
        # info_dict is keyed by camera id; each entry is a nested dictionary, similar to a JSON document.
        # MemDataset keeps the 2D dataset in memory.
        self.dataset = MemDataset(info_dict=info_dict, camera_parameter=camera_parameter,
                                  template_name=template_name)
        # The 3D reconstruction starts here.
        return self._estimate3d(0, show=show, plt_id=plt_id)

    def _infer_single2d(self, imgs, img_id=0, dir='/home/jiangwen/tmp/Multi'):
        info_dict = dict()
        # imgs holds one image per camera (3 cameras here); the 2D estimator processes
        # one view at a time. The trailing dimension of 3 is the number of color channels.
        for cam_id, img in enumerate(imgs):
            results = self.est2d.estimate_2d(img, img_id)
            # results holds, per detected person, the 2D keypoints, bounding box,
            # heatmaps, and crop coordinates.
            this_info_dict = {'image_data': cv2.cvtColor(img.copy(), cv2.COLOR_BGR2RGB)}
            # Under the img_id key, a list with one dictionary per person is built.
            this_info_dict[img_id] = list()
            for person_id, result in enumerate(results):
                this_info_dict[img_id].append(dict())
                this_info_dict[img_id][person_id]['pose2d'] = result['keypoints']
                # NOTE: bbox is (x, y) (W, H) format where x and y is up-left point.
                this_info_dict[img_id][person_id]['bbox'] = result['bbox']
                bb = np.array(result['bbox'], dtype=int)
                # Crop each detected person out of the frame using its bounding box.
                cropped_img = img[bb[1]:bb[1] + bb[3], bb[0]:bb[0] + bb[2]]
                # numpy format of crop idx is changed to json
                this_info_dict[img_id][person_id]['heatmap_bbox'] = result['crops'].astype(int).tolist()
                this_info_dict[img_id][person_id]['heatmap_data'] = result['heatmaps']
                this_info_dict[img_id][person_id]['cropped_img'] = cv2.cvtColor(
                    cropped_img.copy(), cv2.COLOR_BGR2RGB)
            # 2D inference runs per camera; store this camera's results.
            info_dict[cam_id] = this_info_dict
        return info_dict

    def _estimate3d(self, img_id, show=False, plt_id=0):
        # dataset[img_id] is an iterable batch holding the crops, names, pids and
        # cam_ids of all detections from all cameras for this frame (per the CamStyle code).
        data_batch = self.dataset[img_id]
        # ReID appearance affinity matrix; cfg.rerank is False by default.
        affinity_mat = self.extractor.get_affinity(data_batch, rerank=self.cfg.rerank)
        if self.cfg.rerank:
            affinity_mat = torch.from_numpy(affinity_mat)
            affinity_mat = torch.max(affinity_mat, affinity_mat.t())
            affinity_mat = 1 - affinity_mat
        else:
            affinity_mat = affinity_mat.cpu()
        # dimGroup gives the cumulative detection counts per camera for this frame.
        dimGroup = self.dataset.dimGroup[img_id]
        info_list = list()
        for cam_id in self.dataset.cam_names:
            # Concatenate the per-person info from all cameras into one flat list.
            info_list += self.dataset.info_dict[cam_id][img_id]
        # Stack the 2D poses, reshaped to the joint count the 3D detector expects,
        # and keep only the (x, y) coordinates.
        pose_mat = np.array([i['pose2d'] for i in info_list]).reshape(-1, model_cfg.joint_num, 3)[..., :2]
        # self.dataset.F holds the fundamental matrices between camera pairs, derived from
        # the camera parameters; geometry_affinity scores epipolar consistency between
        # every pair of 2D detections across views.
        geo_affinity_mat = geometry_affinity(pose_mat.copy(), self.dataset.F.numpy(),
                                             self.dataset.dimGroup[img_id])
        geo_affinity_mat = torch.tensor(geo_affinity_mat)
        # The default metric is 'geometry mean'.
        if self.cfg.metric == 'geometry mean':
            W = torch.sqrt(affinity_mat * geo_affinity_mat)
        elif self.cfg.metric == 'circle':
            W = torch.sqrt((affinity_mat ** 2 + geo_affinity_mat ** 2) / 2)
        elif self.cfg.metric == 'Geometry only':
            W = torch.tensor(geo_affinity_mat)
        elif self.cfg.metric == 'ReID only':
            W = torch.tensor(affinity_mat)
        else:
            logger.critical('Fell through to the default option; did you intend that?')
            _alpha = 0.8
            W = 0.8 * affinity_mat + (1 - _alpha) * geo_affinity_mat
        W[torch.isnan(W)] = 0  # Sometimes (e.g. Shelf img 452) torch.sqrt returns nan if the input is too small
        sub_imgid2cam = np.zeros(pose_mat.shape[0], dtype=np.int32)
        for idx, i in enumerate(range(len(dimGroup) - 1)):
            sub_imgid2cam[dimGroup[i]:dimGroup[i + 1]] = idx
        num_person = 10
        # X0 is a random initialization of the assignment matrix; W.shape[0] is the
        # total number of detections across all views.
        X0 = torch.rand(W.shape[0], num_person)
        # Use spectral method to initialize assignment matrix:
        # seed X0 from the top eigenvectors of W.
        if self.cfg.spectral:
            eig_value, eig_vector = W.eig(eigenvectors=True)
            _, eig_idx = torch.sort(eig_value[:, 0], descending=True)
            if W.shape[1] >= num_person:
                X0 = eig_vector[eig_idx[:num_person]].t()
            else:
                X0[:, :W.shape[1]] = eig_vector.t()
        # match_mat is the (soft) permutation matrix marking which detections are
        # matched to the same person across views.
        match_mat = matchSVT(W, dimGroup, alpha=self.cfg.alpha_SVT, _lambda=self.cfg.lambda_SVT,
                             dual_stochastic_SVT=self.cfg.dual_stochastic_SVT)
        # Binarize: keep only columns matched in more than one view, which drops
        # spurious single-view detections.
        bin_match = match_mat[:, torch.nonzero(torch.sum(match_mat, dim=0) > 1.9).squeeze()] > 0.9
        bin_match = bin_match.reshape(W.shape[0], -1)
        # One (initially empty) list per matched identity.
        matched_list = [[] for i in range(bin_match.shape[1])]
        for sub_imgid, row in enumerate(bin_match):
            if row.sum() != 0:
                pid = row.argmax()
                # Assign this detection to the identity it matched.
                matched_list[pid].append(sub_imgid)
        matched_list = [np.array(i) for i in matched_list]
        # cfg.hybrid is True in this configuration.
        if self.cfg.hybrid:
            multi_pose3d = self._hybrid_kernel(matched_list, pose_mat, sub_imgid2cam, img_id)
            chosen_img = [[]] * len(sub_imgid2cam)
        else:
            multi_pose3d, chosen_img = self._top_down_pose_kernel(geo_affinity_mat, matched_list,
                                                                  pose_mat, sub_imgid2cam)
        if show:  # hybrid not implemented yet.
            bin_match = match_mat[:, torch.nonzero(torch.sum(match_mat, dim=0) > 0.9).squeeze()] > 0.9
            bin_match = bin_match.reshape(W.shape[0], -1)
            matched_list = [[] for i in range(bin_match.shape[1])]
            for sub_imgid, row in enumerate(bin_match):
                if row.sum() != 0:
                    pid = row.argmax()
                    matched_list[pid].append(sub_imgid)
            matched_list = [np.array(i) for i in matched_list]
            # The two calls below only produce visualizations.
            show_panel_mem(self.dataset, matched_list, info_list, sub_imgid2cam, img_id,
                           affinity_mat, geo_affinity_mat, W, plt_id, multi_pose3d)
            plotPaperRows(self.dataset, matched_list, info_list, sub_imgid2cam, img_id,
                          affinity_mat, geo_affinity_mat, W, plt_id, multi_pose3d)
        return multi_pose3d

    def _hybrid_kernel(self, matched_list, pose_mat, sub_imgid2cam, img_id):
        return pictorial.hybrid_kernel(self, matched_list, pose_mat, sub_imgid2cam, img_id)
        # NOTE: everything below this return is unreachable; it is the legacy in-class
        # implementation kept for reference.
        multi_pose3d = list()
        for person in matched_list:
            # use bottom-up approach to get the 3D pose of person
            if person.shape[0] <= 1:
                continue
            # step1: use the 2D joint of person to triangulate the 3D joints candidates
            # person's 17 3D joints candidates
            candidates = np.zeros((17, person.shape[0] * (person.shape[0] - 1) // 2, 3))  # 17xC^2_nx3
            cnt = 0
            for i in range(person.shape[0]):
                for j in range(i + 1, person.shape[0]):
                    cam_id_i, cam_id_j = sub_imgid2cam[person[i]], sub_imgid2cam[person[j]]
                    projmat_i, projmat_j = self.dataset.P[cam_id_i], self.dataset.P[cam_id_j]
                    pose2d_i, pose2d_j = pose_mat[person[i]].T, pose_mat[person[j]].T
                    pose3d_homo = cv2.triangulatePoints(projmat_i, projmat_j, pose2d_i, pose2d_j)
                    pose3d_ij = pose3d_homo[:3] / pose3d_homo[3]
                    candidates[:, cnt] += pose3d_ij.T
                    cnt += 1
            unary = self.dataset.get_unary(person, sub_imgid2cam, candidates, img_id)
            # step2: use the max-product algorithm to inference to get the 3d joint of the person
            # change the coco order
            coco_2_skel = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
            candidates = np.array(candidates)[coco_2_skel]
            unary = unary[coco_2_skel]
            skel = pictorial.getskel()
            # construct pictorial model
            edges = pictorial.getPictoStruct(skel, self.dataset.distribution)
            xp = pictorial.inferPict3D_MaxProd(unary, edges, candidates)
            human = np.array([candidates[i][j] for i, j in zip(range(xp.shape[0]), xp)])
            human_coco = np.zeros((17, 3))
            human_coco[[0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]] = human
            human_coco[[1, 2, 3, 4]] = human_coco[0]  # Just make visualize beauty not real ear and eye
            human_coco = human_coco.T
            if self.cfg.reprojection_refine and len(person) > 2:
                for joint_idx in range(human_coco.shape[1]):
                    reprojected_error = np.zeros(len(person))
                    for idx, pid in enumerate(person):
                        human_coco_homo = np.ones(4)
                        human_coco_homo[:3] = human_coco[:, joint_idx]
                        projected_pose_homo = self.dataset.P[sub_imgid2cam[pid]] @ human_coco_homo
                        projected_pose = projected_pose_homo[:2] / projected_pose_homo[2]
                        reprojected_error[idx] += np.linalg.norm(projected_pose - pose_mat[pid, joint_idx])
                    # import IPython; IPython.embed()
                    # pose_select = reprojected_error < self.cfg.refine_threshold
                    pose_select = (reprojected_error - reprojected_error.mean()) / reprojected_error.std() < self.cfg.refine_threshold
                    if pose_select.sum() >= 2:
                        Ps = list()
                        Ys = list()
                        for idx, is_selected in enumerate(pose_select):
                            if is_selected:
                                Ps.append(self.dataset.P[sub_imgid2cam[person[idx]]])
                                Ys.append(pose_mat[person[idx], joint_idx].reshape(-1, 1))
                        Ps = torch.tensor(Ps, dtype=torch.float32)
                        Ys = torch.tensor(Ys, dtype=torch.float32)
                        Xs = multiTriIter(Ps, Ys)
                        refined_pose = (Xs[:3] / Xs[3]).numpy()
                        human_coco[:, joint_idx] = refined_pose.reshape(-1)
            if True or check_bone_length(human_coco):
                multi_pose3d.append(human_coco)
        return multi_pose3d

    def _top_down_pose_kernel(self, geo_affinity_mat, matched_list, pose_mat, sub_imgid2cam):
        multi_pose3d = list()
        chosen_img = list()
        for person in matched_list:
            Graph = geo_affinity_mat[person][:, person].clone().numpy()
            Graph *= (1 - np.eye(Graph.shape[0]))  # make diagonal 0
            if len(Graph) < 2:
                continue
            elif len(Graph) > 2:
                if self.cfg.use_mincut:
                    cut0, cut1 = find_mincut(Graph.copy())
                    cut = cut0 if len(cut0) > len(cut1) else cut1
                    cut = cut.astype(int)
                    sub_imageid = person[cut]
                else:
                    sub_imageid = get_min_reprojection_error(person, self.dataset, pose_mat, sub_imgid2cam)
            else:
                sub_imageid = person
            _, rank = torch.sort(geo_affinity_mat[sub_imageid][:, sub_imageid].sum(dim=0))
            sub_imageid = sub_imageid[rank[:2]]
            cam_id_0, cam_id_1 = sub_imgid2cam[sub_imageid[0]], sub_imgid2cam[sub_imageid[1]]
            projmat_0, projmat_1 = self.dataset.P[cam_id_0], self.dataset.P[cam_id_1]
            pose2d_0, pose2d_1 = pose_mat[sub_imageid[0]].T, pose_mat[sub_imageid[1]].T
            pose3d_homo = cv2.triangulatePoints(projmat_0, projmat_1, pose2d_0, pose2d_1)
            if self.cfg.use_bundle:
                pose3d_homo = bundle_adjustment(pose3d_homo, person, self.dataset, pose_mat,
                                                sub_imgid2cam, logging=logger)
            pose3d = pose3d_homo[:3] / (pose3d_homo[3] + 10e-6)
            # pose3d -= ((pose3d[:, 11] + pose3d[:, 12]) / 2).reshape ( 3, -1 )  # No need to normalize to hip
            if check_bone_length(pose3d):
                multi_pose3d.append(pose3d)
            else:
                # logging.info ( f'A pose proposal deleted on {img_id}:{person}' )
                sub_imageid = list()
            chosen_img.append(sub_imageid)
        return multi_pose3d, chosen_img
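A small worked example of the dimGroup bookkeeping used in _estimate3d above. The numbers are invented for illustration: with 2, 3, and 2 detections in cameras 0-2, dimGroup is the cumulative [0, 2, 5, 7], and the loop maps each flat detection index back to its camera.

# Worked example (invented numbers) of the dimGroup -> sub_imgid2cam mapping.
import numpy as np

dimGroup = [0, 2, 5, 7]  # cumulative detection counts: 2 in cam 0, 3 in cam 1, 2 in cam 2
sub_imgid2cam = np.zeros(dimGroup[-1], dtype=np.int32)
for idx, i in enumerate(range(len(dimGroup) - 1)):
    sub_imgid2cam[dimGroup[i]:dimGroup[i + 1]] = idx
print(sub_imgid2cam)  # -> [0 0 1 1 1 2 2]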
class MultiEstimator(object):
    def __init__(self, cfg, debug=False):
        self.est2d = Estimator_2d(DEBUGGING=debug)
        self.extractor = FeatureExtractor()
        self.cfg = cfg
        self.dataset = None
        # hand bbox
        self.yolo = YOLO("/home/yxs/lit/mvpose/backend/yolo_hand_detection/models/cross-hands.cfg",
                         "/home/yxs/lit/mvpose/backend/yolo_hand_detection/models/cross-hands.weights",
                         ["hand"])
        self.sample_range = 120
        self.expand_rate = 1.5
        # hand pose
        self.tester = Tester('49')
        self.tester._make_batch_generator('test', 'all')
        self.tester._make_model()
        # coord transformation
        self.transform = lambda R, P: (R.t() @ P.t()).t()

    def expand_bbox(self, b, x_max, y_max):
        xl = np.clip(int(b[0] - b[2] * (self.expand_rate - 1) / 2.0), 0, x_max)
        xr = np.clip(int(b[0] + b[2] + b[2] * (self.expand_rate - 1) / 2.0), 0, x_max)
        yt = np.clip(int(b[1] - b[3] * (self.expand_rate - 1) / 2.0), 0, y_max)
        yb = np.clip(int(b[1] + b[3] + b[3] * (self.expand_rate - 1) / 2.0), 0, y_max)
        expanded_b = [xl, yt, xr - xl, yb - yt]
        return expanded_b

    def predict(self, imgs, camera_parameter, template_name='Shelf', show=False, plt_id=0):
        # 2D human pose estimation for all views
        info_dict = self._infer_single2d(imgs)
        self.dataset = MemDataset(info_dict=info_dict, camera_parameter=camera_parameter,
                                  template_name=template_name)
        # 3D human pose reconstruction
        pose3d = self._estimate3d(0, show=show, plt_id=plt_id)
        # hand bbox detection for all views
        pose2d, hand_bbox = self._detect_hand(imgs, info_dict)
        # 3D hand pose estimation
        cam_id = 1
        hand_pose = self._estimate_hand_pose(imgs, hand_bbox, camera_parameter, cam_id)
        return pose3d, pose2d, hand_bbox, hand_pose

    def _detect_hand(self, imgs, info_dict):
        pose2d, hand_bbox = [], []
        for cam_id, img in enumerate(imgs):  # iterate over each view
            pose_tmp, hand_tmp = [], []
            for res in info_dict[cam_id][0]:  # iterate over each person
                pose_tmp.append(res['pose2d'])
                for joint_id in [9, 10]:
                    # get rough bbox
                    cx1, cy1 = res['pose2d'][joint_id * 3], res['pose2d'][joint_id * 3 + 1]  # left/right wrist joint position
                    max_x1, max_y1 = img.shape[1] - 1, img.shape[0] - 1
                    xl1, xr1 = np.clip(int(cx1 - self.sample_range), 0, max_x1), np.clip(int(cx1 + self.sample_range), 0, max_x1)  # x range of the rough bbox
                    yt1, yb1 = np.clip(int(cy1 - self.sample_range), 0, max_y1), np.clip(int(cy1 + self.sample_range), 0, max_y1)  # y range of the rough bbox
                    cropped_img = img[yt1:yb1, xl1:xr1, :]
                    # get accurate bbox
                    cnt, conf, bbox = yolo_hand_detection(self.yolo, cropped_img)
                    bbox = [[b[0] + xl1, b[1] + yt1, b[2], b[3]] for b in bbox]  # map bbox back to original-image coordinates, [x, y, w, h]
                    if cnt == 0:
                        hand_tmp.append(bbox)
                    else:
                        dist = [(cx1 - (b[0] + b[2] / 2)) ** 2 + (cy1 - (b[1] + b[3] / 2)) ** 2 for b in bbox]  # squared distance from each bbox center to the joint
                        hand_tmp.append(self.expand_bbox(bbox[np.argmin(dist)], max_x1, max_y1))  # pick the bbox closest to the joint
            pose2d.append(pose_tmp)
            hand_bbox.append(hand_tmp)
        return pose2d, hand_bbox

    def _estimate_hand_pose(self, imgs, hand_bbox, camera_parameter, cam_id):
        img, bbox = imgs[cam_id], hand_bbox[cam_id]
        R = torch.tensor(camera_parameter['RT'][cam_id][:, :-1].astype(np.float32))  # rotation matrix of the cam_id view
        hand_pose = []
        with torch.no_grad():
            for b in bbox:
                if len(b) > 0:
                    # crop img
                    img_patch = cv2.resize(img[b[1]:b[1] + b[3], b[0]:b[0] + b[2], :],
                                           (int(cfg.input_img_shape[1]), int(cfg.input_img_shape[0])),
                                           interpolation=cv2.INTER_LINEAR)
                    img_patch = img_patch.astype(np.float32)
                    trans = transforms.ToTensor()
                    input = trans(img_patch.astype(np.float32)) / 255.
                    input = input.unsqueeze(0)
                    inputs = {'img': input}
                    out = self.tester.model(inputs, 'test')
                    joint_coord_out = out['joint_coord'].cpu().numpy()
                    hand_type_out = out['hand_type'].cpu().numpy()
                    idx = np.argmax(hand_type_out)
                    if hand_type_out[0][idx] < 0.8:  # skip results with confidence below 0.8
                        hand_pose.append([])
                        continue
                    print(hand_type_out, idx)
                    # Pick the higher-confidence hand and translate it so the wrist is the origin.
                    cur_pose = joint_coord_out[0, 21 * idx:21 * (idx + 1), :] - joint_coord_out[0, 21 * idx, :]
                    hand_pose.append(self.transform(R, torch.tensor(cur_pose)).numpy())
                else:
                    hand_pose.append([])
        return hand_pose

    def _infer_single2d(self, imgs, img_id=0, dir='/home/jiangwen/tmp/Multi'):
        '''
        img_id is always 0 here.
        results[0].keys() = dict_keys(['image_id', 'category_id', 'score', 'keypoints', 'bbox', 'heatmaps', 'crops'])
        '''
        info_dict = dict()
        for cam_id, img in enumerate(imgs):
            results = self.est2d.estimate_2d(img, img_id)  # type(results): list
            # results = self.est2d.light_estimate_2d(img)
            this_info_dict = {'image_data': cv2.cvtColor(img.copy(), cv2.COLOR_BGR2RGB)}
            this_info_dict[img_id] = list()
            for person_id, result in enumerate(results):  # type(result): dict
                this_info_dict[img_id].append(dict())
                this_info_dict[img_id][person_id]['pose2d'] = result['keypoints']  # len(result['keypoints']) = 51 -> (17*3)
                # NOTE: bbox is (x, y) (W, H) format where x and y is up-left point.
                this_info_dict[img_id][person_id]['bbox'] = result['bbox']
                bb = np.array(result['bbox'], dtype=int)
                cropped_img = img[bb[1]:bb[1] + bb[3], bb[0]:bb[0] + bb[2]]
                # numpy format of crop idx is changed to json
                this_info_dict[img_id][person_id]['heatmap_bbox'] = result['crops'].astype(int).tolist()  # = [x, y, x+w, y+h]
                this_info_dict[img_id][person_id]['heatmap_data'] = result['heatmaps']  # .shape = (17, h, w)
                this_info_dict[img_id][person_id]['cropped_img'] = cv2.cvtColor(cropped_img.copy(), cv2.COLOR_BGR2RGB)
            info_dict[cam_id] = this_info_dict
        return info_dict

    def _estimate3d(self, img_id, show=False, plt_id=0):
        data_batch = self.dataset[img_id]
        # ReID affinity matrix
        affinity_mat = self.extractor.get_affinity(data_batch, rerank=self.cfg.rerank)
        if self.cfg.rerank:
            affinity_mat = torch.from_numpy(affinity_mat)
            affinity_mat = torch.max(affinity_mat, affinity_mat.t())
            affinity_mat = 1 - affinity_mat
        else:
            affinity_mat = affinity_mat.cpu()
        dimGroup = self.dataset.dimGroup[img_id]
        info_list = list()
        for cam_id in self.dataset.cam_names:
            info_list += self.dataset.info_dict[cam_id][img_id]  # list of dicts: one per person
        # Geometry affinity matrix
        pose_mat = np.array([i['pose2d'] for i in info_list]).reshape(-1, model_cfg.joint_num, 3)[..., :2]  # shape = (total person count, num_joint, 2)
        geo_affinity_mat = geometry_affinity(pose_mat.copy(), self.dataset.F.numpy(),
                                             self.dataset.dimGroup[img_id])
        geo_affinity_mat = torch.tensor(geo_affinity_mat)
        if self.cfg.metric == 'geometry mean':
            W = torch.sqrt(affinity_mat * geo_affinity_mat)
        elif self.cfg.metric == 'circle':
            W = torch.sqrt((affinity_mat ** 2 + geo_affinity_mat ** 2) / 2)
        elif self.cfg.metric == 'Geometry only':
            W = torch.tensor(geo_affinity_mat)
        elif self.cfg.metric == 'ReID only':
            W = torch.tensor(affinity_mat)
        else:
            logger.critical('Fell through to the default option; did you intend that?')
            _alpha = 0.8
            W = 0.8 * affinity_mat + (1 - _alpha) * geo_affinity_mat
        W[torch.isnan(W)] = 0  # Sometimes (e.g. Shelf img 452) torch.sqrt returns nan if the input is too small
        sub_imgid2cam = np.zeros(pose_mat.shape[0], dtype=np.int32)
        for idx, i in enumerate(range(len(dimGroup) - 1)):
            sub_imgid2cam[dimGroup[i]:dimGroup[i + 1]] = idx  # sub_imgid2cam[person_id] = cam_id
        num_person = 10
        X0 = torch.rand(W.shape[0], num_person)
        # Use spectral method to initialize assignment matrix.
        if self.cfg.spectral:
            eig_value, eig_vector = W.eig(eigenvectors=True)
            _, eig_idx = torch.sort(eig_value[:, 0], descending=True)
            if W.shape[1] >= num_person:
                X0 = eig_vector[eig_idx[:num_person]].t()
            else:
                X0[:, :W.shape[1]] = eig_vector.t()
        match_mat = matchSVT(W, dimGroup, alpha=self.cfg.alpha_SVT, _lambda=self.cfg.lambda_SVT,
                             dual_stochastic_SVT=self.cfg.dual_stochastic_SVT)
        bin_match = match_mat[:, torch.nonzero(torch.sum(match_mat, dim=0) > 1.9).squeeze()] > 0.9
        bin_match = bin_match.reshape(W.shape[0], -1)
        matched_list = [[] for i in range(bin_match.shape[1])]
        for sub_imgid, row in enumerate(bin_match):
            if row.sum() != 0:
                pid = row.argmax()
                matched_list[pid].append(sub_imgid)
        matched_list = [np.array(i) for i in matched_list]
        if self.cfg.hybrid:
            multi_pose3d = self._hybrid_kernel(matched_list, pose_mat, sub_imgid2cam, img_id)
            chosen_img = [[]] * len(sub_imgid2cam)
        else:
            multi_pose3d, chosen_img = self._top_down_pose_kernel(geo_affinity_mat, matched_list,
                                                                  pose_mat, sub_imgid2cam)
        if show:  # hybrid not implemented yet.
            bin_match = match_mat[:, torch.nonzero(torch.sum(match_mat, dim=0) > 0.9).squeeze()] > 0.9
            bin_match = bin_match.reshape(W.shape[0], -1)
            matched_list = [[] for i in range(bin_match.shape[1])]
            for sub_imgid, row in enumerate(bin_match):
                if row.sum() != 0:
                    pid = row.argmax()
                    matched_list[pid].append(sub_imgid)
            matched_list = [np.array(i) for i in matched_list]
            show_panel_mem(self.dataset, matched_list, info_list, sub_imgid2cam, img_id,
                           affinity_mat, geo_affinity_mat, W, plt_id, multi_pose3d)
            plotPaperRows(self.dataset, matched_list, info_list, sub_imgid2cam, img_id,
                          affinity_mat, geo_affinity_mat, W, plt_id, multi_pose3d)
        return multi_pose3d

    def _hybrid_kernel(self, matched_list, pose_mat, sub_imgid2cam, img_id):
        return pictorial.hybrid_kernel(self, matched_list, pose_mat, sub_imgid2cam, img_id)
        # NOTE: everything below this return is unreachable; it is the legacy in-class
        # implementation kept for reference.
        multi_pose3d = list()
        for person in matched_list:
            # use bottom-up approach to get the 3D pose of person
            if person.shape[0] <= 1:
                continue
            # step1: use the 2D joint of person to triangulate the 3D joints candidates
            # person's 17 3D joints candidates
            candidates = np.zeros((17, person.shape[0] * (person.shape[0] - 1) // 2, 3))  # 17xC^2_nx3
            cnt = 0
            for i in range(person.shape[0]):
                for j in range(i + 1, person.shape[0]):
                    cam_id_i, cam_id_j = sub_imgid2cam[person[i]], sub_imgid2cam[person[j]]
                    projmat_i, projmat_j = self.dataset.P[cam_id_i], self.dataset.P[cam_id_j]
                    pose2d_i, pose2d_j = pose_mat[person[i]].T, pose_mat[person[j]].T
                    pose3d_homo = cv2.triangulatePoints(projmat_i, projmat_j, pose2d_i, pose2d_j)
                    pose3d_ij = pose3d_homo[:3] / pose3d_homo[3]
                    candidates[:, cnt] += pose3d_ij.T
                    cnt += 1
            unary = self.dataset.get_unary(person, sub_imgid2cam, candidates, img_id)
            # step2: use the max-product algorithm to inference to get the 3d joint of the person
            # change the coco order
            coco_2_skel = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
            candidates = np.array(candidates)[coco_2_skel]
            unary = unary[coco_2_skel]
            skel = pictorial.getskel()
            # construct pictorial model
            edges = pictorial.getPictoStruct(skel, self.dataset.distribution)
            xp = pictorial.inferPict3D_MaxProd(unary, edges, candidates)
            human = np.array([candidates[i][j] for i, j in zip(range(xp.shape[0]), xp)])
            human_coco = np.zeros((17, 3))
            human_coco[[0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]] = human
            human_coco[[1, 2, 3, 4]] = human_coco[0]  # Just make visualize beauty not real ear and eye
            human_coco = human_coco.T
            if self.cfg.reprojection_refine and len(person) > 2:
                for joint_idx in range(human_coco.shape[1]):
                    reprojected_error = np.zeros(len(person))
                    for idx, pid in enumerate(person):
                        human_coco_homo = np.ones(4)
                        human_coco_homo[:3] = human_coco[:, joint_idx]
                        projected_pose_homo = self.dataset.P[sub_imgid2cam[pid]] @ human_coco_homo
                        projected_pose = projected_pose_homo[:2] / projected_pose_homo[2]
                        reprojected_error[idx] += np.linalg.norm(projected_pose - pose_mat[pid, joint_idx])
                    # import IPython; IPython.embed()
                    # pose_select = reprojected_error < self.cfg.refine_threshold
                    pose_select = (reprojected_error - reprojected_error.mean()) / reprojected_error.std() < self.cfg.refine_threshold
                    if pose_select.sum() >= 2:
                        Ps = list()
                        Ys = list()
                        for idx, is_selected in enumerate(pose_select):
                            if is_selected:
                                Ps.append(self.dataset.P[sub_imgid2cam[person[idx]]])
                                Ys.append(pose_mat[person[idx], joint_idx].reshape(-1, 1))
                        Ps = torch.tensor(Ps, dtype=torch.float32)
                        Ys = torch.tensor(Ys, dtype=torch.float32)
                        Xs = multiTriIter(Ps, Ys)
                        refined_pose = (Xs[:3] / Xs[3]).numpy()
                        human_coco[:, joint_idx] = refined_pose.reshape(-1)
            if True or check_bone_length(human_coco):
                multi_pose3d.append(human_coco)
        return multi_pose3d

    def _top_down_pose_kernel(self, geo_affinity_mat, matched_list, pose_mat, sub_imgid2cam):
        multi_pose3d = list()
        chosen_img = list()
        for person in matched_list:
            Graph = geo_affinity_mat[person][:, person].clone().numpy()
            Graph *= (1 - np.eye(Graph.shape[0]))  # make diagonal 0
            if len(Graph) < 2:
                continue
            elif len(Graph) > 2:
                if self.cfg.use_mincut:
                    cut0, cut1 = find_mincut(Graph.copy())
                    cut = cut0 if len(cut0) > len(cut1) else cut1
                    cut = cut.astype(int)
                    sub_imageid = person[cut]
                else:
                    sub_imageid = get_min_reprojection_error(person, self.dataset, pose_mat, sub_imgid2cam)
            else:
                sub_imageid = person
            _, rank = torch.sort(geo_affinity_mat[sub_imageid][:, sub_imageid].sum(dim=0))
            sub_imageid = sub_imageid[rank[:2]]
            cam_id_0, cam_id_1 = sub_imgid2cam[sub_imageid[0]], sub_imgid2cam[sub_imageid[1]]
            projmat_0, projmat_1 = self.dataset.P[cam_id_0], self.dataset.P[cam_id_1]
            pose2d_0, pose2d_1 = pose_mat[sub_imageid[0]].T, pose_mat[sub_imageid[1]].T
            pose3d_homo = cv2.triangulatePoints(projmat_0, projmat_1, pose2d_0, pose2d_1)
            if self.cfg.use_bundle:
                pose3d_homo = bundle_adjustment(pose3d_homo, person, self.dataset, pose_mat,
                                                sub_imgid2cam, logging=logger)
            pose3d = pose3d_homo[:3] / (pose3d_homo[3] + 10e-6)
            # pose3d -= ((pose3d[:, 11] + pose3d[:, 12]) / 2).reshape ( 3, -1 )  # No need to normalize to hip
            if check_bone_length(pose3d):
                multi_pose3d.append(pose3d)
            else:
                # logging.info ( f'A pose proposal deleted on {img_id}:{person}' )
                sub_imageid = list()
            chosen_img.append(sub_imageid)
        return multi_pose3d, chosen_img
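A standalone sketch of the two-view triangulation used by both kernels above. The projection matrices and 2D points are invented for illustration; only the cv2.triangulatePoints call and the homogeneous division mirror the real code.

# Triangulation sketch (invented numbers): recover one 3D joint from two views.
import cv2
import numpy as np

P0 = np.hstack([np.eye(3), np.zeros((3, 1))])                     # camera 0 at the origin
P1 = np.hstack([np.eye(3), np.array([[-1.0], [0.0], [0.0]])])     # camera 1 shifted along x
pts0 = np.array([[0.1, 0.2]], dtype=np.float64).T  # shape (2, N): one joint in view 0
pts1 = np.array([[-0.1, 0.2]], dtype=np.float64).T  # the same joint in view 1

X_homo = cv2.triangulatePoints(P0, P1, pts0, pts1)  # (4, N) homogeneous coordinates
X = X_homo[:3] / X_homo[3]                          # divide by w, as the kernels do
print(X.T)  # -> approx [[0.5, 1.0, 5.0]]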
class MultiEstimator(object):
    def __init__(self, cfg, debug=False):
        self.est2d = Estimator_2d(DEBUGGING=debug)
        self.extractor = FeatureExtractor()
        self.cfg = cfg
        self.dataset = None

    def predict(self, imgs, camera_parameter, template_name='Shelf', show=False, plt_id=0):
        info_dict = self._infer_single2d(imgs)
        self.dataset = MemDataset(info_dict=info_dict, camera_parameter=camera_parameter,
                                  template_name=template_name)
        return self._estimate3d(0, show=show, plt_id=plt_id)

    def _infer_single2d(self, imgs, img_id=0, dir='/home/jiangwen/tmp/Multi'):
        info_dict = dict()
        for cam_id, img in enumerate(imgs):
            results = self.est2d.estimate_2d(img, img_id)
            this_info_dict = {'image_data': cv2.cvtColor(img.copy(), cv2.COLOR_BGR2RGB)}
            this_info_dict[img_id] = list()
            for person_id, result in enumerate(results):
                this_info_dict[img_id].append(dict())
                this_info_dict[img_id][person_id]['pose2d'] = result['keypoints']
                # NOTE: bbox is (x, y) (W, H) format where x and y is up-left point.
                this_info_dict[img_id][person_id]['bbox'] = result['bbox']
                bb = np.array(result['bbox'], dtype=int)
                cropped_img = img[bb[1]:bb[1] + bb[3], bb[0]:bb[0] + bb[2]]
                # numpy format of crop idx is changed to json
                this_info_dict[img_id][person_id]['heatmap_bbox'] = result['crops'].astype(int).tolist()
                this_info_dict[img_id][person_id]['heatmap_data'] = result['heatmaps']
                this_info_dict[img_id][person_id]['cropped_img'] = cv2.cvtColor(
                    cropped_img.copy(), cv2.COLOR_BGR2RGB)
            info_dict[cam_id] = this_info_dict
        return info_dict

    def _estimate3d(self, img_id, show=False, plt_id=0):
        data_batch = self.dataset[img_id]
        affinity_mat = self.extractor.get_affinity(data_batch, rerank=self.cfg.rerank)
        if self.cfg.rerank:
            affinity_mat = torch.from_numpy(affinity_mat)
            affinity_mat = torch.max(affinity_mat, affinity_mat.t())
            affinity_mat = 1 - affinity_mat
        else:
            affinity_mat = affinity_mat.cpu()
        dimGroup = self.dataset.dimGroup[img_id]
        info_list = list()
        for cam_id in self.dataset.cam_names:
            info_list += self.dataset.info_dict[cam_id][img_id]
        pose_mat = np.array([i['pose2d'] for i in info_list]).reshape(-1, model_cfg.joint_num, 3)[..., :2]
        geo_affinity_mat = geometry_affinity(pose_mat.copy(), self.dataset.F.numpy(),
                                             self.dataset.dimGroup[img_id])
        geo_affinity_mat = torch.tensor(geo_affinity_mat)
        if self.cfg.metric == 'geometry mean':
            W = torch.sqrt(affinity_mat * geo_affinity_mat)
        elif self.cfg.metric == 'circle':
            W = torch.sqrt((affinity_mat ** 2 + geo_affinity_mat ** 2) / 2)
        elif self.cfg.metric == 'Geometry only':
            W = torch.tensor(geo_affinity_mat)
        elif self.cfg.metric == 'ReID only':
            W = torch.tensor(affinity_mat)
        else:
            logger.critical('Fell through to the default option; did you intend that?')
            _alpha = 0.3
            W = 0.3 * affinity_mat + (1 - _alpha) * geo_affinity_mat
        W[torch.isnan(W)] = 0  # Sometimes (e.g. Shelf img 452) torch.sqrt returns nan if the input is too small
        sub_imgid2cam = np.zeros(pose_mat.shape[0], dtype=np.int32)
        for idx, i in enumerate(range(len(dimGroup) - 1)):
            sub_imgid2cam[dimGroup[i]:dimGroup[i + 1]] = idx
        num_person = 10
        X0 = torch.rand(W.shape[0], num_person)
        # Use spectral method to initialize assignment matrix.
        if self.cfg.spectral:
            eig_value, eig_vector = W.eig(eigenvectors=True)
            _, eig_idx = torch.sort(eig_value[:, 0], descending=True)
            if W.shape[1] >= num_person:
                X0 = eig_vector[eig_idx[:num_person]].t()
            else:
                X0[:, :W.shape[1]] = eig_vector.t()
        match_mat = matchSVT(W, dimGroup, alpha=self.cfg.alpha_SVT, _lambda=self.cfg.lambda_SVT,
                             dual_stochastic_SVT=self.cfg.dual_stochastic_SVT)
        bin_match = match_mat[:, torch.nonzero(torch.sum(match_mat, dim=0) > 1.9).squeeze()] > 0.9
        bin_match = bin_match.reshape(W.shape[0], -1)
        matched_list = [[] for i in range(bin_match.shape[1])]
        for sub_imgid, row in enumerate(bin_match):
            if row.sum() != 0:
                pid = row.argmax()
                matched_list[pid].append(sub_imgid)
        matched_list = [np.array(i) for i in matched_list]
        if self.cfg.hybrid:
            multi_pose3d = self._hybrid_kernel(matched_list, pose_mat, sub_imgid2cam, img_id)
            chosen_img = [[]] * len(sub_imgid2cam)
        else:
            multi_pose3d, chosen_img = self._top_down_pose_kernel(geo_affinity_mat, matched_list,
                                                                  pose_mat, sub_imgid2cam)
        if show:  # hybrid not implemented yet.
            bin_match = match_mat[:, torch.nonzero(torch.sum(match_mat, dim=0) > 0.9).squeeze()] > 0.9
            bin_match = bin_match.reshape(W.shape[0], -1)
            matched_list = [[] for i in range(bin_match.shape[1])]
            for sub_imgid, row in enumerate(bin_match):
                if row.sum() != 0:
                    pid = row.argmax()
                    matched_list[pid].append(sub_imgid)
            matched_list = [np.array(i) for i in matched_list]
            show_panel_mem(self.dataset, matched_list, info_list, sub_imgid2cam, img_id,
                           affinity_mat, geo_affinity_mat, W, plt_id, multi_pose3d)
            plotPaperRows(self.dataset, matched_list, info_list, sub_imgid2cam, img_id,
                          affinity_mat, geo_affinity_mat, W, plt_id, multi_pose3d)
        return multi_pose3d

    def _hybrid_kernel(self, matched_list, pose_mat, sub_imgid2cam, img_id):
        return pictorial.hybrid_kernel(self, matched_list, pose_mat, sub_imgid2cam, img_id)
        # NOTE: everything below this return is unreachable; it is the legacy in-class
        # implementation kept for reference.
        multi_pose3d = list()
        for person in matched_list:
            # use bottom-up approach to get the 3D pose of person
            if person.shape[0] <= 1:
                continue
            # step1: use the 2D joint of person to triangulate the 3D joints candidates
            # person's 17 3D joints candidates
            candidates = np.zeros((17, person.shape[0] * (person.shape[0] - 1) // 2, 3))  # 17xC^2_nx3
            cnt = 0
            for i in range(person.shape[0]):
                for j in range(i + 1, person.shape[0]):
                    cam_id_i, cam_id_j = sub_imgid2cam[person[i]], sub_imgid2cam[person[j]]
                    projmat_i, projmat_j = self.dataset.P[cam_id_i], self.dataset.P[cam_id_j]
                    pose2d_i, pose2d_j = pose_mat[person[i]].T, pose_mat[person[j]].T
                    pose3d_homo = cv2.triangulatePoints(projmat_i, projmat_j, pose2d_i, pose2d_j)
                    pose3d_ij = pose3d_homo[:3] / pose3d_homo[3]
                    candidates[:, cnt] += pose3d_ij.T
                    cnt += 1
            unary = self.dataset.get_unary(person, sub_imgid2cam, candidates, img_id)
            # step2: use the max-product algorithm to inference to get the 3d joint of the person
            # change the coco order
            coco_2_skel = [0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
            candidates = np.array(candidates)[coco_2_skel]
            unary = unary[coco_2_skel]
            skel = pictorial.getskel()
            # construct pictorial model
            edges = pictorial.getPictoStruct(skel, self.dataset.distribution)
            xp = pictorial.inferPict3D_MaxProd(unary, edges, candidates)
            human = np.array([candidates[i][j] for i, j in zip(range(xp.shape[0]), xp)])
            human_coco = np.zeros((17, 3))
            human_coco[[0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]] = human
            human_coco[[1, 2, 3, 4]] = human_coco[0]  # Just make visualize beauty not real ear and eye
            human_coco = human_coco.T
            if self.cfg.reprojection_refine and len(person) > 2:
                for joint_idx in range(human_coco.shape[1]):
                    reprojected_error = np.zeros(len(person))
                    for idx, pid in enumerate(person):
                        human_coco_homo = np.ones(4)
                        human_coco_homo[:3] = human_coco[:, joint_idx]
                        projected_pose_homo = self.dataset.P[sub_imgid2cam[pid]] @ human_coco_homo
                        projected_pose = projected_pose_homo[:2] / projected_pose_homo[2]
                        reprojected_error[idx] += np.linalg.norm(projected_pose - pose_mat[pid, joint_idx])
                    # import IPython; IPython.embed()
                    # pose_select = reprojected_error < self.cfg.refine_threshold
                    pose_select = (reprojected_error - reprojected_error.mean()) / reprojected_error.std() < self.cfg.refine_threshold
                    if pose_select.sum() >= 2:
                        Ps = list()
                        Ys = list()
                        for idx, is_selected in enumerate(pose_select):
                            if is_selected:
                                Ps.append(self.dataset.P[sub_imgid2cam[person[idx]]])
                                Ys.append(pose_mat[person[idx], joint_idx].reshape(-1, 1))
                        Ps = torch.tensor(Ps, dtype=torch.float32)
                        Ys = torch.tensor(Ys, dtype=torch.float32)
                        Xs = multiTriIter(Ps, Ys)
                        refined_pose = (Xs[:3] / Xs[3]).numpy()
                        human_coco[:, joint_idx] = refined_pose.reshape(-1)
            if True or check_bone_length(human_coco):
                multi_pose3d.append(human_coco)
        return multi_pose3d

    def _top_down_pose_kernel(self, geo_affinity_mat, matched_list, pose_mat, sub_imgid2cam):
        multi_pose3d = list()
        chosen_img = list()
        for person in matched_list:
            Graph = geo_affinity_mat[person][:, person].clone().numpy()
            Graph *= (1 - np.eye(Graph.shape[0]))  # make diagonal 0
            if len(Graph) < 2:
                continue
            elif len(Graph) > 2:
                if self.cfg.use_mincut:
                    cut0, cut1 = find_mincut(Graph.copy())
                    cut = cut0 if len(cut0) > len(cut1) else cut1
                    cut = cut.astype(int)
                    sub_imageid = person[cut]
                else:
                    sub_imageid = get_min_reprojection_error(person, self.dataset, pose_mat, sub_imgid2cam)
            else:
                sub_imageid = person
            _, rank = torch.sort(geo_affinity_mat[sub_imageid][:, sub_imageid].sum(dim=0))
            sub_imageid = sub_imageid[rank[:2]]
            cam_id_0, cam_id_1 = sub_imgid2cam[sub_imageid[0]], sub_imgid2cam[sub_imageid[1]]
            projmat_0, projmat_1 = self.dataset.P[cam_id_0], self.dataset.P[cam_id_1]
            pose2d_0, pose2d_1 = pose_mat[sub_imageid[0]].T, pose_mat[sub_imageid[1]].T
            pose3d_homo = cv2.triangulatePoints(projmat_0, projmat_1, pose2d_0, pose2d_1)
            if self.cfg.use_bundle:
                pose3d_homo = bundle_adjustment(pose3d_homo, person, self.dataset, pose_mat,
                                                sub_imgid2cam, logging=logger)
            pose3d = pose3d_homo[:3] / (pose3d_homo[3] + 10e-6)
            # pose3d -= ((pose3d[:, 11] + pose3d[:, 12]) / 2).reshape ( 3, -1 )  # No need to normalize to hip
            if check_bone_length(pose3d):
                multi_pose3d.append(pose3d)
            else:
                # logging.info ( f'A pose proposal deleted on {img_id}:{person}' )
                sub_imageid = list()
            chosen_img.append(sub_imageid)
        return multi_pose3d, chosen_img
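To illustrate the binarization step shared by all three class variants, a toy example with invented matrix values: columns of match_mat whose sum exceeds 1.9 correspond to identities seen in at least two views, and each detection is then assigned to its strongest column. The only deviation from the code above is an int cast before argmax, for portability across torch versions.

# Toy example (invented values) of turning a soft match matrix into matched_list.
import numpy as np
import torch

match_mat = torch.tensor([
    [0.95, 0.02],   # detection 0 (cam 0) -> identity 0
    [0.03, 0.97],   # detection 1 (cam 0) -> identity 1
    [0.96, 0.01],   # detection 2 (cam 1) -> identity 0
    [0.02, 0.98],   # detection 3 (cam 1) -> identity 1
    [0.10, 0.05],   # detection 4 (cam 2): spurious, matched nowhere
])
keep = torch.nonzero(torch.sum(match_mat, dim=0) > 1.9).squeeze()
bin_match = (match_mat[:, keep] > 0.9).reshape(match_mat.shape[0], -1)
matched_list = [[] for _ in range(bin_match.shape[1])]
for sub_imgid, row in enumerate(bin_match):
    if row.sum() != 0:
        matched_list[int(row.int().argmax())].append(sub_imgid)
print([np.array(i) for i in matched_list])  # -> [array([0, 2]), array([1, 3])]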
def evaluate(model, actor3D, range_, loader, is_info_dicts=False, dump_dir=None):
    # actor3D is an array of 3 arrays, each of shape (2000, 1),
    # so check_result has shape (2000, 3, 10): frame x actor x bone.
    check_result = np.zeros((len(actor3D[0]), len(actor3D), 10), dtype=np.int32)
    accuracy_cnt = 0
    error_cnt = 0
    for idx, imgs in enumerate(tqdm(loader)):
        img_id = range_[idx]
        try:
            if is_info_dicts:
                info_dicts = numpify(imgs)
                model.dataset = MemDataset(info_dict=info_dicts, camera_parameter=camera_parameter,
                                           template_name='Unified')
                poses3d = model._estimate3d(0, show=False)
            else:
                # Same as in the demo export: imgs is a list of 3 tensors,
                # each img_batch of shape (288, 360, 3).
                this_imgs = list()
                for img_batch in imgs:
                    this_imgs.append(img_batch.squeeze().numpy())
                poses3d = model.predict(imgs=this_imgs, camera_parameter=camera_parameter,
                                        template_name='Unified', show=False)
        except Exception as e:
            logger.critical(e)
            poses3d = False
        for pid in range(len(actor3D)):
            if actor3D[pid][img_id][0].shape == (1, 0) or actor3D[pid][img_id][0].shape == (0, 0):
                continue
            if not poses3d:
                check_result[img_id, pid, :] = -1
                logger.error(f'Cannot get any pose in img:{img_id}')
                continue
            model_poses = np.stack([coco2shelf3D(i) for i in deepcopy(poses3d)])
            gt_pose = actor3D[pid][img_id][0]
            dist = vectorize_distance(np.expand_dims(gt_pose, 0), model_poses)
            model_pose = model_poses[np.argmin(dist[0])]
            bones = [[0, 1], [1, 2], [3, 4], [4, 5], [6, 7], [7, 8], [9, 10], [10, 11], [12, 13]]
            for i, bone in enumerate(bones):
                start_point, end_point = bone
                if is_right(model_pose[start_point], model_pose[end_point],
                            gt_pose[start_point], gt_pose[end_point]):
                    check_result[img_id, pid, i] = 1
                    accuracy_cnt += 1
                else:
                    check_result[img_id, pid, i] = -1
                    error_cnt += 1
            gt_hip = (gt_pose[2] + gt_pose[3]) / 2
            model_hip = (model_pose[2] + model_pose[3]) / 2
            if is_right(model_hip, model_pose[12], gt_hip, gt_pose[12]):
                check_result[img_id, pid, -1] = 1
                accuracy_cnt += 1
            else:
                check_result[img_id, pid, -1] = -1
                error_cnt += 1
    bone_group = OrderedDict([('Head', np.array([8])), ('Torso', np.array([9])),
                              ('Upper arms', np.array([5, 6])), ('Lower arms', np.array([4, 7])),
                              ('Upper legs', np.array([1, 2])), ('Lower legs', np.array([0, 3]))])
    total_avg = np.sum(check_result > 0) / np.sum(np.abs(check_result))
    person_wise_avg = np.sum(check_result > 0, axis=(0, 2)) / np.sum(np.abs(check_result), axis=(0, 2))
    bone_wise_result = OrderedDict()
    bone_person_wise_result = OrderedDict()
    for k, v in bone_group.items():
        bone_wise_result[k] = np.sum(check_result[:, :, v] > 0) / np.sum(np.abs(check_result[:, :, v]))
        bone_person_wise_result[k] = np.sum(check_result[:, :, v] > 0, axis=(0, 2)) / np.sum(np.abs(check_result[:, :, v]), axis=(0, 2))
    tb = PrettyTable()
    tb.field_names = ['Bone Group'] + [f'Actor {i}' for i in range(bone_person_wise_result['Head'].shape[0])] + ['Average']
    list_tb = [tb.field_names]
    for k, v in bone_person_wise_result.items():
        this_row = [k] + [np.char.mod('%.4f', i) for i in v] + [np.char.mod('%.4f', np.sum(v) / len(v))]
        list_tb.append([float(i) if isinstance(i, type(np.array([]))) else i for i in this_row])
        tb.add_row(this_row)
    this_row = ['Total'] + [np.char.mod('%.4f', i) for i in person_wise_avg] + [
        np.char.mod('%.4f', np.sum(person_wise_avg) / len(person_wise_avg))]
    tb.add_row(this_row)
    list_tb.append([float(i) if isinstance(i, type(np.array([]))) else i for i in this_row])
    if dump_dir:
        np.save(osp.join(dump_dir, time.strftime(str(model_cfg.testing_on) + "_%Y_%m_%d_%H_%M",
                                                 time.localtime(time.time()))), check_result)
        with open(osp.join(dump_dir, time.strftime(str(model_cfg.testing_on) + "_%Y_%m_%d_%H_%M.csv",
                                                   time.localtime(time.time()))), 'w') as f:
            writer = csv.writer(f)
            writer.writerows(list_tb)
            writer.writerow([model_cfg])
    print(tb)
    print(model_cfg)
    return check_result, list_tb
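For reference, the bone-correctness test that is_right() performs is the standard PCP rule. A minimal sketch, assuming the usual 0.5 threshold; the real helper is defined elsewhere in the repo and may differ in details:

# Minimal sketch of the PCP bone-correctness test behind is_right()
# (assumes the standard 0.5 threshold; the actual helper lives elsewhere in the repo).
import numpy as np

def is_right_sketch(model_start, model_end, gt_start, gt_end, alpha=0.5):
    bone_len = np.linalg.norm(gt_end - gt_start)
    start_ok = np.linalg.norm(model_start - gt_start) < alpha * bone_len
    end_ok = np.linalg.norm(model_end - gt_end) < alpha * bone_len
    # A bone counts as correct when both endpoints lie within
    # alpha * bone length of their ground-truth positions.
    return start_ok and end_ok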