def accuracy_pixel(output, meta_data, cfgs=None, image_size=(256.0, 256.0), arg_max='hard'):
    """Pixel-wise distance computed from predicted heatmaps.

    The error is reported as a distance in pixels in the original image.

    Args:
        output: predicted heatmaps (numpy array or torch tensor).
        meta_data: dict with 'center', 'scale', 'rotation' and
            'original_joints' tensors describing the source-image crop.
        cfgs: optional config dict; when given, overrides ``image_size``
            with ``cfgs['heatmapModel']['input_size']``.
        image_size: (width, height) of the network input crop.
        arg_max: 'soft' or 'hard' heatmap decoding.

    Returns:
        (avg_acc, cnt, others): mean pixel distance, number of measured
        joints, and a dict with source coordinates, local predictions and
        heatmap peak values.
    """
    # Decode heatmaps into local coordinates with a soft or hard arg-max.
    if arg_max == 'soft':
        if isinstance(output, np.ndarray):
            pred, max_vals = lip.get_max_preds_soft(output)
        else:
            pred, max_vals = lip.get_max_preds_soft_pt(output)
    elif arg_max == 'hard':
        if not isinstance(output, np.ndarray):
            output = output.data.cpu().numpy()
        pred, max_vals = lip.get_max_preds(output)
    else:
        raise NotImplementedError

    if cfgs is not None:
        image_size = cfgs['heatmapModel']['input_size']

    # TODO: check the target generation and coordinate mapping
    # Move predictions to numpy and undo the heatmap down-sampling ratio.
    if not isinstance(pred, np.ndarray):
        pred = pred.data.cpu().numpy()
        max_vals = max_vals.data.cpu().numpy()
    pred *= image_size[0] / output.shape[3]

    # Inverse transform back to the source image and compare pixel distance.
    centers = meta_data['center'].data.cpu().numpy()
    scales = meta_data['scale'].data.cpu().numpy()
    rots = meta_data['rotation'].data.cpu().numpy()
    joints_original_batch = meta_data['original_joints'].data.cpu().numpy()

    distance_list = []
    all_src_coordinates = []
    for idx in range(len(pred)):
        trans_inv = lip.get_affine_transform(centers[idx],
                                             scales[idx],
                                             rots[idx],
                                             image_size,
                                             inv=1)
        src_coords = lip.affine_transform_modified(pred[idx], trans_inv)
        all_src_coordinates.append(src_coords.reshape(1, len(src_coords), 2))
        distance_list += get_distance(joints_original_batch[idx], src_coords)

    cnt = len(distance_list)
    avg_acc = sum(distance_list) / cnt
    others = {
        'src_coord': np.concatenate(all_src_coordinates, axis=0),
        'joints_pred': pred,
        'max_vals': max_vals,
    }
    return avg_acc, cnt, others
def get_keypoints(self, instances, records, is_cuda=True):
    """Forward pass to obtain the screen coordinates.

    Runs ``self.HC`` on the batched instance crops, scales the predicted
    normalized part coordinates to crop pixels, maps them back to the
    source image with the inverse affine transform stored in each record,
    and groups the results per image path.
    """
    if is_cuda:
        instances = instances.cuda()
    output = self.HC(instances)

    # Local part coordinates in the crop frame; output[1] is assumed to be
    # normalized coordinates scaled here by the crop resolution.
    width, height = self.resolution
    local_coord = output[1].data.cpu().numpy()
    local_coord *= np.array(self.resolution).reshape(1, 1, 2)

    # Transform each instance's local coordinates to screen coordinates.
    for idx in range(len(local_coord)):
        rec = records[idx]
        trans_inv = get_affine_transform(rec['center'],
                                         rec['scale'],
                                         rec['rotation'],
                                         (height, width),
                                         inv=1)
        rec['kpts'] = affine_transform_modified(local_coord[idx], trans_inv)

    # Assemble a dictionary where each key corresponds to one image.
    ret = {}
    for record in records:
        path = record['path']
        if path not in ret:
            ret[path] = self.new_img_dict()
        bucket = ret[path]
        bucket['kpts_2d_pred'].append(record['kpts'].reshape(1, -1))
        for field in ('center', 'scale', 'bbox_resize',
                      'label', 'score', 'rotation'):
            bucket[field].append(record[field])
    return ret
def get_keypoints(instances, records, model, image_size=(256, 256), arg_max='hard', is_cuda=True):
    """Forward pass to obtain the screen coordinates.

    Args:
        instances: batched instance crops fed to ``model``.
        records: per-instance metadata dicts carrying 'center', 'scale',
            'rotation', 'path', 'bbox_resize', 'label' and 'score'.
        model: network returning either a tuple (whose second element holds
            normalized coordinates) or heatmaps to be decoded by arg-max.
        image_size: (width, height) of the model input crop.
        arg_max: heatmap decoding mode; only 'hard' is implemented.
        is_cuda: move ``instances`` to GPU before the forward pass.

    Returns:
        Dict keyed by image path; each value accumulates per-instance
        predictions ('kpts_2d_pred', 'center', 'scale', ...).
    """
    if is_cuda:
        instances = instances.cuda()
    output = model(instances)

    # Hoist the output-format check that was previously duplicated.
    coords_regressed = isinstance(output, tuple)
    if coords_regressed:
        # Model regressed coordinates directly (output[1]); no arg-max needed.
        pred = output[1].data.cpu().numpy()
    elif arg_max == 'hard':
        if not isinstance(output, np.ndarray):
            output = output.data.cpu().numpy()
        pred, _ = get_max_preds(output)  # peak values unused here
    else:
        raise NotImplementedError

    # Rescale to crop pixels: regressed coordinates by the crop size
    # (assumes a square crop — TODO confirm), heatmap coordinates by the
    # heatmap-to-input down-sample ratio.
    if coords_regressed:
        pred *= image_size[0]
    else:
        pred *= image_size[0] / output.shape[3]

    # Map each instance's crop coordinates back to the source image.
    # (Removed an unused local and dead commented-out bbox-offset code.)
    for sample_idx in range(len(pred)):
        record = records[sample_idx]
        trans_inv = get_affine_transform(record['center'],
                                         record['scale'],
                                         record['rotation'],
                                         image_size,
                                         inv=1)
        record['kpts'] = affine_transform_modified(pred[sample_idx], trans_inv)

    # Assemble a dictionary where each key corresponds to one image.
    ret = {}
    for record in records:
        path = record['path']
        if path not in ret:
            ret[path] = {
                'center': [],
                'scale': [],
                'rotation': [],
                'bbox_resize': [],  # resized bounding box
                'kpts_2d_pred': [],
                'label': [],
                'score': [],
            }
        ret[path]['kpts_2d_pred'].append(record['kpts'].reshape(1, -1))
        ret[path]['center'].append(record['center'])
        ret[path]['scale'].append(record['scale'])
        ret[path]['bbox_resize'].append(record['bbox_resize'])
        ret[path]['label'].append(record['label'])
        ret[path]['score'].append(record['score'])
        ret[path]['rotation'].append(record['rotation'])
    return ret
def get_distance_src(output, meta_data, cfgs=None, image_size=(256.0, 256.0), arg_max='hard'):
    r"""From predicted heatmaps, obtain local coordinates (\phi_l in the paper)
    and transform them back to the source images based on metadata.

    Error is then evaluated on the source image for the screen coordinates
    (\phi_g in the paper), reported as a distance in pixels.

    Args:
        output: model output — a tuple (second element: regressed normalized
            coordinates), a numpy heatmap array, or a torch heatmap tensor.
        meta_data: dict with 'center', 'scale', 'original_joints' and
            optionally 'rotation'.
        cfgs: optional config dict; overrides ``image_size`` with
            ``cfgs['heatmapModel']['input_size']`` when given.
        image_size: (width, height) of the network input crop.
        arg_max: 'soft' or 'hard' heatmap decoding.

    Returns:
        (avg_acc, cnt, others): mean pixel distance, joint count, and a dict
        with screen coordinates, local predictions, peak values and PCK stats.
    """
    # Decode the network output into local coordinates.
    if isinstance(output, tuple):
        pred, max_vals = output[1].data.cpu().numpy(), None
    elif isinstance(output, np.ndarray) and arg_max == 'soft':
        pred, max_vals = lip.soft_arg_max_np(output)
    elif isinstance(output, torch.Tensor) and arg_max == 'soft':
        pred, max_vals = lip.soft_arg_max(output)
    elif isinstance(output, (np.ndarray, torch.Tensor)) and arg_max == 'hard':
        # BUG FIX: the original condition lacked parentheses —
        # `isinstance(a) or isinstance(b) and arg_max == 'hard'` — so a numpy
        # array hit this branch for ANY unrecognized arg_max value instead of
        # falling through to NotImplementedError (`and` binds tighter than `or`).
        if not isinstance(output, np.ndarray):
            output = output.data.cpu().numpy()
        pred, max_vals = lip.get_max_preds(output)
    else:
        raise NotImplementedError

    image_size = image_size if cfgs is None else cfgs['heatmapModel']['input_size']
    width, height = image_size

    # Move predictions to numpy.
    if not isinstance(pred, np.ndarray):
        pred = pred.data.cpu().numpy()
    if (max_vals is not None) and (not isinstance(max_vals, np.ndarray)):
        max_vals = max_vals.data.cpu().numpy()

    # The coordinates need to be rescaled for the different cases: regressed
    # coordinates by the crop size, heatmap coordinates by the heatmap-to-input
    # down-sample ratio.
    if isinstance(output, tuple):
        pred *= np.array(image_size).reshape(1, 1, 2)
    else:
        pred *= image_size[0] / output.shape[3]

    # Inverse transform and compare pixel distance.
    centers, scales = meta_data['center'], meta_data['scale']
    # Some predictions are generated for unlabeled data; evaluate only the
    # labeled prefix.
    pred_used = pred[:len(centers)] if len(pred) != len(centers) else pred
    rots = meta_data['rotation'] if 'rotation' in meta_data else [0.] * len(centers)

    joints_original_batch = meta_data['original_joints']
    distance_list = []
    correct_cnt_sum = np.zeros(len(PCK_THRES))
    all_src_coordinates = []
    for sample_idx in range(len(pred_used)):
        trans_inv = lip.get_affine_transform(centers[sample_idx],
                                             scales[sample_idx],
                                             rots[sample_idx],
                                             (height, width),
                                             inv=1)
        joints_original = joints_original_batch[sample_idx]
        pred_src_coordinates = lip.affine_transform_modified(pred_used[sample_idx], trans_inv)
        all_src_coordinates.append(pred_src_coordinates.reshape(1, len(pred_src_coordinates), 2))
        distance_list += get_distance(joints_original, pred_src_coordinates)
        correct_cnt_sum += get_PCK(pred_src_coordinates, joints_original)

    cnt = len(distance_list)
    avg_acc = sum(distance_list) / cnt
    others = {
        'src_coord': np.concatenate(all_src_coordinates, axis=0),  # screen coordinates
        'joints_pred': pred,  # predicted local coordinates
        'max_vals': max_vals,
        'correct_cnt': correct_cnt_sum,
        'PCK_batch': correct_cnt_sum / cnt,
    }
    return avg_acc, cnt, others