def test_area(self):
    """Verify that _area returns the bounding area of confident keypoints."""
    tracked_person = Person([
        KeyPoint(BodyPart(0), Point(0.1, 0.2), 1),
        KeyPoint(BodyPart(1), Point(0.3, 0.4), 0.9),
        KeyPoint(BodyPart(2), Point(0.4, 0.6), 0.9),
        KeyPoint(BodyPart(3), Point(0.7, 0.8), 0.1)
    ], Rectangle(Point(0, 0), Point(0, 0)), 1)
    track = Track(tracked_person, 1000000)

    area = self.kpt_tracker._area(track)

    # The last keypoint has a low score (0.1), so only the first three
    # keypoints should contribute to the bounding area:
    # x range [0.1, 0.4], y range [0.2, 0.6].
    expected_area = (0.4 - 0.1) * (0.6 - 0.2)
    self.assertAlmostEqual(area, expected_area, 6)
def test_oks(self):
    """Verify the object keypoint similarity (OKS) computation."""
    person = Person([
        KeyPoint(BodyPart(0), Point(0.2, 0.2), 1),
        KeyPoint(BodyPart(1), Point(0.4, 0.4), 0.8),
        KeyPoint(BodyPart(2), Point(0.6, 0.6), 0.1),
        KeyPoint(BodyPart(3), Point(0.8, 0.7), 0.8)
    ], Rectangle(Point(0, 0), Point(0, 0)), 1)
    track = Track(
        Person([
            KeyPoint(BodyPart(0), Point(0.2, 0.2), 1),
            KeyPoint(BodyPart(1), Point(0.4, 0.4), 0.8),
            KeyPoint(BodyPart(2), Point(0.6, 0.6), 0.9),
            KeyPoint(BodyPart(3), Point(0.8, 0.8), 0.8)
        ], Rectangle(Point(0, 0), Point(0, 0)), 1), 1000000)

    oks = self.kpt_tracker._object_keypoint_similarity(person, track)

    # Keypoints 0 and 1 match exactly (contributing 1 each); keypoint 2 is
    # excluded by its low score; keypoint 3 differs by 0.1 in y and
    # contributes the Gaussian falloff term.
    box_area = (0.8 - 0.2) * (0.8 - 0.2)
    falloff = 2 * self.tracker_config.keypoint_tracker_params.keypoint_falloff[3]
    offset = 0.1
    expected_oks = (
        1 + 1 + math.exp(-1 * (offset**2) / (2 * box_area * (falloff**2)))) / 3
    self.assertAlmostEqual(oks, expected_oks, 6)
def _detect_and_assert(self, detector, image, keypoints_truth):
    """Run pose estimation and assert if the result is close to ground truth."""
    keypoints = detector.detect(image).keypoints
    for idx, body_part in enumerate(BodyPart):
        # L-infinity norm: the largest per-axis deviation from ground truth.
        distance = np.linalg.norm(
            keypoints[idx].coordinate - keypoints_truth[idx], np.inf)
        self.assertGreaterEqual(
            _ALLOWED_DISTANCE, distance,
            '{0} is too far away ({1}) from ground truth data.'.format(
                body_part.name, int(distance)))
        logging.debug('Detected %s close to expected result (%d)',
                      body_part.name, int(distance))
def _assert(self, keypoints: List[KeyPoint],
            keypoints_truth: np.ndarray) -> None:
    """Assert if the detection result is close to ground truth.

    Args:
      keypoints: List Keypoint detected by from the Movenet Multipose model.
      keypoints_truth: Ground truth keypoints.
    """
    for idx, body_part in enumerate(BodyPart):
        estimate = np.array(
            [keypoints[idx].coordinate.x, keypoints[idx].coordinate.y])
        # L-infinity norm: the largest per-axis deviation from ground truth.
        distance = np.linalg.norm(estimate - keypoints_truth[idx], np.inf)
        self.assertGreaterEqual(
            _ALLOWED_DISTANCE, distance,
            '{0} is too far away ({1}) from ground truth data.'.format(
                body_part.name, int(distance)))
        logging.debug('Detected %s close to expected result (%d)',
                      body_part.name, int(distance))
def _determine_torso_and_body_range(self, keypoints: np.ndarray,
                                    target_keypoints: Dict[BodyPart,
                                                           List[float]],
                                    center_y: float,
                                    center_x: float) -> List[float]:
    """Calculates the maximum distance from each keypoints to the center.

    The function returns the maximum distances from the two sets of
    keypoints: full 17 keypoints and 4 torso keypoints. The returned
    information will be used to determine the crop size. See
    determine_crop_region for more details.

    Args:
      keypoints: Detection result of Movenet model.
      target_keypoints: The 4 torso keypoints.
      center_y (float): Vertical coordinate of the body center.
      center_x (float): Horizontal coordinate of the body center.

    Returns:
      The maximum distance from each keypoints to the center location.
    """
    torso_joints = [
        BodyPart.LEFT_SHOULDER, BodyPart.RIGHT_SHOULDER, BodyPart.LEFT_HIP,
        BodyPart.RIGHT_HIP
    ]
    max_torso_yrange = 0.0
    max_torso_xrange = 0.0
    for joint in torso_joints:
        dist_y = abs(center_y - target_keypoints[joint][0])
        dist_x = abs(center_x - target_keypoints[joint][1])
        if dist_y > max_torso_yrange:
            max_torso_yrange = dist_y
        if dist_x > max_torso_xrange:
            max_torso_xrange = dist_x

    max_body_yrange = 0.0
    max_body_xrange = 0.0
    for idx in range(len(BodyPart)):
        if keypoints[BodyPart(idx).value, 2] < Movenet._MIN_CROP_KEYPOINT_SCORE:
            continue
        # Bug fix: measure the distance of the CURRENT keypoint
        # (BodyPart(idx)), not the stale `joint` variable left over from the
        # torso loop above (which always pointed at the last torso joint and
        # made the body range a constant).
        dist_y = abs(center_y - target_keypoints[BodyPart(idx)][0])
        dist_x = abs(center_x - target_keypoints[BodyPart(idx)][1])
        if dist_y > max_body_yrange:
            max_body_yrange = dist_y
        if dist_x > max_body_xrange:
            max_body_xrange = dist_x

    return [
        max_torso_yrange, max_torso_xrange, max_body_yrange, max_body_xrange
    ]
def _detect_and_assert(self, detector: Movenet, image: np.ndarray,
                       keypoints_truth: np.ndarray) -> None:
    """Run pose estimation and assert if the result is close to ground truth.

    Args:
      detector: A Movenet pose estimator.
      image: A [height, width, 3] RGB image.
      keypoints_truth: Ground truth keypoint coordinates to be compared to.
    """
    keypoints = detector.detect(image, reset_crop_region=True).keypoints
    for idx, body_part in enumerate(BodyPart):
        estimate = np.array(
            [keypoints[idx].coordinate.x, keypoints[idx].coordinate.y])
        # L-infinity norm: the largest per-axis deviation from ground truth.
        distance = np.linalg.norm(estimate - keypoints_truth[idx], np.inf)
        self.assertGreaterEqual(
            _ALLOWED_DISTANCE, distance,
            '{0} is too far away ({1}) from ground truth data.'.format(
                body_part.name, int(distance)))
        logging.debug('Detected %s close to expected result (%d)',
                      body_part.name, int(distance))
def test_oks_returns_zero(self):
    """Compute OKS returns 0.0 with less than 2 valid keypoints."""
    # Only keypoints confident in BOTH person and track count as valid;
    # here only keypoints 0 and 3 vs 0 overlap sufficiently on one side each.
    detected = Person([
        KeyPoint(BodyPart(0), Point(0.2, 0.2), 1),
        KeyPoint(BodyPart(1), Point(0.4, 0.4), 0.1),
        KeyPoint(BodyPart(2), Point(0.6, 0.6), 0.9),
        KeyPoint(BodyPart(3), Point(0.8, 0.8), 0.8)
    ], Rectangle(Point(0, 0), Point(0, 0)), 1)
    existing_track = Track(
        Person([
            KeyPoint(BodyPart(0), Point(0.2, 0.2), 1),
            KeyPoint(BodyPart(1), Point(0.4, 0.4), 0.8),
            KeyPoint(BodyPart(2), Point(0.6, 0.6), 0.1),
            KeyPoint(BodyPart(3), Point(0.8, 0.8), 0.1)
        ], Rectangle(Point(0, 0), Point(0, 0)), 1), 1000000)

    similarity = self.kpt_tracker._object_keypoint_similarity(
        detected, existing_track)
    self.assertAlmostEqual(similarity, 0.0, 6)
def test_keypoint_tracker(self):
    """Test Keypoint tracker.

    Runs the tracker through five successive frames and verifies, after each
    apply() call, which persons are linked to existing tracks, which spawn
    new tracks, and how stale tracks are aged out or evicted.
    """
    # Timestamp: 0. Person becomes the only track
    persons = [
        Person([
            KeyPoint(BodyPart(0), Point(0.2, 0.2), 1),
            KeyPoint(BodyPart(1), Point(0.4, 0.4), 0.8),
            KeyPoint(BodyPart(2), Point(0.6, 0.6), 0.9),
            KeyPoint(BodyPart(3), Point(0.8, 0.8), 0.0)
        ], Rectangle(Point(0, 0), Point(0, 0)), 1)
    ]
    persons = self.kpt_tracker.apply(persons, 0)
    tracks = self.kpt_tracker._tracks
    self.assertEqual(len(persons), 1)
    self.assertEqual(persons[0].id, 1)
    self.assertEqual(len(tracks), 1)
    self.assertEqual(tracks[0].person.id, 1)
    self.assertEqual(tracks[0].last_timestamp, 0)

    # Timestamp: 100000. First person is linked with track 1. Second person
    # spawns a new track (id = 2).
    persons = [
        Person([
            KeyPoint(BodyPart(0), Point(0.2, 0.2), 1),
            KeyPoint(BodyPart(1), Point(0.4, 0.4), 0.8),
            KeyPoint(BodyPart(2), Point(0.6, 0.6), 0.9),
            KeyPoint(BodyPart(3), Point(0.8, 0.8), 0.8)
        ], Rectangle(Point(0, 0), Point(0, 0)), 1),
        Person(
            [
                KeyPoint(BodyPart(0), Point(0.8, 0.8), 0.8),
                KeyPoint(BodyPart(1), Point(0.6, 0.6), 0.3),
                KeyPoint(BodyPart(2), Point(0.4, 0.4), 0.1),  # Low confidence.
                KeyPoint(BodyPart(3), Point(0.2, 0.2), 0.8)
            ], Rectangle(Point(0, 0), Point(0, 0)), 1)
    ]
    persons = self.kpt_tracker.apply(persons, 100000)
    tracks = self.kpt_tracker._tracks
    self.assertEqual(len(persons), 2)
    self.assertEqual(persons[0].id, 1)
    self.assertEqual(persons[1].id, 2)
    self.assertEqual(len(tracks), 2)
    self.assertEqual(tracks[0].person.id, 1)
    self.assertEqual(tracks[0].last_timestamp, 100000)
    self.assertEqual(tracks[1].person.id, 2)
    self.assertEqual(tracks[1].last_timestamp, 100000)

    # Timestamp: 900000. First person is linked with track 2. Second person
    # spawns a new track (id = 3).
    persons = [
        # Links with id = 2.
        Person(
            [
                KeyPoint(BodyPart(0), Point(0.6, 0.7), 0.7),
                KeyPoint(BodyPart(1), Point(0.5, 0.6), 0.7),
                KeyPoint(BodyPart(2), Point(0.0, 0.0), 0.1),  # Low confidence.
                KeyPoint(BodyPart(3), Point(0.2, 0.1), 1.0)
            ], Rectangle(Point(0, 0), Point(0, 0)), 1),
        # Becomes id = 3.
        Person(
            [
                KeyPoint(BodyPart(0), Point(0.5, 0.1), 0.6),
                KeyPoint(BodyPart(1), Point(0.9, 0.3), 0.6),
                KeyPoint(BodyPart(2), Point(0.1, 0.1), 0.9),
                KeyPoint(BodyPart(3), Point(0.4, 0.4), 0.1)
            ],  # Low confidence.
            Rectangle(Point(0, 0), Point(0, 0)), 1)
    ]
    persons = self.kpt_tracker.apply(persons, 900000)
    tracks = self.kpt_tracker._tracks
    self.assertEqual(len(persons), 2)
    self.assertEqual(persons[0].id, 2)
    self.assertEqual(persons[1].id, 3)
    self.assertEqual(len(tracks), 3)
    self.assertEqual(tracks[0].person.id, 2)
    self.assertEqual(tracks[0].last_timestamp, 900000)
    self.assertEqual(tracks[1].person.id, 3)
    self.assertEqual(tracks[1].last_timestamp, 900000)
    self.assertEqual(tracks[2].person.id, 1)
    self.assertEqual(tracks[2].last_timestamp, 100000)

    # Timestamp: 1200000. First person spawns a new track (id = 4), even though
    # it has the same keypoints as track 1. This is because the age exceeds
    # 1000 msec. The second person links with id 2. The third person spawns a
    # new track (id = 5).
    persons = [
        # Becomes id = 4.
        Person([
            KeyPoint(BodyPart(0), Point(0.2, 0.2), 1.0),
            KeyPoint(BodyPart(1), Point(0.4, 0.4), 0.8),
            KeyPoint(BodyPart(2), Point(0.6, 0.6), 0.9),
            KeyPoint(BodyPart(3), Point(0.8, 0.8), 0.8)
        ], Rectangle(Point(0, 0), Point(0, 0)), 1),
        # Links with id = 2.
        Person(
            [
                KeyPoint(BodyPart(0), Point(0.55, 0.7), 0.7),
                KeyPoint(BodyPart(1), Point(0.5, 0.6), 0.9),
                KeyPoint(BodyPart(2), Point(1.0, 1.0), 0.1),  # Low confidence.
                KeyPoint(BodyPart(3), Point(0.8, 0.1), 0.0)
            ],  # Low confidence.
            Rectangle(Point(0, 0), Point(0, 0)), 1),
        # Becomes id = 5.
        Person(
            [
                KeyPoint(BodyPart(0), Point(0.1, 0.1), 0.1),  # Low confidence.
                KeyPoint(BodyPart(1), Point(0.2, 0.2), 0.9),
                KeyPoint(BodyPart(2), Point(0.3, 0.3), 0.7),
                KeyPoint(BodyPart(3), Point(0.4, 0.4), 0.8)
            ], Rectangle(Point(0, 0), Point(0, 0)), 1)
    ]
    persons = self.kpt_tracker.apply(persons, 1200000)
    tracks = self.kpt_tracker._tracks
    self.assertEqual(len(persons), 3)
    self.assertEqual(persons[0].id, 4)
    self.assertEqual(persons[1].id, 2)
    # NOTE(review): persons[2].id (expected 5) is not asserted here — worth
    # confirming whether that omission is intentional.
    self.assertEqual(len(tracks), 4)
    self.assertEqual(tracks[0].person.id, 2)
    self.assertEqual(tracks[0].last_timestamp, 1200000)
    self.assertEqual(tracks[1].person.id, 4)
    self.assertEqual(tracks[1].last_timestamp, 1200000)
    self.assertEqual(tracks[2].person.id, 5)
    self.assertEqual(tracks[2].last_timestamp, 1200000)
    self.assertEqual(tracks[3].person.id, 3)
    self.assertEqual(tracks[3].last_timestamp, 900000)

    # Timestamp: 1300000. First person spawns a new track (id = 6). Since
    # max_tracks is 4, the oldest track (id = 3) is removed.
    persons = [
        # Becomes id = 6.
        Person([
            KeyPoint(BodyPart(0), Point(0.1, 0.8), 1.0),
            KeyPoint(BodyPart(1), Point(0.2, 0.9), 0.6),
            KeyPoint(BodyPart(2), Point(0.2, 0.9), 0.5),
            KeyPoint(BodyPart(3), Point(0.8, 0.2), 0.4)
        ], Rectangle(Point(0, 0), Point(0, 0)), 1)
    ]
    persons = self.kpt_tracker.apply(persons, 1300000)
    tracks = self.kpt_tracker._tracks
    self.assertEqual(len(persons), 1)
    self.assertEqual(persons[0].id, 6)
    self.assertEqual(len(tracks), 4)
    self.assertEqual(tracks[0].person.id, 6)
    self.assertEqual(tracks[0].last_timestamp, 1300000)
    self.assertEqual(tracks[1].person.id, 2)
    self.assertEqual(tracks[1].last_timestamp, 1200000)
    self.assertEqual(tracks[2].person.id, 4)
    self.assertEqual(tracks[2].last_timestamp, 1200000)
    self.assertEqual(tracks[3].person.id, 5)
    self.assertEqual(tracks[3].last_timestamp, 1200000)
def _determine_crop_region(self, keypoints: np.ndarray, image_height: int,
                           image_width: int) -> Dict[(str, float)]:
    """Determines the region to crop the image for the model to run inference on.

    The algorithm uses the detected joints from the previous frame to
    estimate the square region that encloses the full body of the target
    person and centers at the midpoint of two hip joints. The crop size is
    determined by the distances between each joints and the center point.
    When the model is not confident with the four torso joint predictions,
    the function returns a default crop which is the full image padded to
    square.

    Args:
      keypoints: Detection result of Movenet model.
      image_height (int): The input image height.
      image_width (int): The input image width.

    Returns:
      crop_region (dict): The crop region to run inference on.
    """
    # Convert keypoint index to human-readable names. Keypoint rows are
    # (y, x) in normalized coordinates; scale them to pixel coordinates.
    target_keypoints = {}
    for idx in range(len(BodyPart)):
        target_keypoints[BodyPart(idx)] = [
            keypoints[idx, 0] * image_height, keypoints[idx, 1] * image_width
        ]

    # Calculate crop region if the torso is visible.
    if self._torso_visible(keypoints):
        # Center the crop at the midpoint of the two hips.
        center_y = (target_keypoints[BodyPart.LEFT_HIP][0] +
                    target_keypoints[BodyPart.RIGHT_HIP][0]) / 2
        center_x = (target_keypoints[BodyPart.LEFT_HIP][1] +
                    target_keypoints[BodyPart.RIGHT_HIP][1]) / 2

        (max_torso_yrange, max_torso_xrange, max_body_yrange,
         max_body_xrange) = self._determine_torso_and_body_range(
             keypoints, target_keypoints, center_y, center_x)

        # Half the crop edge: the largest of the expanded torso/body ranges.
        crop_length_half = np.amax([
            max_torso_xrange * Movenet._TORSO_EXPANSION_RATIO,
            max_torso_yrange * Movenet._TORSO_EXPANSION_RATIO,
            max_body_yrange * Movenet._BODY_EXPANSION_RATIO,
            max_body_xrange * Movenet._BODY_EXPANSION_RATIO
        ])

        # Adjust crop length so that it is still within the image border
        distances_to_border = np.array([
            center_x, image_width - center_x, center_y, image_height - center_y
        ])
        crop_length_half = np.amin(
            [crop_length_half, np.amax(distances_to_border)])

        # If the body is large enough, there's no need to apply cropping logic.
        if crop_length_half > max(image_width, image_height) / 2:
            return self.init_crop_region(image_height, image_width)
        # Calculate the crop region that nicely covers the full body.
        else:
            crop_length = crop_length_half * 2
            # Top-left corner of the square crop, in pixel coordinates (y, x).
            crop_corner = [
                center_y - crop_length_half, center_x - crop_length_half
            ]
            # All values are normalized back to [0, 1] relative to the image.
            return {
                'y_min':
                    crop_corner[0] / image_height,
                'x_min':
                    crop_corner[1] / image_width,
                'y_max': (crop_corner[0] + crop_length) / image_height,
                'x_max': (crop_corner[1] + crop_length) / image_width,
                'height': (crop_corner[0] + crop_length) / image_height -
                          crop_corner[0] / image_height,
                'width': (crop_corner[1] + crop_length) / image_width -
                         crop_corner[1] / image_width
            }
    # Return the initial crop region if the torso isn't visible.
    else:
        return self.init_crop_region(image_height, image_width)
def _postprocess(self, keypoints_with_scores: np.ndarray, image_height: int,
                 image_width: int,
                 detection_threshold: float) -> List[Person]:
    """Returns a list "Person" corresponding to the input image.

    Note that coordinates are expressed in (x, y) format for drawing
    utilities.

    Args:
      keypoints_with_scores: Output of the MultiPose TFLite model.
      image_height: height of the image in pixels.
      image_width: width of the image in pixels.
      detection_threshold: minimum confidence score for an entity to be
        considered.

    Returns:
      A list of Person(keypoints, bounding_box, scores), each containing:
      * the coordinates of all keypoints of the detected entity;
      * the bounding boxes of the entity.
      * the confidence core of the entity.
    """
    _, num_instances, _ = keypoints_with_scores.shape
    list_persons = []
    for instance_idx in range(num_instances):
        # Skip a detected pose if its confidence score is below the threshold.
        # Offset 55 holds the per-instance score after 17 (y, x, score)
        # keypoint triplets (offsets 0-50) and the bounding box (51-54).
        person_score = keypoints_with_scores[0, instance_idx, 55]
        if person_score < detection_threshold:
            continue

        # Extract the keypoint coordinates and scores.
        kpts_y = keypoints_with_scores[0, instance_idx, range(0, 51, 3)]
        kpts_x = keypoints_with_scores[0, instance_idx, range(1, 51, 3)]
        scores = keypoints_with_scores[0, instance_idx, range(2, 51, 3)]

        # Build the keypoint list, converting normalized (y, x) to pixel (x, y).
        keypoints = [
            KeyPoint(
                BodyPart(i),
                Point(
                    int(kpts_x[i] * image_width),
                    int(kpts_y[i] * image_height)), scores[i])
            for i in range(scores.shape[0])
        ]

        # Calculate the bounding box from the normalized box coordinates.
        y_min = keypoints_with_scores[0, instance_idx, 51]
        x_min = keypoints_with_scores[0, instance_idx, 52]
        y_max = keypoints_with_scores[0, instance_idx, 53]
        x_max = keypoints_with_scores[0, instance_idx, 54]
        bounding_box = Rectangle(
            Point(int(x_min * image_width), int(y_min * image_height)),
            Point(int(x_max * image_width), int(y_max * image_height)))

        # Create a Person instance corresponding to the detected entity.
        list_persons.append(Person(keypoints, bounding_box, person_score))

    if self._tracker:
        list_persons = self._tracker.apply(list_persons, time.time() * 1000)

    return list_persons