# Imports below assume the Detectron-style project layout these functions
# originate from (core.config, utils.blob, utils.boxes, utils.keypoints).
import numpy as np
import numpy.random as npr

from core.config import cfg
import utils.blob as blob_utils
import utils.boxes as box_utils
import utils.keypoints as keypoint_utils


def add_keypoint_rcnn_blobs(
    blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx
):
    """Add Mask R-CNN keypoint specific blobs to the given blobs dictionary."""
    # Note: gt_inds must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    max_overlaps = roidb['max_overlaps']
    gt_keypoints = roidb['gt_keypoints']

    ind_kp = gt_inds[roidb['box_to_gt_ind_map']]
    within_box = _within_box(gt_keypoints[ind_kp, :, :], roidb['boxes'])
    vis_kp = gt_keypoints[ind_kp, 2, :] > 0
    is_visible = np.sum(np.logical_and(vis_kp, within_box), axis=1) > 0
    kp_fg_inds = np.where(
        np.logical_and(max_overlaps >= cfg.TRAIN.FG_THRESH, is_visible)
    )[0]

    kp_fg_rois_per_this_image = np.minimum(fg_rois_per_image, kp_fg_inds.size)
    if kp_fg_inds.size > kp_fg_rois_per_this_image:
        kp_fg_inds = np.random.choice(
            kp_fg_inds, size=kp_fg_rois_per_this_image, replace=False
        )

    sampled_fg_rois = roidb['boxes'][kp_fg_inds]
    box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds]

    num_keypoints = gt_keypoints.shape[2]
    sampled_keypoints = -np.ones(
        (len(sampled_fg_rois), gt_keypoints.shape[1], num_keypoints),
        dtype=gt_keypoints.dtype
    )
    for ii in range(len(sampled_fg_rois)):
        ind = box_to_gt_ind_map[ii]
        if ind >= 0:
            sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :]
            assert np.sum(sampled_keypoints[ii, 2, :]) > 0

    heats, weights = keypoint_utils.keypoints_to_heatmap_labels(
        sampled_keypoints, sampled_fg_rois
    )

    shape = (sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS, 1)
    heats = heats.reshape(shape)
    weights = weights.reshape(shape)

    sampled_fg_rois *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_fg_rois.shape[0], 1)
    )
    sampled_fg_rois = np.hstack((repeated_batch_idx, sampled_fg_rois))

    blobs['keypoint_rois'] = sampled_fg_rois
    blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False)
    blobs['keypoint_weights'] = weights
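
# `_within_box` is referenced above but not defined in this excerpt. Below is a
# minimal sketch of the expected Detectron-style semantics, assuming keypoints
# of shape (N, 3, K) with (x, y, vis) rows and boxes of shape (N, 4) as
# (x1, y1, x2, y2); it illustrates the intent, not necessarily the project's
# exact code.
def _within_box_sketch(points, boxes):
    """Return an (N, K) boolean mask of keypoints lying inside their box."""
    x_within = np.logical_and(
        points[:, 0, :] >= boxes[:, 0, np.newaxis],
        points[:, 0, :] <= boxes[:, 2, np.newaxis]
    )
    y_within = np.logical_and(
        points[:, 1, :] >= boxes[:, 1, np.newaxis],
        points[:, 1, :] <= boxes[:, 3, np.newaxis]
    )
    return np.logical_and(x_within, y_within)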
def forward(self, inputs, outputs):
    data = inputs[0].data
    keypoint_probs = inputs[1].data
    keypoint_rois = inputs[2].data

    # output indicator resolution
    M = self.resolution
    up_scale = self.up_scale
    num_rois = keypoint_rois.shape[0]
    num_keypoints = keypoint_probs.shape[1]

    # first expand the keypoint rois
    height, width = data.shape[2], data.shape[3]
    pad_rois = box_utils.expand_boxes(keypoint_rois[:, 1:5], up_scale)
    pad_rois = box_utils.clip_boxes_to_image(pad_rois, height, width)

    # get keypoint predictions and their probabilities
    # output shape is (#rois, 3, #keypoints), where 3 means (x, y, prob)
    pred_rois = keypoint_utils.probs_to_keypoints(keypoint_probs, keypoint_rois)
    # map keypoint positions onto the pad_rois
    # output shape is (#rois, #keypoints), with locations flattened out
    locations_on_pad_rois, _ = keypoint_utils.keypoints_to_heatmap_labels(
        pred_rois, pad_rois, M
    )
    locations_on_pad_rois = locations_on_pad_rois.astype(np.int32)

    # now generate the keypoint indicators
    keypoint_indicators = blob_utils.zeros((num_rois, num_keypoints, M**2))
    for i in range(num_rois):
        locations = locations_on_pad_rois[i]  # shape (#keypoints, )
        for k in range(num_keypoints):
            keypoint_indicators[i, k, locations[k]] = pred_rois[i, 2, k]
    # and reshape to 4 dimensions
    keypoint_indicators = keypoint_indicators.reshape(
        (num_rois, num_keypoints, M, M)
    )

    outputs[0].reshape(keypoint_indicators.shape)
    outputs[0].data[...] = keypoint_indicators
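
# `box_utils.expand_boxes` grows each roi about its own center before the
# indicators are rasterized. A minimal sketch of that behaviour, assuming
# (x1, y1, x2, y2) boxes; it mirrors the intent, not necessarily the exact
# library implementation.
def _expand_boxes_sketch(boxes, scale):
    """Scale each box about its center by `scale`, keeping (x1, y1, x2, y2) layout."""
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5 * scale
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5 * scale
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5
    boxes_exp = np.zeros_like(boxes)
    boxes_exp[:, 0] = x_c - w_half
    boxes_exp[:, 1] = y_c - h_half
    boxes_exp[:, 2] = x_c + w_half
    boxes_exp[:, 3] = y_c + h_half
    return boxes_exp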
def _sample_human_object(rois, rois_to_gt_ind, roidb, im_info):
    """Sample human rois and target_object rois.

    :param rois: rois corresponding to the feature map
    :param rois_to_gt_ind: mapping from each roi to its assigned gt box
    :param roidb: boxes corresponding to the original image
    :return: dict of sampled rois, labels, targets and weights
    """
    human_num_per_image = int(cfg.VCOCO.HUMAN_NUM_PER_IM)
    target_object_num_per_image = int(cfg.VCOCO.TARGET_OBJECT_NUM_PER_IM)
    kp_human_num_per_image = int(cfg.VCOCO.KP_HUMAN_NUM_PER_IM)

    # Keypoints: all gt humans, even those without an action, feed the
    # keypoint branch
    all_human_gt_inds = np.where(roidb['gt_classes'] == 1)[0]
    gt_keypoints = roidb['gt_keypoints']

    # gt human ids that have an action annotation
    # ToDo: name change
    human_with_action_gt_inds = np.where(roidb['gt_actions'][:, 0] >= 0)[0]
    gt_objects_num = roidb['gt_actions'].shape[0]
    # human_with_action_gt_inds = np.where(
    #     roidb['gt_classes'][:gt_objects_num] == 1)[0]

    # gt_boxes, used for calculating action target locations
    # roidb['boxes'] = gt_boxes + scaled rois (from the RPN module)
    gt_boxes = roidb['boxes'][:gt_objects_num, :]

    # -------------------------------------------------------------------------
    # Human-Centric Branch: sample human rois and calculate targets
    # -------------------------------------------------------------------------
    # get proposals (rois) assigned to gt humans with an action,
    # and the corresponding target_objects
    rois_human_with_action_inds = []
    rois_human_without_action_inds = []
    for human_gt_i in all_human_gt_inds:
        if human_gt_i in human_with_action_gt_inds:
            rois_human_with_action_inds.append(
                np.where(rois_to_gt_ind == human_gt_i)[0])
        else:
            rois_human_without_action_inds.append(
                np.where(rois_to_gt_ind == human_gt_i)[0])
    rois_human_with_action_inds = np.concatenate(rois_human_with_action_inds)

    # select at most cfg.VCOCO.HUMAN_NUM_PER_IM human rois
    human_num_this_image = min(human_num_per_image,
                               rois_human_with_action_inds.size)
    if rois_human_with_action_inds.size > 0:
        rois_human_with_action_inds = npr.choice(
            rois_human_with_action_inds,
            size=human_num_this_image, replace=False)

    if cfg.VCOCO.KEYPOINTS_ON:
        if len(rois_human_without_action_inds) > 0:
            rois_human_without_action_inds = np.concatenate(
                rois_human_without_action_inds)
            human_num_without_action = min(
                kp_human_num_per_image - rois_human_with_action_inds.size,
                rois_human_without_action_inds.size)
            rois_human_without_action_inds = npr.choice(
                rois_human_without_action_inds,
                size=human_num_without_action, replace=False)
            rois_kp_inds = np.concatenate(
                [rois_human_with_action_inds, rois_human_without_action_inds])
            kp_inds_of_sampled_rois = np.zeros(rois_kp_inds.size,
                                               dtype=np.int32)
            kp_inds_of_sampled_rois[:rois_human_with_action_inds.size] = 1
        else:
            rois_kp_inds = rois_human_with_action_inds
            kp_inds_of_sampled_rois = np.ones(
                rois_human_with_action_inds.size, dtype=np.int32)

        sampled_kp_rois = rois[rois_kp_inds]
        sampled_keypoints = gt_keypoints[rois_to_gt_ind[rois_kp_inds]]
        heats, kp_weights = keypoints_to_heatmap_labels(
            sampled_keypoints, sampled_kp_rois[:, 1:] / float(im_info[2]))
        shape = (sampled_kp_rois.shape[0] * gt_keypoints.shape[2], )
        heats = heats.reshape(shape)
        kp_weights = kp_weights.reshape(shape)

        min_count = cfg.KRCNN.MIN_KEYPOINT_COUNT_FOR_VALID_MINIBATCH
        num_visible_keypoints = np.sum(kp_weights)
        kp_norm = num_visible_keypoints / (
            cfg.TRAIN.IMS_PER_BATCH * cfg.TRAIN.BATCH_SIZE_PER_IM *
            cfg.TRAIN.FG_FRACTION * cfg.KRCNN.NUM_KEYPOINTS)

    # get human action targets (relative locations)
    human_rois = rois[rois_human_with_action_inds]
    human_action_labels = roidb['gt_actions'][
        rois_to_gt_ind[rois_human_with_action_inds]]
    human_action_labels[human_action_labels < 0] = 0
    rois_human_role_ids = roidb['gt_role_id'][
        rois_to_gt_ind[rois_human_with_action_inds]]

    # scale rois back to the original image size
    human_action_targets, action_target_weights = \
        _compute_action_targets(human_rois[:, 1:] / float(im_info[2]),
                                gt_boxes, rois_human_role_ids)

    # -------------------------------------------------------------------------
    # Interaction Branch: sample target_object rois and positive triplets
    # -------------------------------------------------------------------------
    # Select role objects
    #
    # get gt role object inds
    target_object_gt_inds = np.unique(rois_human_role_ids)
    target_object_gt_inds = target_object_gt_inds[np.where(
        target_object_gt_inds > -1)]
    # get rois assigned to gt role objects
    if target_object_gt_inds.size > 0:
        rois_target_object_inds = []
        for role_gt_i in target_object_gt_inds:
            rois_target_object_inds.append(
                np.where(rois_to_gt_ind == role_gt_i)[0])
        rois_target_object_inds = np.concatenate(rois_target_object_inds)
    else:
        # some actions don't have target_objects
        rois_target_object_inds = np.empty((0, ), dtype=np.int64)

    # select role object rois; currently no cap is applied
    # ToDo: cap at target_object_num_per_image, i.e.
    #       min(target_object_num_per_image, rois_target_object_inds.size)?
    target_object_num_this_image = rois_target_object_inds.size
    if rois_target_object_inds.size > 0:
        rois_target_object_inds = npr.choice(
            rois_target_object_inds,
            size=target_object_num_this_image, replace=False)
    target_object_rois = rois[rois_target_object_inds]

    # Sample positive triplets
    human_rois_inds, target_object_rois_inds, interaction_action_labels = \
        generate_positive_triplets(rois_human_with_action_inds,
                                   rois_target_object_inds,
                                   rois_to_gt_ind, roidb['gt_role_id'])
    interaction_batch_idx = np.full_like(human_rois_inds, rois[0, 0],
                                         dtype=np.int32)

    sampled_rois = np.vstack((human_rois, target_object_rois))
    human_inds_of_sampled_rois = np.zeros(sampled_rois.shape[0],
                                          dtype=np.int32)
    human_inds_of_sampled_rois[:human_rois.shape[0]] = 1

    return_dict = dict(
        boxes=sampled_rois,
        human_inds_of_sampled_boxes=human_inds_of_sampled_rois,
        human_action_labels=human_action_labels,
        human_action_targets=human_action_targets,
        action_target_weights=action_target_weights,
        interaction_human_inds=human_rois_inds,
        interaction_target_object_inds=target_object_rois_inds,
        interaction_action_labels=interaction_action_labels,
        interaction_batch_idx=interaction_batch_idx)
    if cfg.VCOCO.KEYPOINTS_ON:
        return_dict.update(
            keypoint_rois=sampled_kp_rois,
            keypoint_locations_int32=heats.astype(np.int32, copy=False),
            keypoint_weights=kp_weights,
            keypoint_loss_normalizer=np.array([kp_norm], dtype=np.float32))
    return return_dict
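
# `generate_positive_triplets` is repo-specific and not shown in this excerpt.
# Below is one plausible, hypothetical sketch of what it has to return: index
# pairs into the sampled human/object rois plus a per-pair multi-label action
# vector. It assumes gt_role_id has shape (num_gt, num_actions) holding the gt
# index of each action's role object (or -1 when the action has no target);
# the actual implementation and shapes in the repo may differ.
def _generate_positive_triplets_sketch(human_roi_inds, object_roi_inds,
                                       rois_to_gt_ind, gt_role_id):
    human_inds, object_inds, labels = [], [], []
    for hi, h_roi in enumerate(human_roi_inds):
        # role targets of the human's gt actions, shape (num_actions,)
        h_roles = gt_role_id[rois_to_gt_ind[h_roi]]
        for oi, o_roi in enumerate(object_roi_inds):
            o_gt = rois_to_gt_ind[o_roi]
            # the pair is positive for every action whose role object is o_gt
            action_label = (h_roles == o_gt).astype(np.int32)
            if action_label.any():
                human_inds.append(hi)
                object_inds.append(oi)
                labels.append(action_label)
    if len(labels) == 0:
        return (np.zeros(0, dtype=np.int32), np.zeros(0, dtype=np.int32),
                np.zeros((0, gt_role_id.shape[1]), dtype=np.int32))
    return (np.array(human_inds, dtype=np.int32),
            np.array(object_inds, dtype=np.int32),
            np.stack(labels))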
def add_keypoint_rcnn_blobs_softmax(
    blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx
):
    """Add Mask R-CNN keypoint specific blobs to the given blobs dictionary."""
    # Note: gt_inds must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    max_overlaps = roidb['max_overlaps']
    gt_keypoints = roidb['gt_keypoints']

    ind_kp = gt_inds[roidb['box_to_gt_ind_map']]
    within_box = _within_box(gt_keypoints[ind_kp, :, :], roidb['boxes'])
    vis_kp = gt_keypoints[ind_kp, 2, :] > 0
    is_visible = np.sum(np.logical_and(vis_kp, within_box), axis=1) > 0
    kp_fg_inds = np.where(
        np.logical_and(max_overlaps >= cfg.TRAIN.FG_THRESH, is_visible)
    )[0]

    kp_fg_rois_per_this_image = np.minimum(fg_rois_per_image, kp_fg_inds.size)
    if kp_fg_inds.size > kp_fg_rois_per_this_image:
        kp_fg_inds = np.random.choice(
            kp_fg_inds, size=kp_fg_rois_per_this_image, replace=False
        )

    if kp_fg_inds.shape[0] > 0:
        sampled_fg_rois = roidb['boxes'][kp_fg_inds]
        box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds]

        num_keypoints = gt_keypoints.shape[2]
        sampled_keypoints = -np.ones(
            (len(sampled_fg_rois), gt_keypoints.shape[1], num_keypoints),
            dtype=gt_keypoints.dtype
        )
        for ii in range(len(sampled_fg_rois)):
            ind = box_to_gt_ind_map[ii]
            if ind >= 0:
                sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :]
                assert np.sum(sampled_keypoints[ii, 2, :]) > 0

        heats, weights = keypoint_utils.keypoints_to_heatmap_labels(
            sampled_keypoints, sampled_fg_rois, M=cfg.KRCNN.HEATMAP_SIZE
        )

        shape = (sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS, 1)
        heats = heats.reshape(shape)
        weights = weights.reshape(shape)
    else:
        # If there are no fg keypoint rois (it does happen), the network cannot
        # handle empty blobs, so we must still provide a heatmap. We simply
        # take the first bg roi, give it an all-zero heatmap, and set its
        # weights to zero (ignore label).
        roi_inds = np.where(roidb['gt_classes'] == 0)[0]
        # sampled_fg_rois is actually one random bg roi, but that's ok because
        # its loss weight is zero
        sampled_fg_rois = roidb['boxes'][roi_inds[0]].reshape((1, -1))
        # all-zeros heatmap labels
        heats = blob_utils.zeros((1 * cfg.KRCNN.NUM_KEYPOINTS, 1))
        # weights set to 0 (ignore label)
        weights = blob_utils.zeros((1 * cfg.KRCNN.NUM_KEYPOINTS, 1))

    sampled_fg_rois *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_fg_rois.shape[0], 1)
    )
    sampled_fg_rois = np.hstack((repeated_batch_idx, sampled_fg_rois))

    blobs['keypoint_rois'] = sampled_fg_rois
    blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False)
    blobs['keypoint_weights'] = weights
    # Since this function may randomly sample a subset of boxes as the rois,
    # the refined_keypoint_rois must use the same subset, so the sampled inds
    # are passed out as well.
    blobs['keypoint_fg_inds'] = kp_fg_inds.astype(np.int32, copy=False)
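
# `keypoint_utils.keypoints_to_heatmap_labels` converts (x, y, vis) keypoints
# into discrete heatmap targets for a softmax loss. A minimal sketch of the
# core quantization, assuming (N, 3, K) keypoints, (N, 4) rois and an M x M
# heatmap; the real helper also handles roi-boundary cases, so treat this as
# an illustration rather than the project's exact implementation.
def _keypoints_to_heatmap_labels_sketch(keypoints, rois, M=56):
    N, _, K = keypoints.shape
    heatmaps = np.zeros((N, K))
    weights = np.zeros((N, K))
    offset_x, offset_y = rois[:, 0], rois[:, 1]
    scale_x = M / (rois[:, 2] - rois[:, 0])
    scale_y = M / (rois[:, 3] - rois[:, 1])
    for kp in range(K):
        vis = keypoints[:, 2, kp] > 0
        # quantize the keypoint into the M x M grid spanned by its roi
        x = np.floor((keypoints[:, 0, kp] - offset_x) * scale_x)
        y = np.floor((keypoints[:, 1, kp] - offset_y) * scale_y)
        valid = vis & (x >= 0) & (y >= 0) & (x < M) & (y < M)
        # the target is the flattened bin index; invalid keypoints get weight 0
        heatmaps[:, kp] = (y * M + x) * valid
        weights[:, kp] = valid
    return heatmaps, weights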
def add_refine_keypoints_blobs_softmax(blobs, roidb, fg_rois_per_image,
                                       fg_inds, im_scale, batch_idx, data):
    """Add refined keypoint (RefineNet branch) blobs to the given blobs dictionary."""
    # Note: gt_inds must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    gt_keypoints = roidb['gt_keypoints']

    # Reuse the kp_fg_inds generated by keypoint_rcnn.py so that keypoint_rois
    # and refined_keypoint_rois are sampled from the same subset; a mismatch
    # between the two causes serious problems during training.
    kp_fg_inds = blobs['keypoint_fg_inds']
    if kp_fg_inds.shape[0] > 0:
        sampled_fg_rois = roidb['boxes'][kp_fg_inds]
        box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds]

        # expand the rois
        up_scale = cfg.REFINENET.UP_SCALE
        inp_h, inp_w = data.shape[2], data.shape[3]
        pad_img_h, pad_img_w = inp_h / im_scale, inp_w / im_scale
        pad_fg_rois = box_utils.expand_boxes(sampled_fg_rois, up_scale)
        pad_fg_rois = box_utils.clip_boxes_to_image(
            pad_fg_rois, pad_img_h, pad_img_w)

        num_keypoints = gt_keypoints.shape[2]
        sampled_keypoints = -np.ones(
            (len(pad_fg_rois), gt_keypoints.shape[1], num_keypoints),
            dtype=gt_keypoints.dtype)
        for ii in range(len(pad_fg_rois)):
            ind = box_to_gt_ind_map[ii]
            if ind >= 0:
                sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :]
                assert np.sum(sampled_keypoints[ii, 2, :]) > 0

        heats, weights = keypoint_utils.keypoints_to_heatmap_labels(
            sampled_keypoints, pad_fg_rois, M=cfg.REFINENET.KRCNN.HEATMAP_SIZE)

        shape = (pad_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS, 1)
        heats = heats.reshape(shape)
        weights = weights.reshape(shape)
    else:
        # If there are no fg keypoint rois (it does happen), the network cannot
        # handle empty blobs, so we must still provide a heatmap. We simply
        # take the first bg roi, give it an all-zero heatmap, and set its
        # weights to zero (ignore label).
        roi_inds = np.where(roidb['gt_classes'] == 0)[0]
        # pad_fg_rois is actually one random bg roi, but that's ok because its
        # loss weight is zero
        pad_fg_rois = roidb['boxes'][roi_inds[0]].reshape((1, -1))
        # all-zeros heatmap labels
        heats = blob_utils.zeros((1 * cfg.KRCNN.NUM_KEYPOINTS, 1))
        # weights set to 0 (ignore label)
        weights = blob_utils.zeros((1 * cfg.KRCNN.NUM_KEYPOINTS, 1))

    pad_fg_rois *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((pad_fg_rois.shape[0], 1))
    pad_fg_rois = np.hstack((repeated_batch_idx, pad_fg_rois))

    blobs['refined_keypoint_rois'] = pad_fg_rois
    blobs['refined_keypoint_locations_int32'] = heats.astype(
        np.int32, copy=False)
    blobs['refined_keypoint_weights'] = weights
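
# `box_utils.clip_boxes_to_image` keeps the expanded rois inside the (unscaled)
# image. A minimal sketch of the expected behaviour, clamping (x1, y1, x2, y2)
# coordinates into [0, width - 1] x [0, height - 1]; shown as an illustration,
# not necessarily the exact library code.
def _clip_boxes_to_image_sketch(boxes, height, width):
    boxes = boxes.copy()
    boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, width - 1)
    boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, height - 1)
    return boxes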
def add_keypoint_rcnn_blobs(
    blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx
):
    """Add keypoint R-CNN blobs for tube proposals: each roi holds time_dim
    boxes (4 * time_dim coordinates) and gt_keypoints holds the keypoints of
    all frames concatenated along the last axis.
    """
    # Note: gt_inds must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    max_overlaps = roidb['max_overlaps']
    gt_keypoints = roidb['gt_keypoints']

    ind_kp = gt_inds[roidb['box_to_gt_ind_map']]
    within_box = _within_box(gt_keypoints[ind_kp, :, :], roidb['boxes'])
    vis_kp = gt_keypoints[ind_kp, 2, :] > 0
    is_visible = np.sum(np.logical_and(vis_kp, within_box), axis=1) > 0
    kp_fg_inds = np.where(
        np.logical_and(max_overlaps >= cfg.TRAIN.FG_THRESH, is_visible))[0]

    kp_fg_rois_per_this_image = np.minimum(fg_rois_per_image, kp_fg_inds.size)
    if kp_fg_inds.size > kp_fg_rois_per_this_image:
        kp_fg_inds = np.random.choice(
            kp_fg_inds, size=kp_fg_rois_per_this_image, replace=False)
    if kp_fg_inds.shape[0] == 0:
        # fall back to the gt boxes if no fg keypoint roi was sampled
        kp_fg_inds = gt_inds

    sampled_fg_rois = roidb['boxes'][kp_fg_inds]
    box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds]

    num_keypoints = gt_keypoints.shape[-1]
    sampled_keypoints = -np.ones(
        (len(sampled_fg_rois), gt_keypoints.shape[1], num_keypoints),
        dtype=gt_keypoints.dtype)
    for ii in range(len(sampled_fg_rois)):
        ind = box_to_gt_ind_map[ii]
        if ind >= 0:
            sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :]
            # assert np.sum(sampled_keypoints[ii, 2, :]) > 0

    # compute heatmap labels per frame, then concatenate over time
    all_heats = []
    all_weights = []
    time_dim = sampled_fg_rois.shape[-1] // 4
    per_frame_nkps = num_keypoints // time_dim
    for t in range(time_dim):
        heats, weights = keypoint_utils.keypoints_to_heatmap_labels(
            sampled_keypoints[
                ..., t * per_frame_nkps:(t + 1) * per_frame_nkps],
            sampled_fg_rois[..., t * 4:(t + 1) * 4])
        all_heats.append(heats)
        all_weights.append(weights)
    heats = np.concatenate(all_heats, axis=-1)
    weights = np.concatenate(all_weights, axis=-1)

    shape = (sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS * time_dim, 1)
    heats = heats.reshape(shape)
    weights = weights.reshape(shape)

    sampled_fg_rois *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_fg_rois.shape[0], 1))
    sampled_fg_rois = np.hstack((repeated_batch_idx, sampled_fg_rois))

    blobs['keypoint_rois'] = sampled_fg_rois
    blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False)
    blobs['keypoint_weights'] = weights
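
# Small worked example of the per-frame slicing above, with hypothetical
# shapes (2 sampled tubes spanning 3 frames, 17 keypoints per frame); the
# numbers are illustrative only.
def _tube_slicing_demo():
    demo_rois = np.zeros((2, 3 * 4))       # 12 coords = one box per frame
    demo_kps = np.zeros((2, 3, 3 * 17))    # (x, y, vis) rows, 51 kp channels
    t_dim = demo_rois.shape[-1] // 4        # -> 3 frames
    nkps = demo_kps.shape[-1] // t_dim      # -> 17 keypoints per frame
    for t in range(t_dim):
        frame_rois = demo_rois[..., t * 4:(t + 1) * 4]        # (2, 4)
        frame_kps = demo_kps[..., t * nkps:(t + 1) * nkps]    # (2, 3, 17)
        assert frame_rois.shape == (2, 4)
        assert frame_kps.shape == (2, 3, 17)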