def _update_track_embedding(self, track_instances: Instances) -> Instances:
    if len(track_instances) == 0:
        return track_instances
    dim = track_instances.query_pos.shape[1]
    out_embed = track_instances.output_embedding
    # the query is split into a positional half and a feature half.
    query_pos = track_instances.query_pos[:, :dim // 2]
    query_feat = track_instances.query_pos[:, dim // 2:]
    q = k = query_pos + out_embed
    tgt = out_embed

    # self-attention over the surviving tracks, with residual + LayerNorm.
    tgt2 = self.self_attn(q[:, None], k[:, None], value=tgt[:, None])[0][:, 0]
    tgt = tgt + self.dropout1(tgt2)
    tgt = self.norm1(tgt)

    # feed-forward block, with residual + LayerNorm.
    tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
    tgt = tgt + self.dropout2(tgt2)
    tgt = self.norm2(tgt)

    if self.update_query_pos:
        # optionally refresh the positional half of the query.
        query_pos2 = self.linear_pos2(
            self.dropout_pos1(self.activation(self.linear_pos1(tgt))))
        query_pos = query_pos + self.dropout_pos2(query_pos2)
        query_pos = self.norm_pos(query_pos)
        track_instances.query_pos[:, :dim // 2] = query_pos

    # always refresh the feature half of the query.
    query_feat2 = self.linear_feat2(
        self.dropout_feat1(self.activation(self.linear_feat1(tgt))))
    query_feat = query_feat + self.dropout_feat2(query_feat2)
    query_feat = self.norm_feat(query_feat)
    track_instances.query_pos[:, dim // 2:] = query_feat

    # next-frame reference points are the current predicted box centers.
    track_instances.ref_pts = inverse_sigmoid(
        track_instances.pred_boxes[:, :2].detach().clone())
    return track_instances
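
# A standalone sketch of the update pattern above: one self-attention block and
# one feed-forward block, each wrapped in a residual connection and LayerNorm,
# applied to the surviving track queries. The dimensions and module names here
# are illustrative, not the repository's actual configuration.
import torch
import torch.nn as nn

d_model = 256
self_attn = nn.MultiheadAttention(d_model, num_heads=8)
norm1, norm2 = nn.LayerNorm(d_model), nn.LayerNorm(d_model)
ffn = nn.Sequential(nn.Linear(d_model, 1024), nn.ReLU(), nn.Linear(1024, d_model))

out_embed = torch.randn(5, d_model)   # output embeddings of 5 surviving tracks
query_pos = torch.randn(5, d_model)   # their positional queries

q = k = (query_pos + out_embed)[:, None]  # (num_tracks, batch=1, d_model)
tgt = out_embed[:, None]
tgt = norm1(tgt + self_attn(q, k, value=tgt)[0])  # attention + residual
tgt = norm2(tgt + ffn(tgt))                       # FFN + residual
updated_embed = tgt[:, 0]                         # back to (num_tracks, d_model)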
def update(self, track_instances: Instances):
    # a confident detection resets the miss counter of its track.
    track_instances.disappear_time[track_instances.scores >= self.score_thresh] = 0
    for i in range(len(track_instances)):
        if track_instances.obj_idxes[i] == -1 and track_instances.scores[i] >= self.score_thresh:
            # a new, confident detection: assign it a fresh global id.
            # print("track {} has score {}, assign obj_id {}".format(i, track_instances.scores[i], self.max_obj_id))
            track_instances.obj_idxes[i] = self.max_obj_id
            self.max_obj_id += 1
        elif track_instances.obj_idxes[i] >= 0 and track_instances.scores[i] < self.filter_score_thresh:
            # an existing track fell below the keep threshold: count a miss.
            track_instances.disappear_time[i] += 1
            if track_instances.disappear_time[i] >= self.miss_tolerance:
                # Set the obj_id to -1.
                # Then this track will be removed by TrackEmbeddingLayer.
                track_instances.obj_idxes[i] = -1
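
# A self-contained sketch of the lifecycle rule above on plain tensors instead
# of Instances. ToyTracker and its threshold values are hypothetical; only the
# assign / miss-count / retire logic mirrors the method above.
import torch

class ToyTracker:
    def __init__(self, score_thresh=0.7, filter_score_thresh=0.6, miss_tolerance=5):
        self.score_thresh = score_thresh
        self.filter_score_thresh = filter_score_thresh
        self.miss_tolerance = miss_tolerance
        self.max_obj_id = 0

    def update(self, scores, obj_idxes, disappear_time):
        disappear_time[scores >= self.score_thresh] = 0
        for i in range(len(scores)):
            if obj_idxes[i] == -1 and scores[i] >= self.score_thresh:
                obj_idxes[i] = self.max_obj_id   # new confident detection: new id
                self.max_obj_id += 1
            elif obj_idxes[i] >= 0 and scores[i] < self.filter_score_thresh:
                disappear_time[i] += 1           # existing track lost this frame
                if disappear_time[i] >= self.miss_tolerance:
                    obj_idxes[i] = -1            # missed too long: retire the track

tracker = ToyTracker()
scores = torch.tensor([0.9, 0.3])
obj_idxes = torch.tensor([-1, -1])
disappear_time = torch.zeros(2, dtype=torch.long)
tracker.update(scores, obj_idxes, disappear_time)  # obj_idxes -> tensor([0, -1])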
def _add_fp_tracks(self, track_instances: Instances,
                   active_track_instances: Instances) -> Instances:
    inactive_instances = track_instances[track_instances.obj_idxes < 0]

    # add a false positive for each active track with probability self.fp_ratio.
    fp_prob = torch.ones_like(active_track_instances.scores) * self.fp_ratio
    selected_active_track_instances = active_track_instances[
        torch.bernoulli(fp_prob).bool()]

    if len(inactive_instances) > 0 and len(selected_active_track_instances) > 0:
        num_fp = len(selected_active_track_instances)
        if num_fp >= len(inactive_instances):
            fp_track_instances = inactive_instances
        else:
            inactive_boxes = Boxes(
                box_ops.box_cxcywh_to_xyxy(inactive_instances.pred_boxes))
            selected_active_boxes = Boxes(
                box_ops.box_cxcywh_to_xyxy(
                    selected_active_track_instances.pred_boxes))
            ious = pairwise_iou(inactive_boxes, selected_active_boxes)
            # select the fp with the largest IoU for each active track.
            fp_indexes = ious.max(dim=0).indices
            # remove duplicate fps.
            fp_indexes = torch.unique(fp_indexes)
            fp_track_instances = inactive_instances[fp_indexes]

        merged_track_instances = Instances.cat(
            [active_track_instances, fp_track_instances])
        return merged_track_instances

    return active_track_instances
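
# A hedged sketch of the false-positive augmentation above with plain tensors
# and torchvision's box_iou in place of the repo's Boxes / pairwise_iou
# helpers. fp_ratio and the box coordinates are made-up values.
import torch
from torchvision.ops import box_iou

fp_ratio = 0.3
active_boxes = torch.tensor([[10., 10., 50., 50.],
                             [60., 60., 90., 90.]])        # xyxy, active tracks
inactive_boxes = torch.tensor([[12., 12., 48., 48.],
                               [100., 100., 130., 130.]])  # xyxy, unmatched queries

# keep each active track as an fp anchor with probability fp_ratio
keep = torch.bernoulli(torch.full((len(active_boxes),), fp_ratio)).bool()
anchors = active_boxes[keep]

if len(anchors) > 0:
    ious = box_iou(inactive_boxes, anchors)          # (num_inactive, num_anchors)
    fp_idx = torch.unique(ious.max(dim=0).indices)   # best inactive box per anchor
    fp_boxes = inactive_boxes[fp_idx]                # these get appended as fps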
def forward(self, data) -> Instances:
    active_track_instances = self._select_active_tracks(data)
    active_track_instances = self._update_track_embedding(active_track_instances)
    init_track_instances: Instances = data['init_track_instances']
    merged_track_instances = Instances.cat(
        [init_track_instances, active_track_instances])
    return merged_track_instances
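
# Instances.cat (detectron2-style) concatenates every registered field of the
# two instance sets. A toy illustration with a plain dict standing in for the
# Instances container; the field names mirror the ones used in this repo.
import torch

init_tracks = {'query_pos': torch.zeros(300, 512),
               'obj_idxes': torch.full((300,), -1, dtype=torch.long)}
active_tracks = {'query_pos': torch.zeros(7, 512),
                 'obj_idxes': torch.arange(7)}
merged = {k: torch.cat([init_tracks[k], active_tracks[k]]) for k in init_tracks}
# merged['query_pos'].shape -> torch.Size([307, 512])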
def forward(self, track_instances: Instances, target_size) -> Instances:
    """Perform the computation.

    Parameters:
        track_instances: the raw track predictions of the model.
        target_size: the (height, width) of the image. For evaluation, this
            must be the original image size (before any data augmentation).
            For visualization, this should be the image size after data
            augmentation, but before padding.
    """
    out_logits = track_instances.pred_logits
    out_bbox = track_instances.pred_boxes

    prob = out_logits.sigmoid()
    # prob = out_logits[..., :1].sigmoid()
    scores, labels = prob.max(-1)

    # convert to [x0, y0, x1, y1] format
    boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
    # and from relative [0, 1] to absolute [0, height] coordinates
    img_h, img_w = target_size
    scale_fct = torch.Tensor([img_w, img_h, img_w, img_h]).to(boxes)
    boxes = boxes * scale_fct[None, :]

    track_instances.boxes = boxes
    track_instances.scores = scores
    track_instances.labels = labels
    track_instances.remove('pred_logits')
    track_instances.remove('pred_boxes')
    return track_instances
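
# For reference, box_cxcywh_to_xyxy as commonly defined in DETR-derived
# codebases, plus the rescaling step used above applied to a toy box.
import torch

def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(-1)
    return torch.stack([x_c - 0.5 * w, y_c - 0.5 * h,
                        x_c + 0.5 * w, y_c + 0.5 * h], dim=-1)

boxes = box_cxcywh_to_xyxy(torch.tensor([[0.5, 0.5, 0.2, 0.4]]))  # relative
img_h, img_w = 480, 640
scale_fct = torch.tensor([img_w, img_h, img_w, img_h], dtype=boxes.dtype)
abs_boxes = boxes * scale_fct[None, :]   # -> [[256., 144., 384., 336.]]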
def visualize_img_with_bbox(img_path, img, dt_instances: Instances,
                            ref_pts=None, gt_boxes=None):
    if dt_instances.has('scores'):
        img_show = draw_bboxes(
            img,
            np.concatenate([dt_instances.boxes,
                            dt_instances.scores.reshape(-1, 1)], axis=-1),
            dt_instances.obj_idxes)
    else:
        img_show = draw_bboxes(img, dt_instances.boxes, dt_instances.obj_idxes)
    if ref_pts is not None:
        img_show = draw_points(img_show, ref_pts)
    if gt_boxes is not None:
        img_show = draw_bboxes(img_show, gt_boxes,
                               identities=np.ones((len(gt_boxes),)) * -1)
    cv2.imwrite(img_path, img_show)
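
# A minimal stand-in for the repo's draw_bboxes helper, assuming boxes are
# absolute xyxy pixel coordinates (with an optional score column that is
# ignored here); the real helper also varies box color by identity.
import cv2
import numpy as np

def draw_bboxes_sketch(img, boxes, identities):
    for box, obj_id in zip(boxes, identities):
        x0, y0, x1, y1 = (int(v) for v in box[:4])
        cv2.rectangle(img, (x0, y0), (x1, y1), color=(0, 255, 0), thickness=2)
        cv2.putText(img, str(int(obj_id)), (x0, max(y0 - 4, 0)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
    return img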
def fn(frame, *args):
    frame = nested_tensor_from_tensor_list([frame])
    tmp = Instances((1, 1), **dict(zip(keys, args)))
    frame_res = self._forward_single_image(frame, tmp)
    return (
        frame_res['pred_logits'],
        frame_res['pred_boxes'],
        frame_res['ref_pts'],
        frame_res['hs'],
        *[aux['pred_logits'] for aux in frame_res['aux_outputs']],
        *[aux['pred_boxes'] for aux in frame_res['aux_outputs']],
    )
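
# The flat-tensor signature of fn matches what torch.utils.checkpoint expects:
# checkpointed functions only pass tensors through, so the Instances fields are
# zipped in and out via `keys`/`args`. A generic usage sketch (heavy, x, and w
# are made-up names, not part of this repo):
import torch
from torch.utils.checkpoint import checkpoint

def heavy(x, w):
    return torch.tanh(x @ w)

x = torch.randn(4, 8, requires_grad=True)
w = torch.randn(8, 8, requires_grad=True)
y = checkpoint(heavy, x, w, use_reentrant=False)  # activations recomputed in backward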
def _targets_to_instances(targets: dict, img_shape) -> Instances:
    gt_instances = Instances(tuple(img_shape))
    gt_instances.boxes = targets['boxes']
    gt_instances.labels = targets['labels']
    gt_instances.obj_ids = targets['obj_ids']
    gt_instances.area = targets['area']
    return gt_instances
def _generate_empty_tracks(self):
    track_instances = Instances((1, 1))
    num_queries, dim = self.query_embed.weight.shape  # (300, 512)
    device = self.query_embed.weight.device

    # fresh detect queries: reference points come from the positional half of
    # the learned query embedding; all bookkeeping fields start empty.
    track_instances.ref_pts = self.transformer.reference_points(
        self.query_embed.weight[:, :dim // 2])
    track_instances.query_pos = self.query_embed.weight
    track_instances.output_embedding = torch.zeros((num_queries, dim // 2), device=device)
    track_instances.obj_idxes = torch.full((len(track_instances),), -1, dtype=torch.long, device=device)
    track_instances.matched_gt_idxes = torch.full((len(track_instances),), -1, dtype=torch.long, device=device)
    track_instances.disappear_time = torch.zeros((len(track_instances),), dtype=torch.long, device=device)
    track_instances.iou = torch.zeros((len(track_instances),), dtype=torch.float, device=device)
    track_instances.scores = torch.zeros((len(track_instances),), dtype=torch.float, device=device)
    track_instances.track_scores = torch.zeros((len(track_instances),), dtype=torch.float, device=device)
    track_instances.pred_boxes = torch.zeros((len(track_instances), 4), dtype=torch.float, device=device)
    track_instances.pred_logits = torch.zeros((len(track_instances), self.num_classes), dtype=torch.float, device=device)

    # per-track memory bank, fully padded (i.e. empty) at initialization.
    mem_bank_len = self.mem_bank_len
    track_instances.mem_bank = torch.zeros((len(track_instances), mem_bank_len, dim // 2), dtype=torch.float32, device=device)
    track_instances.mem_padding_mask = torch.ones((len(track_instances), mem_bank_len), dtype=torch.bool, device=device)
    track_instances.save_period = torch.zeros((len(track_instances),), dtype=torch.float32, device=device)

    return track_instances.to(self.query_embed.weight.device)
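
# In Deformable-DETR-style transformers, the reference_points module used above
# is typically a small linear head mapping the positional half of each query to
# a 2-d point; whether the sigmoid is applied at this call site or inside the
# transformer varies by codebase. A hedged sketch:
import torch
import torch.nn as nn

num_queries, dim = 300, 512
query_embed = nn.Embedding(num_queries, dim)
reference_points = nn.Linear(dim // 2, 2)

ref_pts = reference_points(query_embed.weight[:, :dim // 2]).sigmoid()
# ref_pts: (300, 2) normalized (x, y) starting points for the decoder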
def _forward_single_image(self, samples, track_instances: Instances):
    features, pos = self.backbone(samples)
    src, mask = features[-1].decompose()
    assert mask is not None

    # project each backbone level to the transformer dimension.
    srcs = []
    masks = []
    for l, feat in enumerate(features):
        src, mask = feat.decompose()
        srcs.append(self.input_proj[l](src))
        masks.append(mask)
        assert mask is not None

    # if the transformer expects more feature levels than the backbone
    # provides, synthesize the extra levels from the coarsest map.
    if self.num_feature_levels > len(srcs):
        _len_srcs = len(srcs)
        for l in range(_len_srcs, self.num_feature_levels):
            if l == _len_srcs:
                src = self.input_proj[l](features[-1].tensors)
            else:
                src = self.input_proj[l](srcs[-1])
            m = samples.mask
            mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
            pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
            srcs.append(src)
            masks.append(mask)
            pos.append(pos_l)

    hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = \
        self.transformer(srcs, masks, pos, track_instances.query_pos,
                         ref_pts=track_instances.ref_pts)

    # iterative box refinement: each decoder layer predicts a delta on top of
    # the (inverse-sigmoid) reference points of the previous layer.
    outputs_classes = []
    outputs_coords = []
    for lvl in range(hs.shape[0]):
        if lvl == 0:
            reference = init_reference
        else:
            reference = inter_references[lvl - 1]
        reference = inverse_sigmoid(reference)
        outputs_class = self.class_embed[lvl](hs[lvl])
        tmp = self.bbox_embed[lvl](hs[lvl])
        if reference.shape[-1] == 4:
            tmp += reference
        else:
            assert reference.shape[-1] == 2
            tmp[..., :2] += reference
        outputs_coord = tmp.sigmoid()
        outputs_classes.append(outputs_class)
        outputs_coords.append(outputs_coord)
    outputs_class = torch.stack(outputs_classes)
    outputs_coord = torch.stack(outputs_coords)

    ref_pts_all = torch.cat([init_reference[None], inter_references[:, :, :, :2]], dim=0)
    out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1], 'ref_pts': ref_pts_all[5]}
    if self.aux_loss:
        out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord)

    with torch.no_grad():
        if self.training:
            track_scores = outputs_class[-1, 0, :].sigmoid().max(dim=-1).values
        else:
            track_scores = outputs_class[-1, 0, :, 0].sigmoid()

    track_instances.scores = track_scores
    track_instances.pred_logits = outputs_class[-1, 0]
    track_instances.pred_boxes = outputs_coord[-1, 0]
    track_instances.output_embedding = hs[-1, 0]
    if self.training:
        # the track ids will be assigned by the matcher.
        out['track_instances'] = track_instances
        track_instances = self.criterion.match_for_single_frame(out)
    else:
        # each track will be assigned a unique global id by the track base.
        self.track_base.update(track_instances)
    if self.memory_bank is not None:
        track_instances = self.memory_bank(track_instances)
        # track_instances.track_scores = track_instances.track_scores[..., 0]
        # track_instances.scores = track_instances.track_scores.sigmoid()
        if self.training:
            self.criterion.calc_loss_for_track_scores(track_instances)
    tmp = {}
    tmp['init_track_instances'] = self._generate_empty_tracks()
    tmp['track_instances'] = track_instances
    out_track_instances = self.track_embed(tmp)
    out['track_instances'] = out_track_instances
    return out
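
# inverse_sigmoid as commonly implemented in Deformable-DETR-derived repos: the
# logit function with clamping for numerical stability, shown with a one-step
# box-refinement demo matching the decoder loop above (delta values made up).
import torch

def inverse_sigmoid(x, eps=1e-5):
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
    return torch.log(x1 / x2)

reference = torch.tensor([[0.25, 0.40]])       # normalized reference point
delta = torch.tensor([[0.1, -0.2, 0.0, 0.0]])  # raw bbox head output (cx, cy, w, h)
delta[..., :2] += inverse_sigmoid(reference)   # refine in logit space
refined_box = delta.sigmoid()                  # back to normalized coordinates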