def moc_decode(heat, wh, mov, N=100, K=5):
    """Decode the N best detections from the heatmap, movement and size outputs.

    Args:
        heat: 4-D tensor whose size is unpacked as (batch, cat, height, width).
        wh: size-branch output; gathered at the per-frame locations and viewed
            as (batch, N, 2 * K).
        mov: movement-branch output; gathered at the selected key locations
            and viewed as (batch, N, 2 * K), i.e. one (x, y) pair per frame.
        N: number of candidates kept per sample.
        K: number of frames per clip.

    Returns:
        Tensor of shape (batch, N, 4 * K + 2): for each of the K frames the
        box corners (x1, y1, x2, y2), followed by the score and the class.
    """
    batch, _, height, width = heat.size()

    # perform 'nms' on heatmaps
    heat = _nms(heat)
    scores, index, classes, ys, xs = _topk(heat, K=N)

    mov = _tranpose_and_gather_feature(mov, index)
    mov = mov.view(batch, N, 2 * K)
    mov_copy = mov.clone().view(batch, N, K, 2)

    # Per-frame locations: key location shifted by the predicted movement.
    xs_all = xs.clone().unsqueeze(2).expand(batch, N, K)
    ys_all = ys.clone().unsqueeze(2).expand(batch, N, K)
    xs_all = xs_all + mov_copy[:, :, :, 0]
    ys_all = ys_all + mov_copy[:, :, :, 1]
    # The middle frame keeps the un-shifted key location.
    xs_all[:, :, K // 2] = xs
    ys_all[:, :, K // 2] = ys

    # Flat (row-major) pixel index per frame, clamped to the feature map.
    # Built directly as int64 on the inputs' device instead of going through
    # a float buffer pinned to the default CUDA device.
    flat_index = xs_all.long() + ys_all.long() * width
    flat_index = flat_index.clamp(min=0, max=width * height - 1)
    # Each frame's index is duplicated so it addresses both of its wh channels.
    index_all = torch.stack((flat_index, flat_index), dim=3)
    index_all = index_all.view(batch, N, K * 2)

    # gather wh in each location after movement
    wh = _tranpose_and_gather_feature(wh, index, index_all=index_all)
    wh = wh.view(batch, N, 2 * K)

    classes = classes.view(batch, N, 1).float()
    scores = scores.view(batch, N, 1)
    xs = xs.view(batch, N, 1)
    ys = ys.view(batch, N, 1)

    bboxes = []
    for i in range(K):
        dx = mov[..., 2 * i:2 * i + 1]
        dy = mov[..., 2 * i + 1:2 * i + 2]
        half_w = wh[..., 2 * i:2 * i + 1] / 2
        half_h = wh[..., 2 * i + 1:2 * i + 2] / 2
        bboxes.extend([
            xs + dx - half_w,
            ys + dy - half_h,
            xs + dx + half_w,
            ys + dy + half_h,
        ])
    bboxes = torch.cat(bboxes, dim=2)

    detections = torch.cat([bboxes, scores, classes], dim=2)
    return detections
def forward(self, output, mask, index, target, index_all=None):
    """Return the masked L1 loss between gathered predictions and targets.

    Args:
        output: network output that predictions are gathered from.
        mask: per-candidate validity mask; it is broadcast to the shape of
            the gathered predictions (b, N) -> (b, N, 2*K).
        index: gather locations forwarded to `_tranpose_and_gather_feature`.
        target: regression targets, multiplied element-wise by the mask.
        index_all: optional per-frame gather locations, forwarded unchanged.

    Returns:
        Sum of absolute errors over the masked entries divided by the number
        of masked entries (plus 1e-4 to avoid division by zero).
    """
    pred = _tranpose_and_gather_feature(output, index, index_all=index_all)
    # pred --> b, N, 2*K
    # mask --> b, N ---> b, N, 2*K
    mask = mask.unsqueeze(2).expand_as(pred).float()
    # reduction='sum' is the non-deprecated spelling of size_average=False.
    loss = F.l1_loss(pred * mask, target * mask, reduction='sum')
    loss = loss / (mask.sum() + 1e-4)
    return loss
def forward(self, output, mask, index, target, index_all=None):
    """Return the masked L1 loss between gathered predictions and targets.

    Args:
        output: network output that predictions are gathered from.
        mask: per-candidate validity mask; it is broadcast to the shape of
            the gathered predictions (b, N) -> (b, N, 2*K).
        index: gather locations forwarded to `_tranpose_and_gather_feature`.
        target: regression targets, multiplied element-wise by the mask.
        index_all: optional per-frame gather locations, forwarded unchanged.

    Returns:
        Sum of absolute errors over the masked entries divided by the number
        of masked entries (plus 1e-4 to avoid division by zero).
    """
    # TODO: HARDCODED -- a disabled variant restricted the loss to a subset
    # of frames via index_all[:, :, 4:] and target[:, :, 4:] (when not all
    # ground-truth frames need to be considered).
    pred = _tranpose_and_gather_feature(output, index, index_all=index_all)
    # pred --> b, N, 2*K
    # mask --> b, N ---> b, N, 2*K
    mask = mask.unsqueeze(2).expand_as(pred).float()
    # reduction='sum' is the non-deprecated spelling of size_average=False.
    loss = F.l1_loss(pred * mask, target * mask, reduction='sum')
    loss = loss / (mask.sum() + 1e-4)
    return loss
def moc_decode_multihm(heat, wh, mov, N=100, K=5, num_classes=21):
    """Decode detections when every frame has its own group of heatmaps.

    The channel axis of `heat` is read as K consecutive groups of
    `num_classes` channels, one group per frame. Each group goes through
    `_nms` and `_topN` independently, and the per-frame locations come
    straight from those results rather than from a movement branch.

    Args:
        heat: 4-D tensor whose size is unpacked as (batch, cat, height, width).
        wh: size-branch output; gathered at the per-frame locations and viewed
            as (batch, N, 2 * K).
        mov: unused by this decoder; kept so the signature matches
            `moc_decode`.
        N: number of candidates kept per frame and sample.
        K: number of frames per clip.
        num_classes: channels per frame group in `heat` (previously the
            hard-coded constant 21).

    Returns:
        Tensor of shape (batch, N, 4 * K + 2): for each of the K frames the
        box corners (x1, y1, x2, y2), followed by the score and class taken
        from the middle frame (K // 2).
    """
    batch, _, height, width = heat.size()

    scores_, index_, classes_, ys_, xs_ = [], [], [], [], []
    for i in range(K):
        heat_i = heat[:, i * num_classes:(i + 1) * num_classes, :, :]
        # perform 'nms' on this frame's heatmaps
        heat_i = _nms(heat_i)
        scores, index, classes, ys, xs = _topN(heat_i, N=N)
        scores_.append(scores)
        index_.append(index)
        classes_.append(classes)
        ys_.append(ys)
        xs_.append(xs)

    # ATTENTION: cat then view gave strange output, so stack along a new
    # frame axis instead -> (batch, N, K).
    xs_all = torch.stack(xs_, dim=2)
    ys_all = torch.stack(ys_, dim=2)

    # Score and class are reported from the middle frame.
    classes = classes_[K // 2]
    scores = scores_[K // 2]

    # Flat (row-major) pixel index per frame, clamped to the feature map.
    # Integer coordinates are required to form indices; the un-truncated
    # xs_all / ys_all are kept for the box centres below. The index is built
    # as int64 on the inputs' device instead of a float buffer pinned to the
    # default CUDA device.
    flat_index = xs_all.long() + ys_all.long() * width
    flat_index = flat_index.clamp(min=0, max=width * height - 1)
    # Each frame's index is duplicated so it addresses both of its wh channels.
    index_all = torch.stack((flat_index, flat_index), dim=3)
    index_all = index_all.view(batch, N, K * 2)

    # gather wh in each per-frame location
    # NOTE(review): as in the original, the positional index is the one from
    # the last frame's heatmap; presumably it is ignored when index_all is
    # supplied -- confirm against _tranpose_and_gather_feature.
    wh = _tranpose_and_gather_feature(wh, index_[-1], index_all=index_all)
    wh = wh.view(batch, N, 2 * K)

    classes = classes.view(batch, N, 1).float()
    scores = scores.view(batch, N, 1)

    bboxes = []
    for i in range(K):
        cx = xs_all[:, :, i].unsqueeze(2)
        cy = ys_all[:, :, i].unsqueeze(2)
        half_w = wh[..., 2 * i:2 * i + 1] / 2
        half_h = wh[..., 2 * i + 1:2 * i + 2] / 2
        bboxes.extend([
            cx - half_w,
            cy - half_h,
            cx + half_w,
            cy + half_h,
        ])
    bboxes = torch.cat(bboxes, dim=2)

    detections = torch.cat([bboxes, scores, classes], dim=2)
    return detections
def moc_decode(heat, wh, mov, N=100, K=5):
    """Decode the N best detections, anchoring the key location on the last frame.

    Args:
        heat: 4-D tensor whose size is unpacked as (batch, cat, height, width).
        wh: size-branch output; gathered at the per-frame locations and viewed
            as (batch, N, 2 * K).
        mov: movement-branch output; gathered at the selected key locations
            and viewed as (batch, N, 2 * K), i.e. one (x, y) pair per frame.
        N: number of candidates kept per sample.
        K: number of frames per clip.

    Returns:
        Tensor of shape (batch, N, 4 * K + 2): for each of the K frames the
        box corners (x1, y1, x2, y2), followed by the score and the class.
    """
    batch, _, height, width = heat.size()

    # perform 'nms' on heatmaps
    heat = _nms(heat)
    scores, index, classes, ys, xs = _topN(heat, N=N)

    # ORIG: MOC center branch
    mov = _tranpose_and_gather_feature(mov, index)
    mov = mov.view(batch, N, 2 * K)
    mov_copy = mov.clone().view(batch, N, K, 2)

    # Per-frame locations: key location shifted by the predicted movement.
    xs_all = xs.clone().unsqueeze(2).expand(batch, N, K)
    ys_all = ys.clone().unsqueeze(2).expand(batch, N, K)
    xs_all = xs_all + mov_copy[:, :, :, 0]
    ys_all = ys_all + mov_copy[:, :, :, 1]
    # Last frame has no movement (a disabled variant anchored frame K // 2).
    xs_all[:, :, K - 1] = xs
    ys_all[:, :, K - 1] = ys

    # Flat (row-major) pixel index per frame, clamped to the feature map.
    # Built directly as int64 on the inputs' device instead of going through
    # a float buffer pinned to the default CUDA device.
    flat_index = xs_all.long() + ys_all.long() * width
    flat_index = flat_index.clamp(min=0, max=width * height - 1)
    # Each frame's index is duplicated so it addresses both of its wh channels.
    index_all = torch.stack((flat_index, flat_index), dim=3)
    index_all = index_all.view(batch, N, K * 2)

    # gather wh in each location after movement
    wh = _tranpose_and_gather_feature(wh, index, index_all=index_all)
    wh = wh.view(batch, N, 2 * K)

    classes = classes.view(batch, N, 1).float()
    scores = scores.view(batch, N, 1)
    xs = xs.view(batch, N, 1)
    ys = ys.view(batch, N, 1)

    # ORIG: boxes are centred on the key location plus the movement offset.
    bboxes = []
    for i in range(K):
        dx = mov[..., 2 * i:2 * i + 1]
        dy = mov[..., 2 * i + 1:2 * i + 2]
        half_w = wh[..., 2 * i:2 * i + 1] / 2
        half_h = wh[..., 2 * i + 1:2 * i + 2] / 2
        bboxes.extend([
            xs + dx - half_w,
            ys + dy - half_h,
            xs + dx + half_w,
            ys + dy + half_h,
        ])
    bboxes = torch.cat(bboxes, dim=2)

    detections = torch.cat([bboxes, scores, classes], dim=2)
    return detections