def forward(
    self,
    images: torch.Tensor,
    boxes: torch.Tensor,
    box_mask: torch.LongTensor,
    classes: torch.Tensor = None,
    segms: torch.Tensor = None,
):
    """
    Turn every valid box of a batch into an object representation.

    :param images: [batch_size, 3, im_height, im_width]
    :param boxes: [batch_size, max_num_objects, 4] Padded boxes
    :param box_mask: [batch_size, max_num_objects] Mask for whether or not each box is OK
    :param classes: per-box labels, looked up as classes[batch, box]. It is read
                    unconditionally, so the ``None`` default cannot actually be used.
    :param segms: per-box masks, looked up per (batch, box) pair. Must be given
                  whenever ``self.mask_upsample`` is set.
    :return: dict with the pooled features before downsampling ('obj_reps_raw'),
             the padded object reps [batch_size, max_num_objects, dim] ('obj_reps'),
             the class logits and labels of the valid boxes, and the
             classification loss as a one-element tensor ('cnn_regularization_loss').
    """
    # Feed the images through the feature-extraction stages in their fixed order.
    for stage in (
        self.pre_backbone,
        self.layer2, self.cvm_2,
        self.layer3, self.cvm_3,
        self.layer4, self.cvm_4,
    ):
        images = stage(images)
    img_feats = images

    # One row per valid box: column 0 is the batch index, column 1 the box index.
    box_inds = box_mask.nonzero()
    assert box_inds.shape[0] > 0
    batch_idx = box_inds[:, 0]
    box_idx = box_inds[:, 1]

    # Each RoI row is the batch index (cast to the box dtype) followed by the box itself.
    batch_column = batch_idx[:, None].type(boxes.dtype)
    rois = torch.cat((batch_column, boxes[batch_idx, box_idx]), 1)

    # Object class and segmentation representations
    roi_align_res = self.roi_align(img_feats.float(), rois.float())
    if self.mask_upsample is not None:
        assert segms is not None
        # Shift the selected masks by -0.5 before upsampling them.
        segms_indexed = segms[batch_idx, None, box_idx] - 0.5
        # Only the first ``mask_dims`` channels receive the mask signal (added in place).
        roi_align_res[:, :self.mask_dims] += self.mask_upsample(segms_indexed)
    post_roialign = self.after_roi_align(roi_align_res)

    # Auxiliary classification loss, keeping the pooled features predictive of the class.
    obj_logits = self.regularizing_predictor(post_roialign)
    obj_labels = classes[batch_idx, box_idx]
    # ``[None]`` turns the scalar loss into a one-element tensor.
    cnn_regularization = F.cross_entropy(obj_logits, obj_labels, reduction='mean')[None]

    # Optionally append a label embedding to the pooled features.
    if self.object_embed is None:
        feats_to_downsample = post_roialign
    else:
        label_embeds = self.object_embed(obj_labels)
        feats_to_downsample = torch.cat((post_roialign, label_embeds), -1)
    roi_aligned_feats = self.obj_downsample(feats_to_downsample)

    # Regroup the flat per-box features into a padded per-image sequence.
    boxes_per_image = box_mask.sum(1).tolist()
    obj_reps = pad_sequence(roi_aligned_feats, boxes_per_image)

    return {
        'obj_reps_raw': post_roialign,
        'obj_reps': obj_reps,
        'obj_logits': obj_logits,
        'obj_labels': obj_labels,
        'cnn_regularization_loss': cnn_regularization,
    }
def decode_spans(pred_tags: torch.LongTensor, lens: Union[List[int], torch.LongTensor]):
    """
    Convert per-position boundary tags into ``(start, end)`` spans.

    Every non-zero entry of ``pred_tags`` marks the start of a span. A span runs
    up to the next start in the same row, and the last span of a row ends at that
    row's length. A row without any usable start yields the single span
    ``(0, length)``. Positions before the first start of a row are not covered by
    any span.

    :param pred_tags: 2-D tensor of tags, one row per sequence
    :param lens: true (unpadded) length of each sequence, as a list or a tensor
    :return: one list of ``(start, end)`` tuples per sequence
    """
    if isinstance(lens, torch.Tensor):
        lens = lens.tolist()

    # nonzero() lists positions in row-major order, so the starts collected for
    # each row are already ascending.
    starts_by_row = defaultdict(list)
    for batch, offset in pred_tags.nonzero(as_tuple=False).tolist():
        # Tags in the padded tail (offset >= length) would otherwise create
        # inverted spans such as (4, 3), so they are ignored.
        if offset < lens[batch]:
            starts_by_row[batch].append(offset)

    # Default: the whole sequence is one span.
    batch_pred_spans = [[(0, length)] for length in lens]
    for batch, offsets in starts_by_row.items():
        # Pair each start with the next start; the final span ends at the length.
        ends = offsets[1:] + [lens[batch]]
        batch_pred_spans[batch] = list(zip(offsets, ends))
    return batch_pred_spans
def forward(
    self,
    images: torch.Tensor,
    boxes: torch.Tensor,
    box_mask: torch.LongTensor,
    classes: torch.Tensor = None,
    segms: torch.Tensor = None,
):
    """
    Extract a representation for every valid box of a batch.

    :param images: [batch_size, 3, im_height, im_width]
    :param boxes: [batch_size, max_num_objects, 4] Padded boxes
    :param box_mask: [batch_size, max_num_objects] Mask for whether or not each box is OK
    :param classes: per-box labels, looked up as classes[batch, box]. It is read
                    unconditionally, so the ``None`` default cannot actually be used.
    :param segms: per-box masks; must be given whenever ``self.mask_upsample`` is set
    :return: dict with 'obj_reps_raw' (features before downsampling), 'obj_reps'
             (padded object reps [batch_size, max_num_objects, dim]), 'obj_logits',
             'obj_labels' and 'cnn_regularization_loss' (one-element tensor)
    """
    # Original author's note: [batch_size, 2048, im_height // 32, im_width // 32]
    img_feats = self.backbone(images)

    # 2-D index tensor with one row per valid (non-padding) box, e.g. a row
    # [6, 3] refers to the 4th box of the 7th sample in the batch.
    box_inds = box_mask.nonzero()
    assert box_inds.shape[0] > 0  # the batch must contain at least one box

    # [nbox, 5]: the batch index of each box followed by its four coordinates.
    rois = torch.cat((
        box_inds[:, 0, None].type(boxes.dtype),
        boxes[box_inds[:, 0], box_inds[:, 1]],
    ), 1)

    # Object class and segmentation representations.
    # Original author's note: result is [nbox, 1024, 7, 7].
    roi_align_res = self.roi_align(img_feats, rois)
    if self.mask_upsample is not None:
        assert segms is not None
        # Gather the masks of all non-padding boxes
        # (original author's note: [nbox, 1, 14, 14]) and shift them by -0.5.
        segms_indexed = segms[box_inds[:, 0], None, box_inds[:, 1]] - 0.5
        # The mask signal is added (in place) to the first ``mask_dims`` channels only.
        roi_align_res[:, :self.mask_dims] += self.mask_upsample(segms_indexed)

    # Original author's note: [nbox, 2048]
    post_roialign = self.after_roi_align(roi_align_res)

    # Add some regularization, encouraging the model to keep giving decent enough
    # predictions: the class of each box is predicted from its pooled features.
    obj_logits = self.regularizing_predictor(post_roialign)
    obj_labels = classes[box_inds[:, 0], box_inds[:, 1]]
    # ``reduction='mean'`` is the supported spelling of the deprecated
    # ``size_average=True`` and avoids the UserWarning it triggered.
    # ``[None]`` turns the scalar loss into a one-element tensor.
    cnn_regularization = F.cross_entropy(obj_logits, obj_labels, reduction='mean')[None]

    # When a label embedding is configured, append it to the visual features.
    if self.object_embed is None:
        feats_to_downsample = post_roialign
    else:
        feats_to_downsample = torch.cat((post_roialign, self.object_embed(obj_labels)), -1)
    roi_aligned_feats = self.obj_downsample(feats_to_downsample)

    # Reshape into a padded sequence - this is expensive and annoying but easier
    # to implement and debug. This restores the per-sample batch layout of the
    # flat box features.
    obj_reps = pad_sequence(roi_aligned_feats, box_mask.sum(1).tolist())
    return {
        'obj_reps_raw': post_roialign,
        'obj_reps': obj_reps,
        'obj_logits': obj_logits,
        'obj_labels': obj_labels,
        'cnn_regularization_loss': cnn_regularization,
    }
def activation_loss(x: Tensor, y: LongTensor) -> Tensor:
    """
    Compute an activation-based loss over a batch of two-class samples.

    Samples are split by label: entries with ``y != 0`` form the positive group
    and entries with ``y != 1`` form the negative group (for 0/1 labels these are
    disjoint). For each group, the activation for its own class is penalised by
    its absolute distance from 1, and the activation for the other class is added
    as a penalty. The summed penalties are divided by the batch size.

    :param x: per-sample features, indexed along the first dimension
    :param y: per-sample labels, expected to be 0 or 1
    :return: scalar loss tensor
    """
    dev = x.device

    # Flat sample indices of each group.
    pos_idx = y.nonzero().reshape(-1)
    neg_idx = (y - 1).nonzero().reshape(-1)

    neg_x = x[neg_idx]
    pos_x = x[pos_idx]
    num_neg = neg_x.size(0)
    num_pos = pos_x.size(0)

    # ``act(features, class_ids)`` is evaluated for both classes on both groups.
    neg_own = act(neg_x, zeros(num_neg, dev))
    neg_other = act(neg_x, ones(num_neg, dev))
    pos_own = act(pos_x, ones(num_pos, dev))
    pos_other = act(pos_x, zeros(num_pos, dev))

    neg_penalty = (neg_own - 1).abs() + neg_other
    pos_penalty = (pos_own - 1).abs() + pos_other

    total = neg_penalty.sum() + pos_penalty.sum()
    return total / y.size(0)
def forward(
    self,
    img_feats: torch.Tensor,
    boxes: torch.Tensor,
    box_mask: torch.LongTensor,
    obj_labels: torch.LongTensor,
):
    """
    Build object representations from pre-extracted per-box features.

    :param img_feats: [batch_size, max_num_objects, 2048] per-box features
    :param boxes: [batch_size, max_num_objects, 7] Padded boxes
    :param box_mask: [batch_size, max_num_objects] Mask for whether or not each box is OK
    :param obj_labels: per-box labels, looked up as obj_labels[batch, box]; only
                       read when ``self.semantic`` is truthy
    :return: dict with 'obj_reps' (object reps [batch_size, max_num_objects, dim])
             and 'cnn_regularization_loss', which is always ``None`` here
    """
    # One row per valid box: (batch index, box index).
    valid = box_mask.nonzero()
    batch_idx, box_idx = valid[:, 0], valid[:, 1]

    obj_feats = img_feats[batch_idx, box_idx]
    if self.semantic:
        # Append the label embedding of each box to its features.
        label_embeds = self.object_embed(obj_labels[batch_idx, box_idx])
        obj_feats = torch.cat((obj_feats, label_embeds), -1)
    roi_aligned_feats = self.ln_f(self.obj_downsample(obj_feats))

    if self.use_bbox:
        # Project the box values, pass them through the same ``ln_f`` module and add them on.
        selected_boxes = boxes[batch_idx, box_idx]
        box_feats = self.ln_f(self.bbox_upsample(selected_boxes))
        roi_aligned_feats = roi_aligned_feats + box_feats

    # Regroup the flat per-box features into a padded per-sample sequence.
    boxes_per_sample = box_mask.sum(1).tolist()
    obj_reps = pad_sequence(roi_aligned_feats, boxes_per_sample)

    # No auxiliary classification loss is computed in this variant.
    return {
        'obj_reps': obj_reps,
        'cnn_regularization_loss': None,
    }
def forward(
    self,
    images: torch.Tensor,
    boxes: torch.Tensor,
    box_mask: torch.LongTensor,
):
    """
    Extract a representation for every valid box of a batch.

    :param images: [batch_size, 3, im_height, im_width]
    :param boxes: [batch_size, max_num_objects, 4]
    :param box_mask: [batch_size, max_num_objects]
    :return: dict with 'obj_reps_raw' (features before downsampling) and
             'obj_reps' ([batch_size, max_num_objects, dim])
    """
    img_feats = self.backbone(images)

    # [num nonzero, 2]: one (batch index, box index) row per valid box.
    valid = box_mask.nonzero()
    assert valid.shape[0] > 0  # at least 1 masked index
    batch_idx = valid[:, 0]
    box_idx = valid[:, 1]

    # [n, 1] batch indices (cast to the box dtype) + [n, 4] boxes -> [n, 5] RoIs.
    batch_column = batch_idx[:, None].type(boxes.dtype)
    rois = torch.cat((batch_column, boxes[batch_idx, box_idx]), 1)

    post_roi_align = self.after_roi_align(self.roi_align(img_feats, rois))
    roi_aligned_feats = self.obj_downsample(post_roi_align)

    # Regroup the flat per-box features into a padded per-image sequence.
    boxes_per_image = box_mask.sum(1).tolist()
    obj_reps = pad_sequence(roi_aligned_feats, boxes_per_image)

    return {
        'obj_reps_raw': post_roi_align,
        'obj_reps': obj_reps,
    }