def forward(self, im_data, im_info, gt_boxes=None, gt_ishard=None, dontcare_areas=None):
    """
    Run the region proposal network on one image batch.

    :param im_data: image array; converted with ``network.np_to_variable``
        and permuted with ``(0, 3, 1, 2)``, so it must be 4-D
        (presumably channels-last on input -- confirm against the caller).
    :param im_info: image metadata, forwarded untouched to the proposal and
        anchor-target layers.
    :param gt_boxes: ground-truth boxes; required when ``self.training``.
    :param gt_ishard: forwarded to ``self.anchor_target_layer``.
    :param dontcare_areas: forwarded to ``self.anchor_target_layer``.
    :return: ``(features, rois)`` -- the backbone features and the output of
        ``self.proposal_layer``.
    :raises AssertionError: in training mode when ``gt_boxes`` is ``None``.

    In training mode the RPN losses are stored on ``self.cross_entropy`` and
    ``self.loss_box`` as a side effect.
    """
    im_data = network.np_to_variable(im_data, is_cuda=True)
    im_data = im_data.permute(0, 3, 1, 2)
    features = self.features(im_data)

    rpn_conv1 = self.conv1(features)

    # rpn score
    rpn_cls_score = self.score_conv(rpn_conv1)
    rpn_cls_score_reshape = self.reshape_layer(rpn_cls_score, 2)
    # Pass the softmax axis explicitly: calling F.softmax without ``dim`` is
    # deprecated. Assumes reshape_layer(..., 2) puts the two-way (bg/fg)
    # scores on axis 1 of a 4-D tensor -- TODO confirm.
    rpn_cls_prob = F.softmax(rpn_cls_score_reshape, dim=1)
    rpn_cls_prob_reshape = self.reshape_layer(
        rpn_cls_prob, len(self.anchor_scales) * 3 * 2)

    # rpn boxes
    rpn_bbox_pred = self.bbox_conv(rpn_conv1)

    # proposal layer
    cfg_key = 'TRAIN' if self.training else 'TEST'
    rois = self.proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info,
                               cfg_key, self._feat_stride, self.anchor_scales)

    # generating training labels and build the rpn loss
    if self.training:
        assert gt_boxes is not None
        rpn_data = self.anchor_target_layer(rpn_cls_score, gt_boxes, gt_ishard,
                                            dontcare_areas, im_info,
                                            self._feat_stride, self.anchor_scales)
        self.cross_entropy, self.loss_box = self.build_loss(
            rpn_cls_score_reshape, rpn_bbox_pred, rpn_data)

    return features, rois
def compute_value(self, value, relation, geometry_weight, pooled_features):
    """
    Aggregate per-relation weighted values and add them to the pooled features.

    For each of the ``self.Nr`` relations, a ``(count, count)`` weight matrix is
    obtained from ``self.p_softmax`` and multiplied with that relation's slice
    of ``value``; the ``Nr`` results are concatenated column-wise and
    ``pooled_features`` is added on top.

    :param value: tensor-like with ``.data``; after conversion it is sliced
        column-wise, so it must be 2-D with at least ``Nr * d_v`` columns.
    :param relation: tensor-like with ``.data``; indexed as ``relation[i]`` and
        ``shape[1]`` is used as ``count`` (assumes shape ``(Nr, count, count)``
        -- TODO confirm).
    :param geometry_weight: tensor-like with ``.data``; squeezed and then
        indexed as ``[:, i]`` for each relation.
    :param pooled_features: tensor-like with ``.data``; added element-wise to
        the ``(count, Nr * d_v)`` result.
    :return: the result of ``np_to_variable(feature, is_cuda=True)``.
    """
    relation = relation.data.cpu().numpy()
    # NOTE(review): squeeze() drops every size-1 axis, so a single object
    # (count == 1) may lose an axis the [:, i] indexing below needs -- verify.
    geometry_weight = np.squeeze(geometry_weight.data.cpu().numpy())
    pooled_features = pooled_features.data.cpu().numpy()
    value = value.data.cpu().numpy()

    count = relation.shape[1]
    weight = np.zeros([self.Nr, count, count])
    add_feature = np.zeros([self.Nr, count, self.d_v])

    for i in range(self.Nr):
        weight[i] = self.p_softmax(relation[i], geometry_weight[:, i])
        # Each relation owns a d_v-wide column slice of ``value``. The slice
        # width must be d_v (not Nr) to match the (count, d_v) output buffer.
        add_feature[i] = np.dot(
            weight[i], value[:, i * self.d_v:(i + 1) * self.d_v])

    # Concatenate the Nr per-relation outputs side by side: (count, Nr * d_v).
    feature = np.hstack([add_feature[i] for i in range(self.Nr)])
    feature += pooled_features
    return np_to_variable(feature, is_cuda=True)
def forward(self, roidb, vcoco_ann):
    """
    Compute the summed training loss for one (roidb, vcoco_ann) datapoint.

    Builds a {"name": loss} mapping from the datapoint. The losses are summed
    for the return value, but are kept individually so that
    ``self.log_values`` can log them.

    :param roidb: sequence holding exactly one roidb entry; forwarded to
        ``get_minibatch``.
    :param vcoco_ann: annotation handed to ``self.vcoco_translator`` to obtain
        ground-truth action labels, non-agent role locations and human/object
        pairs.
    :return: the sum of all collected losses. "rpn_ce", "rpn_lb", "f_ce",
        "f_lb" and "action_ce" are always present; "location_l1" and
        "interaction_ce" are added only when the corresponding ground truth
        exists.
    :raises AssertionError: if ``len(roidb) != 1`` or the training config does
        not match what this code requires.
    """
    # TODO freeze the RPN? If so, why?
    # TODO what is purpose of the non-gt roi stuff?
    # TODO do I need to change this to use non-gt-rois??? probably.
    # Also note that the system in HOI paper trained on both RPN proposals
    # and GT proposals. We're training on GT and some random stuff... ?
    # Let's punt for now.
    assert len(roidb) == 1, "Invalid len(roidb) > 1"
    # This code requires it
    assert cfg.TRAIN.FG_FRACTION == 0.25
    assert cfg.TRAIN.FG_THRESH == 0.5
    assert cfg.TRAIN.HAS_RPN, "Training this model requires an RPN"

    ret = {}

    blobs = get_minibatch(roidb, len(self.model.detection_branch.classes))
    im_data = blobs['data']
    im_info = blobs['im_info']
    gt_boxes = blobs['gt_boxes']
    gt_ishard = blobs['gt_ishard']
    dontcare_areas = blobs['dontcare_areas']

    # Get cross-entropy and box loss for rpn and faster-rcnn networks.
    # Since the RPN is in training mode, this will create rois that are
    # partly from GT and partly from RPN.
    (_cls_prob, _bbox_pred, rois, features,
     rpn_ce, rpn_lb, f_ce, f_lb, roi_data) = faster_rcnn_forward(
        self.model.detection_branch, im_data, im_info, gt_boxes,
        gt_ishard, dontcare_areas)
    ret.update({
        "rpn_ce": rpn_ce,
        "rpn_lb": rpn_lb,
        "f_ce": f_ce,
        "f_lb": f_lb})

    # TODO normally, we will get ROIs from elsewhere. When that happens,
    # move this code.

    # roi_data consists of:
    # rois, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights
    person_index = self.vcoco_translator.nouns_2_ids["person"]
    person_roi_indices = roi_data[1] == person_index

    # Choose at most 16 people.
    # reshape(-1) keeps ``nonz`` 1-D even when squeeze() collapses a single
    # hit to a 0-d array; np.random.choice would otherwise interpret that
    # scalar k as "sample from arange(k)" and return the wrong ROI index.
    nonz = torch.nonzero(person_roi_indices.data.squeeze().cpu()) \
        .squeeze().numpy().reshape(-1)
    if nonz.size != 0:
        choices = np.random.choice(nonz, min(nonz.size, 16), replace=False)
        person_roi_indices = torch.from_numpy(choices).cuda(0)
    # NOTE(review): when no person ROI is found, the boolean mask is used
    # as-is and b_h is presumably empty -- confirm the branch handles that.
    b_h = rois[person_roi_indices]

    action_scores, action_locations = self.model.human_centric_branch(
        b_h, features)

    # Get ground-truth and calculate loss.
    # This is [B=1 x NActions]
    gt_action_scores = self.vcoco_translator.get_action_labels(vcoco_ann)
    gt_action_scores = network.np_to_variable(gt_action_scores)
    gt_action_scores = gt_action_scores.unsqueeze(0).expand_as(
        action_scores)
    action_ce = F.binary_cross_entropy(action_scores, gt_action_scores)
    ret.update({
        "action_ce": action_ce,
    })

    # Get ground-truth role locations for non-agent roles.
    # It will be a [B=1 x NActionNonagentRoles x 5] structure.
    # The actions for 2-obj action things are treated uniquely.
    # TODO we probably want to make the GT labels relative to the agent?
    gt_action_locations = \
        self.vcoco_translator.get_action_nonagent_role_locations(vcoco_ann)
    gt_action_locations = gt_action_locations.squeeze(0)

    # Choose the action locations that correspond to a ground-truth action:
    # column 0 flags the action as present, column 1 is NaN when there is no
    # localized role for it.
    chosen_locations = np.where(np.logical_and(
        gt_action_locations[:, 0] == 1,
        np.logical_not(np.isnan(gt_action_locations[:, 1]))))
    assert len(chosen_locations) == 1, "Expected size-1 tuple"
    gt_action_locations = gt_action_locations[chosen_locations[0], 1:]

    # It's possible that there are no actions with localized information.
    if gt_action_locations.size != 0:
        gt_action_locations = network.np_to_variable(
            gt_action_locations).unsqueeze(0)
        # NOTE(review): this numpy round-trip rebuilds action_locations from
        # raw values, so it looks like location_l1 cannot back-propagate into
        # the human-centric branch -- confirm whether that is intended.
        action_locations = action_locations.cpu().data.numpy()
        action_locations = action_locations[:, chosen_locations[0], :]
        action_locations = network.np_to_variable(action_locations)
        # Expand in the batch dimension.
        gt_action_locations = gt_action_locations.expand_as(action_locations)
        if gt_action_locations.dim() != 0:
            location_l1 = F.smooth_l1_loss(
                action_locations, gt_action_locations)
            ret.update({
                "location_l1": location_l1,
            })

    # TODO the last part is confusing. I'll take it to mean that b_h and b_o
    # must both be taken from ground truth labels.
    # (But possibly, they mean that only the cases where the action has a
    # positive label for those boxes)
    # TODO consider removing this part of the system...?
    # TODO this will give another gradient to human branch... ???
    b_h, b_o, gt_actions = self.vcoco_translator.get_human_object_gt_pairs(
        vcoco_ann)
    if b_h is not None:
        b_h, b_o, gt_actions = map(
            network.np_to_variable, [b_h, b_o, gt_actions])

        h_action_scores, _ = self.model.human_centric_branch(
            b_h, features)
        h_action_scores = \
            self.vcoco_translator.human_scores_to_agentrolenonagent(
                h_action_scores.cpu().data.numpy())
        h_action_scores = network.np_to_variable(h_action_scores)
        scores = self.model.interaction_branch(h_action_scores, b_o, features)
        interaction_ce = F.binary_cross_entropy(scores, gt_actions)
        ret.update({"interaction_ce": interaction_ce})

    self.log_values(ret)
    # values() (rather than the Python-2-only itervalues()) sums the same
    # losses and also runs on Python 3.
    loss = sum(ret.values())
    return loss
def forward(self, im_data, im_info, gt_regions=None, use_beam_search=False, graph_generation=False):
    """
    Extract fc7 region features for an image (inference-only path).

    The module is forced into evaluation mode (``self.training = False``).
    Region ROIs come from ``self.rpn``, or from ``gt_regions`` when given;
    they are passed through ``self.proposal_target_layer``, ROI-pooled,
    flattened and run through ``fc6_region`` / ``fc7_region``.

    :param im_data: forwarded to ``self.rpn``.
    :param im_info: forwarded to ``self.rpn``.
    :param gt_regions: optional 2-D array; when not ``None`` its first four
        columns, prefixed with a zero column, replace the RPN proposals.
    :param use_beam_search: unused by the currently active code.
    :param graph_generation: forwarded to ``self.proposal_target_layer``.
    :return: the region features after ``fc7_region`` (plus optional dropout)
        and a final ReLU.

    When the module-level ``TIME_IT`` flag is set, per-stage timings are
    printed using ``self.timer``.
    """
    def _lap(fmt):
        # Print the time elapsed since the last tic, when profiling is on.
        if TIME_IT:
            torch.cuda.synchronize()
            print(fmt % self.timer.toc(average=False))

    self.training = False

    self.timer.tic()
    features, region_rois = self.rpn(im_data, im_info, gt_regions=gt_regions)

    # Ground-truth regions, when supplied, override the RPN proposals.
    if not self.training and gt_regions is not None:
        zero_column = np.zeros((gt_regions.shape[0], 1), dtype=gt_regions.dtype)
        gt_rois = np.hstack((zero_column, gt_regions[:, :4]))
        region_rois = network.np_to_variable(gt_rois, is_cuda=True)

    _lap('\t[RPN]: %.3fs')

    self.timer.tic()
    roi_data_region = self.proposal_target_layer(
        region_rois, gt_regions, self.n_classes_obj, self.voc_sign,
        self.training, graph_generation=graph_generation)
    _lap('\t[Proposal]: %.3fs')

    self.timer.tic()
    region_rois = roi_data_region[0]

    # (The object and phrase branches -- ROI pooling, fc6, fc7 -- that used
    # to run here are disabled; only the region branch is active.)

    # roi pool
    pooled_region_features = self.roi_pool_region(features, region_rois)
    _lap('\t\t[region_pooling]: %.3fs')

    n_rois = pooled_region_features.size()[0]
    pooled_region_features = pooled_region_features.view(n_rois, -1)
    _lap('\t\t[region_feature_view]: %.3fs')

    pooled_region_features = self.fc6_region(pooled_region_features)
    _lap('\t\t[region_feature_fc6]: %.3fs')
    if self.dropout:
        pooled_region_features = F.dropout(
            pooled_region_features, training=self.training)

    pooled_region_features = self.fc7_region(pooled_region_features)
    _lap('\t\t[region_feature_fc7]: %.3fs')
    if self.dropout:
        pooled_region_features = F.dropout(
            pooled_region_features, training=self.training)

    # bounding box regression before message passing
    # NOTE(review): bbox_region is overwritten with zeros further down and is
    # not part of the return value -- looks like leftover work; confirm.
    bbox_region = self.bbox_region(F.relu(pooled_region_features))
    _lap('\t[Pre-MPS]: %.3fs')

    self.timer.tic()
    # (The hierarchical message passing loop is disabled.)
    _lap('\t[Passing]: %.3fs')

    self.timer.tic()
    pooled_region_features = F.relu(pooled_region_features)

    # (Object / predicate classification scores are disabled.)
    n_regions = pooled_region_features.size(0)
    bbox_region = Variable(torch.zeros(n_regions, 4).cuda())
    # NOTE(review): computed but only consumed by the disabled loss /
    # caption code below -- confirm it is still needed.
    cls_objectiveness_region = self.objectiveness(pooled_region_features)
    _lap('\t[Post-MPS]: %.3fs')

    # todo : when doing end to end training, handle following.
    # it has loss_region_box, objectiveness_loss
    # (The training losses and the caption search that used to follow are
    # disabled; the previous return value was
    # (region_caption, bbox_region, region_rois, caption_logprobs).)
    return pooled_region_features
def forward(self, im_data, im_info, gt_boxes=None, gt_ishard=None, dontcare_areas=None):
    """
    Compute backbone features and RPN proposals; in training mode also
    store the RPN losses on ``self.cross_entropy`` / ``self.loss_box``.

    Shapes quoted below are the example sizes noted by the original author.

    :param im_data: (1, 600, 800, 3) numpy
    :param im_info: (1, 3) numpy
    :param gt_boxes: ground-truth boxes, required when training
    :param gt_ishard: passed through to the anchor target layer
    :param dontcare_areas: passed through to the anchor target layer
    :return:
            feature (1, 512, 37, 50) tensor
            roi (proposals, 5) tensor
    """
    # Channels-first image tensor: (1, 3, 600, 800)
    image = network.np_to_variable(im_data, is_cuda=True).permute(0, 3, 1, 2)

    # Backbone features (1, 512, 37, 50), then the shared RPN conv.
    features = self.features(image)
    rpn_hidden = self.conv1(features)

    # --- objectness scores: (1, 9_anchors*2, 37, 50) ---
    cls_score = self.score_conv(rpn_hidden)
    # Fold to (1, 2, 9*37, 50) so the softmax is a 2-way bg/fg decision.
    cls_score_2way = self.reshape_layer(cls_score, 2)
    cls_prob_2way = F.softmax(cls_score_2way, dim=1)
    # Unfold back to (1, 2*9, 37, 50).
    n_score_channels = len(self.anchor_scales) * 3 * 2
    cls_prob = self.reshape_layer(cls_prob_2way, n_score_channels)

    # --- box deltas: (1, 4*9_anchors, 37, 50) ---
    bbox_pred = self.bbox_conv(rpn_hidden)

    # --- proposals: (proposals, 5) ---
    phase = 'TRAIN' if self.training else 'TEST'
    rois = self.proposal_layer(cls_prob, bbox_pred, im_info, phase,
                               self._feat_stride, self.anchor_scales)

    if not self.training:
        return features, rois

    # Training only: build anchor targets and the RPN loss.
    assert gt_boxes is not None
    # 1. Label each anchor box (bg/fg).
    # 2. Compute each box's regression distance to its gt box.
    rpn_data = self.anchor_target_layer(
        cls_score, gt_boxes, gt_ishard, dontcare_areas, im_info,
        self._feat_stride, self.anchor_scales)
    self.cross_entropy, self.loss_box = self.build_loss(
        cls_score_2way, bbox_pred, rpn_data)

    return features, rois