Example #1
    def forward(self,
                im_data,
                im_info,
                gt_boxes=None,
                gt_ishard=None,
                dontcare_areas=None):
        im_data = network.np_to_variable(im_data, is_cuda=True)
        im_data = im_data.permute(0, 3, 1, 2)
        features = self.features(im_data)

        rpn_conv1 = self.conv1(features)

        # rpn score
        rpn_cls_score = self.score_conv(rpn_conv1)
        rpn_cls_score_reshape = self.reshape_layer(rpn_cls_score, 2)
        rpn_cls_prob = F.softmax(rpn_cls_score_reshape, dim=1)
        rpn_cls_prob_reshape = self.reshape_layer(
            rpn_cls_prob,
            len(self.anchor_scales) * 3 * 2)

        # rpn boxes
        rpn_bbox_pred = self.bbox_conv(rpn_conv1)

        # proposal layer
        cfg_key = 'TRAIN' if self.training else 'TEST'
        rois = self.proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred,
                                   im_info, cfg_key, self._feat_stride,
                                   self.anchor_scales)

        # generate training labels and build the RPN loss
        if self.training:
            assert gt_boxes is not None
            rpn_data = self.anchor_target_layer(rpn_cls_score, gt_boxes,
                                                gt_ishard, dontcare_areas,
                                                im_info, self._feat_stride,
                                                self.anchor_scales)
            self.cross_entropy, self.loss_box = self.build_loss(
                rpn_cls_score_reshape, rpn_bbox_pred, rpn_data)

        return features, rois
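
The reshape_layer used above is assumed to implement the standard Faster R-CNN reshape trick: fold the per-anchor channels into the height axis so a 2-way (bg/fg) softmax can run over dim 1. A minimal sketch under that assumption (the name and contract follow this example, not a specific library):

    import torch

    def reshape_layer(x, d):
        # (N, C, H, W) -> (N, d, (C // d) * H, W); a softmax over dim=1
        # then scores bg/fg for every anchor at every spatial position.
        n, c, h, w = x.size()
        return x.view(n, d, (c // d) * h, w)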
Example #2
    def compute_value(self, value, relation, geometry_weight, pooled_features):
        relation = relation.data.cpu().numpy()
        geometry_weight = np.squeeze(geometry_weight.data.cpu().numpy())
        pooled_features = pooled_features.data.cpu().numpy()
        value = value.data.cpu().numpy()

        count = relation.shape[1]
        weight = np.zeros([self.Nr, count, count])
        add_feature = np.zeros([self.Nr, count, self.d_v])
        # One attention head per relation: softmax the appearance/geometry
        # scores, weight the corresponding slice of the value matrix, then
        # concatenate the per-head features along the feature axis.
        # (Note: the slice stride is self.Nr, which lines up with the
        # [count, d_v] target only when self.Nr == self.d_v.)
        for i in range(self.Nr):
            weight[i] = self.p_softmax(relation[i], geometry_weight[:, i])
            add_feature[i] = np.dot(weight[i],
                                    value[:, i * self.Nr:(i + 1) * self.Nr])
        feature = np.hstack(add_feature)
        feature += pooled_features
        feature = np_to_variable(feature, is_cuda=True)
        return feature
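
Every example here leans on an np_to_variable helper. A plausible definition consistent with how it is called (pre-0.4 PyTorch, where Variable still wrapped tensors; treat the exact signature as an assumption):

    import numpy as np
    import torch
    from torch.autograd import Variable

    def np_to_variable(x, is_cuda=True, dtype=torch.FloatTensor):
        # Wrap a numpy array as an autograd Variable, optionally on the GPU.
        v = Variable(torch.from_numpy(x).type(dtype))
        if is_cuda:
            v = v.cuda()
        return v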
Example #3
    def forward(self, roidb, vcoco_ann):
        """
        Get a {"name": Loss} mapping from a given x,y datapoint.
        The losses will later be summed, but it's convenient to store them
        individually for logging purposes.
        """
        # TODO freeze the RPN? If so, why?
        # TODO what is the purpose of the non-gt roi stuff?
        # TODO do I need to change this to use non-gt rois? Probably.
        # Note that the system in the HOI paper trained on both RPN proposals
        # and GT proposals, while we're training on GT boxes plus some random
        # ones. Let's punt on that for now.
        assert len(roidb) == 1, "this code requires len(roidb) == 1"
        assert cfg.TRAIN.FG_FRACTION == 0.25
        assert cfg.TRAIN.FG_THRESH == 0.5
        assert cfg.TRAIN.HAS_RPN, "Training this model requires an RPN"
        ret = {}
        blobs = get_minibatch(roidb, len(self.model.detection_branch.classes))

        im_data = blobs['data']
        im_info = blobs['im_info']
        gt_boxes = blobs['gt_boxes']
        gt_ishard = blobs['gt_ishard']
        dontcare_areas = blobs['dontcare_areas']

        # Get cross-entropy and box loss for rpn and faster-rcnn networks
        # Since the RPN is in training mode, this will create rois that are 
        # partly from GT and partly from RPN.
        cls_prob, bbox_pred, rois, features, rpn_ce, rpn_lb, f_ce, f_lb, \
                roi_data = \
                faster_rcnn_forward(
                        self.model.detection_branch, im_data, im_info, gt_boxes,
                        gt_ishard, dontcare_areas)
        ret.update({
                "rpn_ce": rpn_ce,
                "rpn_lb": rpn_lb,
                "f_ce": f_ce,
                "f_lb": f_lb})

        # TODO normally, we will get ROIs from elsewhere. When that happens,
        # move this code.
        # Desire: image, gt boxes w/ class labels, roi boxes with max overlap
        # classes.
        """
        self.visualizer.visualize_samples(
                im_data, 
                roidb[0]["gt_classes"],
                roidb[0]["gt_overlaps"],
                gt_boxes[:, 0:4])
        """

        # Find human boxes that have >= 0.5 overlap with gt
        # RB has elements; want rb[0]['gt_boxes']
        # TODO these person_indexes are empty (?)

        """
        # TODO sad old confused code.
        person_index = self.vcoco_translator.nouns_2_ids["person"]
        elem = roidb[0]
        candidate_persons = np.where(np.logical_and(
                elem["gt_classes"] == person_index,
                elem["gt_overlaps"][:, person_index] > 0.5))
        # TODO b_h is empty...
        # TODO this data that we're feeding is WAY wrong. the filename
        # corresponds to a picture of a surfer, labels show airplanes...
        b_h = elem["boxes"][candidate_persons]
        try:
            # TODO is this causing an error?
            np.random.shuffle(b_h)
        except Exception as e:
            import pdb; pdb.set_trace()
        b_h = b_h[:16] # only choose 16 boxes.
        b_h = np.array([[1., 1., 2., 2.]]) # TODO :/ the candidate person boxes
        # are not found. So I'm hallucinating these values for now :(
        b_h = network.np_to_variable(b_h)
        """

        # roi_data consists of:
        # rois, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights
        person_index = self.vcoco_translator.nouns_2_ids["person"]
        person_roi_indices = roi_data[1] == person_index
        # Choose at most 16 people
        nonz = torch.nonzero(person_roi_indices.data.squeeze().cpu()) \
                .squeeze().numpy()
        if nonz.size != 0:
            choices = np.random.choice(nonz, min(nonz.size, 16), replace=False)
            person_roi_indices = torch.from_numpy(choices).cuda(0)
            b_h = rois[person_roi_indices]

            action_scores, action_locations = self.model.human_centric_branch(
                    b_h, features)

            # Get ground-truth and calculate loss.
            # This is [B=1 x NActions]
            gt_action_scores = self.vcoco_translator.get_action_labels(vcoco_ann)
            gt_action_scores = network.np_to_variable(gt_action_scores)
            gt_action_scores = gt_action_scores.unsqueeze(0).expand_as(
                    action_scores)

            action_ce = F.binary_cross_entropy(action_scores, gt_action_scores)

            ret.update({
                    "action_ce": action_ce,
                })

        # Get ground-truth role locations for non-agent roles.
        # It will be a [B=1 x NActionNonagentRoles x 5] structure.
        # The actions for 2-obj action things are treated uniquely.
        # TODO we probably want to make the GT labels relative to the agent?
        gt_action_locations = \
                self.vcoco_translator.get_action_nonagent_role_locations(
                        vcoco_ann)
        gt_action_locations = gt_action_locations.squeeze(0)
        # (it's an np.ndarray with size [1 x NActionRolesNonagent x 4])
        # Choose the action locations that correspond to a ground-truth action
        chosen_locations = np.where(np.logical_and(
                gt_action_locations[:, 0] == 1,
                np.logical_not(np.isnan(gt_action_locations[:, 1]))))
        assert len(chosen_locations) == 1, "Expected size-1 tuple"
        gt_action_locations = gt_action_locations[chosen_locations[0], 1:]

        if gt_action_locations.size != 0:
            gt_action_locations = network.np_to_variable(
                    gt_action_locations).unsqueeze(0)

            action_locations = action_locations.cpu().data.numpy()
            action_locations = action_locations[:, chosen_locations[0], :]
            action_locations = network.np_to_variable(action_locations)

            # Expand in the batch dimension.
            gt_action_locations = gt_action_locations.expand_as(action_locations)

            # It's possible that there are no actions with localized
            # information; only add the loss when there are.
            if gt_action_locations.dim() != 0:
                location_l1 = F.smooth_l1_loss(
                        action_locations, gt_action_locations)
                ret.update({"location_l1": location_l1})

        # TODO continue here with the editing / debugging.

        # TODO the last part is confusing. I'll take it to mean that b_h and
        # b_o must both be taken from ground-truth labels.
        # (But possibly they mean only the cases where the action has a
        # positive label for those boxes.)
        # Get the GT human box.
        # TODO consider removing this part of the system?
        # TODO this will give another gradient to the human branch?
        # gt_actions will probably be one-hot along each row for those
        # interactions? Or not; we can just expand gt_action_scores from above.
        b_h, b_o, gt_actions = self.vcoco_translator.get_human_object_gt_pairs(
                vcoco_ann)
        if b_h is not None:
            b_h, b_o, gt_actions = map(
                    network.np_to_variable, [b_h, b_o, gt_actions])
            h_action_scores, _ = self.model.human_centric_branch(
                    b_h, features)
            h_action_scores = \
                    self.vcoco_translator.human_scores_to_agentrolenonagent(
                            h_action_scores.cpu().data.numpy())
            h_action_scores = network.np_to_variable(h_action_scores)
            scores = self.model.interaction_branch(h_action_scores, b_o, features)

            interaction_ce = F.binary_cross_entropy(scores, gt_actions)

            ret.update({"interaction_ce": interaction_ce})

        self.log_values(ret)

        loss = sum(ret.values())
        return loss
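
A hypothetical training-step sketch showing how the summed loss returned above would be consumed (the trainer and optimizer names are illustrative, not from the source):

    loss = trainer.forward(roidb, vcoco_ann)  # scalar: sum of the logged losses
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()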
Example #4
    def forward(self, im_data, im_info, gt_regions=None,
                use_beam_search=False, graph_generation=False):

        self.training = False  # this forward path always runs in eval mode
        self.timer.tic()
        features, region_rois = self.rpn(im_data, im_info, gt_regions=gt_regions)

        # At test time, when gt_regions are supplied, use them (with a zero
        # batch-index column prepended) in place of the RPN proposals.
        # (An analogous, currently disabled block existed for gt_objects.)
        if not self.training and gt_regions is not None:
            zeros = np.zeros((gt_regions.shape[0], 1), dtype=gt_regions.dtype)
            region_rois = np.hstack((zeros, gt_regions[:, :4]))
            region_rois = network.np_to_variable(region_rois, is_cuda=True)

        if TIME_IT:
            torch.cuda.synchronize()
            print('\t[RPN]:      %.3fs' % self.timer.toc(average=False))


        self.timer.tic()
        roi_data_region = self.proposal_target_layer(
            region_rois, gt_regions, self.n_classes_obj, self.voc_sign,
            self.training, graph_generation=graph_generation)
        if TIME_IT:
            torch.cuda.synchronize()
            print('\t[Proposal]: %.3fs' % self.timer.toc(average=False))


        self.timer.tic()
        region_rois = roi_data_region[0]

        # roi pool (region branch only; the analogous object- and phrase-
        # branch pooling / fc6 / fc7 pipelines are disabled in this version)

        pooled_region_features = self.roi_pool_region(features, region_rois)
        if TIME_IT:
            torch.cuda.synchronize()
            print('\t\t[region_pooling]: %.3fs' % self.timer.toc(average=False))
        pooled_region_features = pooled_region_features.view(
            pooled_region_features.size()[0], -1)
        if TIME_IT:
            torch.cuda.synchronize()
            print('\t\t[region_feature_view]: %.3fs' % self.timer.toc(average=False))
        pooled_region_features = self.fc6_region(pooled_region_features)
        if TIME_IT:
            torch.cuda.synchronize()
            print('\t\t[region_feature_fc6]: %.3fs' % self.timer.toc(average=False))
        if self.dropout:
            pooled_region_features = F.dropout(
                pooled_region_features, training=self.training)
        pooled_region_features = self.fc7_region(pooled_region_features)
        if TIME_IT:
            torch.cuda.synchronize()
            print('\t\t[region_feature_fc7]: %.3fs' % self.timer.toc(average=False))
        if self.dropout:
            pooled_region_features = F.dropout(
                pooled_region_features, training=self.training)

        # bounding box regression before message passing
        if self.use_region_reg:
            bbox_region = self.bbox_region(F.relu(pooled_region_features))

        if TIME_IT:
            torch.cuda.synchronize()
            print('\t[Pre-MPS]:  %.3fs' % self.timer.toc(average=False))

        self.timer.tic()
        # hierarchical message passing structure
        # (the object/phrase/region message-passing iterations over
        # self.MPS_iter are disabled in this version)
        if TIME_IT:
            torch.cuda.synchronize()
            print('\t[Passing]:  %.3fs' % self.timer.toc(average=False))

        self.timer.tic()

        pooled_region_features = F.relu(pooled_region_features)

        if not self.use_region_reg:
            bbox_region = Variable(
                torch.zeros(pooled_region_features.size(0), 4).cuda())


        cls_objectiveness_region = self.objectiveness(pooled_region_features)

        if TIME_IT:
            torch.cuda.synchronize()
            print('\t[Post-MPS]: %.3fs' % self.timer.toc(average=False))

        # TODO for end-to-end training, restore the training branch here: it
        # builds self.loss_region_box (build_loss_bbox on bbox_region and
        # roi_data_region) and self.objectiveness_loss
        # (build_loss_objectiveness on cls_objectiveness_region), plus the
        # optional caption loss at train time and the beam-search /
        # baseline-search caption decoding at test time.

        return pooled_region_features
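
The TIME_IT instrumentation above assumes a tic/toc-style timer. A minimal sketch compatible with those calls (the repo's own Timer may also track running averages, hence the average flag; this is an assumption, not its actual implementation):

    import time

    class Timer(object):
        def __init__(self):
            self.start_time = 0.0

        def tic(self):
            self.start_time = time.time()

        def toc(self, average=True):
            # 'average' is accepted for interface compatibility; this sketch
            # always returns the elapsed time since the last tic().
            return time.time() - self.start_time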
Example #5
    def forward(self,
                im_data,
                im_info,
                gt_boxes=None,
                gt_ishard=None,
                dontcare_areas=None):
        """

        :param im_data: (1, 600, 800, 3)    numpy
        :param im_info: (1, 3)              numpy
        :param gt_boxes:
        :param gt_ishard:
        :param dontcare_areas:
        :return: feature (1, 512, 37, 50)   tensor
                 roi     (proposals, 5)     tensor
        """

        # im_data becomes a (1, 3, 600, 800) tensor after the permute below
        im_data = network.np_to_variable(im_data, is_cuda=True)
        im_data = im_data.permute(0, 3, 1, 2)
        # (1, 512, 37, 50)
        features = self.features(im_data)
        rpn_conv1 = self.conv1(features)

        # ==========================================================================
        # rpn score: (1, 2*9_anchors, 37, 50)
        rpn_cls_score = self.score_conv(rpn_conv1)
        # reshape to (1, 2, 9*37, 50) and softmax into 2-way (bg/fg) probs
        rpn_cls_score_reshape = self.reshape_layer(rpn_cls_score, 2)
        rpn_cls_prob = F.softmax(rpn_cls_score_reshape, dim=1)

        # (1, 2*9, 37, 50)
        rpn_cls_prob_reshape = self.reshape_layer(
            rpn_cls_prob,
            len(self.anchor_scales) * 3 * 2)

        # =========================================================================
        # rpn boxes (1, 4*9_anchors, 37, 50)
        rpn_bbox_pred = self.bbox_conv(rpn_conv1)

        # =============================================================================
        # proposal layer
        # (proposals, 5)
        cfg_key = 'TRAIN' if self.training else 'TEST'
        rois = self.proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred,
                                   im_info, cfg_key, self._feat_stride,
                                   self.anchor_scales)

        # generate training labels and build the RPN loss
        if self.training:
            assert gt_boxes is not None
            # 1. Calc each box's label (bg/fg)
            # 2. Calc each box's distance to gt box.
            rpn_data = self.anchor_target_layer(rpn_cls_score, gt_boxes,
                                                gt_ishard, dontcare_areas,
                                                im_info, self._feat_stride,
                                                self.anchor_scales)
            self.cross_entropy, self.loss_box = self.build_loss(
                rpn_cls_score_reshape, rpn_bbox_pred, rpn_data)

        return features, rois
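
A hypothetical inference call matching the docstring shapes (rpn stands for an instance of this module; the zero image is a stand-in for real input):

    import numpy as np

    im_data = np.zeros((1, 600, 800, 3), dtype=np.float32)     # NHWC image
    im_info = np.array([[600., 800., 1.0]], dtype=np.float32)  # h, w, scale
    rpn.eval()  # self.training False -> the 'TEST' proposal config is used
    features, rois = rpn(im_data, im_info)
    # features: (1, 512, 37, 50) conv map; rois: (proposals, 5) boxes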