def test_clip_area_0_degree(self):
        for _ in range(50):
            num_boxes = 100
            boxes_5d = torch.zeros(num_boxes, 5)
            boxes_5d[:, 0] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
            boxes_5d[:, 1] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
            boxes_5d[:, 2] = torch.FloatTensor(num_boxes).uniform_(0, 500)
            boxes_5d[:, 3] = torch.FloatTensor(num_boxes).uniform_(0, 500)
            # Convert from (x_ctr, y_ctr, w, h, 0) to (x1, y1, x2, y2)
            boxes_4d = torch.zeros(num_boxes, 4)
            boxes_4d[:, 0] = boxes_5d[:, 0] - boxes_5d[:, 2] / 2.0
            boxes_4d[:, 1] = boxes_5d[:, 1] - boxes_5d[:, 3] / 2.0
            boxes_4d[:, 2] = boxes_5d[:, 0] + boxes_5d[:, 2] / 2.0
            boxes_4d[:, 3] = boxes_5d[:, 1] + boxes_5d[:, 3] / 2.0

            image_size = (500, 600)
            test_boxes_4d = Boxes(boxes_4d)
            test_boxes_5d = RotatedBoxes(boxes_5d)
            # Before clip
            areas_4d = test_boxes_4d.area()
            areas_5d = test_boxes_5d.area()
            self.assertTrue(torch.allclose(areas_4d, areas_5d, atol=1e-1, rtol=1e-5))
            # After clip
            test_boxes_4d.clip(image_size)
            test_boxes_5d.clip(image_size)
            areas_4d = test_boxes_4d.area()
            areas_5d = test_boxes_5d.area()
            self.assertTrue(torch.allclose(areas_4d, areas_5d, atol=1e-1, rtol=1e-5))
def generate_proposals(images, model, score_threshold=0):
    # Assumes a Detectron2 GeneralizedRCNN-style model and that torch and
    # detectron2.structures.Boxes / Instances are imported at module level.
    # Each image is an HWC numpy array in the channel order the model expects.
    inputs = [
        {
            "image": torch.as_tensor(image.astype("float32").transpose(2, 0, 1)),
            "height": image.shape[0],
            "width": image.shape[1],
        }
        for image in images
    ]

    with torch.no_grad():
        images = model.preprocess_image(inputs)
        features = model.backbone(images.tensor)
        proposals, _ = model.proposal_generator(images, features, None)

        features_ = [features[f] for f in model.roi_heads.box_in_features]
        box_features = model.roi_heads.box_pooler(
            features_, [x.proposal_boxes for x in proposals])
        box_features = model.roi_heads.box_head(box_features)
        proposals_scores, proposals_deltas = model.roi_heads.box_predictor(
            box_features)

        boxes_tensors = model.roi_heads.box_predictor.predict_boxes(
            (proposals_scores, proposals_deltas), proposals)
        scores = model.roi_heads.box_predictor.predict_probs(
            (proposals_scores, proposals_deltas), proposals)

        result = []
        for i in range(len(inputs)):
            image_size = proposals[i].image_size
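            # predict_boxes returns one box per (proposal, class) pair; clip them to
            # the image, then reshape to (num_proposals, num_bbox_reg_classes, 4).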
            num_bbox_reg_classes = boxes_tensors[i].shape[1] // 4
            boxes = Boxes(boxes_tensors[i].reshape(-1, 4))
            boxes.clip(image_size)
            boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)

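            # predict_probs includes the background class as the last column;
            # drop it before taking the per-box maximum score.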
            img_scores = scores[i][:, :-1]
            max_scores, pred_classes = torch.max(img_scores, dim=1)

            keep_mask = max_scores > score_threshold
            filtered_scores = img_scores[keep_mask, :]
            filtered_max_scores = max_scores[keep_mask]
            filtered_pred_classes = pred_classes[keep_mask]
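            # For each surviving proposal, keep the regressed box of its predicted class.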
            boxes = boxes[keep_mask, filtered_pred_classes, :]

            result_instance = Instances(image_size)
            result_instance.pred_boxes = Boxes(boxes)
            result_instance.scores = filtered_max_scores
            result_instance.pred_classes = filtered_pred_classes
            result_instance.class_distributions = filtered_scores
            result.append(result_instance)

    return result
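
# Usage sketch for generate_proposals (illustrative only): it assumes a standard
# Detectron2 GeneralizedRCNN from the model zoo; the config name, weights, input
# path, and threshold below are example values, not part of the original code.
if __name__ == "__main__":
    import cv2
    from detectron2 import model_zoo
    from detectron2.checkpoint import DetectionCheckpointer
    from detectron2.config import get_cfg
    from detectron2.modeling import build_model

    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")
    model = build_model(cfg)
    DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
    model.eval()

    image = cv2.imread("input.jpg")  # HWC, BGR uint8, as expected by the default cfg
    instances = generate_proposals([image], model, score_threshold=0.05)
    print(f"kept {len(instances[0])} boxes for the first image")
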
    def forward(self, features, all_phrase_ids, targets, precomp_boxes, precomp_score,
                precomp_det_label, image_scale, all_sent_sgs, all_sentences, image_unique_id, det_label_embedding):

        """
        :param obj_proposals: proposal from each images
        :param features: features maps from the backbone
        :param target: gt relation labels
        :param object_vocab, object_vocab_len [[xxx,xxx],[xxx],[xxx]], [2,1,1]
        :param sent_sg: sentence scene graph
        :return: prediction, loss

        note that first dimension is images
        """
        img_num_per_gpu = len(features)

        batch_decode_logits = []
        batch_topk_decoder_logits = []
        batch_pred_similarity = []
        batch_precomp_boxes = []
        batch_topk_precomp_boxes = []
        batch_pred_boxes = []
        batch_topk_pred_boxes = []
        batch_topk_fusion_pred_boxes = []
        batch_topk_pred_similarity = []
        batch_topk_fusion_similarity = []
        batch_boxes_targets = []
        batch_ctx_embed = []
        batch_ctx_s1_embed = []

        batch_pred_targets = []
        batch_topk_pred_targets = []


        """ Language Embedding"""
        batch_phrase_ids, batch_phrase_types, batch_phrase_embed, batch_phrase_len, \
        batch_phrase_dec_ids, batch_phrase_mask, batch_decoder_word_embed, batch_phrase_glove_embed, batch_rel_phrase_embed, batch_relation_conn, batch_sent_embed,\
        batch_decoder_rel_word_embed, batch_rel_mask, batch_rel_dec_idx = self.phrase_embed(all_sentences, all_phrase_ids, all_sent_sgs)

        h, w = features.shape[-2:]

        # self.storage = get_event_storage()


        for bid in range(img_num_per_gpu):

            """ Visual Embedding """
            precomp_boxes_bid = precomp_boxes[bid].to(self.device)  ## 100*4

            order = []
            for phr_ids in batch_phrase_ids[bid]:
                order.append(all_phrase_ids[bid].index(phr_ids))
            target_filter = targets[bid][np.array(order)]
            batch_boxes_targets.append(target_filter.to(self.device))
            batch_precomp_boxes.append(precomp_boxes_bid)

            img_feat_bid = features[[bid]]
            visual_features_bid = self.rcnn_top(self.det_roi_pooler([img_feat_bid], [precomp_boxes_bid])).mean(dim=[2, 3]).contiguous()
            if cfg.MODEL.VG.SPATIAL_FEAT:
                spa_feat = meshgrid_generation(h, w)
                spa_feat = self.det_roi_pooler([spa_feat], [precomp_boxes_bid]).view(visual_features_bid.shape[0], -1)
                spa_feat = self.spatial_trans(spa_feat)
                visual_features_bid = torch.cat((visual_features_bid, spa_feat), dim=1)

            visual_features_bid = self.visual_embedding(visual_features_bid)
            visual_features_bid = self.vis_batchnorm(visual_features_bid)

            """ Noun Phrase embedding """
            phrase_embed_bid = batch_phrase_embed[bid]
            if phrase_embed_bid.shape[0] == 1 and self.training:
                phrase_embed_bid = self.phr_batchnorm(phrase_embed_bid.repeat(2,1))[[0]]
            else:
                phrase_embed_bid = self.phr_batchnorm(phrase_embed_bid)


            """ Similarity and attention prediction """
            num_box = precomp_boxes_bid.tensor.size(0)
            num_phrase = phrase_embed_bid.size(0)
            phr_inds, obj_inds = self.make_pair(num_phrase, num_box)
            pred_similarity_bid, pred_targets_bid = self.similarity(visual_features_bid, phrase_embed_bid, obj_inds, phr_inds)
            pred_similarity_bid = pred_similarity_bid.reshape(num_phrase, num_box)
            pred_targets_bid = pred_targets_bid.reshape(num_phrase, num_box, 4)
            batch_pred_targets.append(pred_targets_bid)


            if cfg.MODEL.VG.USING_DET_KNOWLEDGE:
                det_label_embedding_bid = det_label_embedding[bid].to(self.device)
                sim = self.cal_det_label_sim_max(det_label_embedding_bid, batch_phrase_glove_embed[bid])
                pred_similarity_bid = pred_similarity_bid * sim
                sim_mask = (sim > 0).float()
                atten_bid = numerical_stability_masked_softmax(pred_similarity_bid, sim_mask, dim=1)
            else:
                atten_bid = F.softmax(pred_similarity_bid, dim=1)

            ## reconstruction visual features
            visual_reconst_bid = torch.mm(atten_bid, visual_features_bid)
            decode_phr_logits = self.phrase_decoder(visual_reconst_bid, batch_decoder_word_embed[bid])
            batch_decode_logits.append(decode_phr_logits)

            atten_score_topk, atten_ranking_topk = torch.topk(atten_bid, dim=1, k=self.s2_topk) ## (N, 10)
            ind_phr_topk = np.arange(num_phrase).repeat(self.s2_topk)


            ## -----------------------------------------------------##
            ## crop 2nd-stage (top-k) features
            ## -----------------------------------------------------##

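            # Before REG_START_ITER the top-k proposals are used as-is; afterwards
            # they are refined with the stage-1 regression deltas before re-pooling.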
            if self.storage.iter <= cfg.SOLVER.REG_START_ITER:
                visual_features_topk_bid = visual_features_bid[atten_ranking_topk.reshape(-1)]
                precomp_boxes_topk_bid = precomp_boxes_bid[atten_ranking_topk.reshape(-1)]
                batch_topk_precomp_boxes.append(precomp_boxes_topk_bid)
            else:
                topk_box_ids = atten_ranking_topk.reshape(-1) + torch.as_tensor(ind_phr_topk, dtype=torch.long).to(self.device)*num_box
                precomp_boxes_tensor, box_size = precomp_boxes_bid.tensor, precomp_boxes_bid.size
                precomp_boxes_topk_tensor = precomp_boxes_tensor[atten_ranking_topk.reshape(-1)]  ## (N*10, 4)
                pred_targets_s0 = pred_targets_bid.view(-1, 4)[topk_box_ids]
                precomp_boxes_topk_bid = self.box2box_translation.apply_deltas(pred_targets_s0, precomp_boxes_topk_tensor)
                precomp_boxes_topk_bid = Boxes(precomp_boxes_topk_bid, box_size)
                precomp_boxes_topk_bid.clip()
                batch_topk_precomp_boxes.append(precomp_boxes_topk_bid)
                visual_features_topk_bid = self.rcnn_top(self.det_roi_pooler([img_feat_bid], [precomp_boxes_topk_bid])).mean(dim=[2, 3]).contiguous()

                if cfg.MODEL.VG.SPATIAL_FEAT:
                    spa_feat = meshgrid_generation(h, w)
                    spa_feat = self.det_roi_pooler([spa_feat], [precomp_boxes_topk_bid]).view(visual_features_topk_bid.shape[0], -1)
                    spa_feat = self.spatial_trans(spa_feat)
                    visual_features_topk_bid = torch.cat((visual_features_topk_bid, spa_feat), dim=1)

                visual_features_topk_bid = self.visual_embedding(visual_features_topk_bid)  ## (N*10, 1024)
                visual_features_topk_bid = self.vis_batchnorm(visual_features_topk_bid)


            pred_similarity_topk_bid, pred_targets_topk_bid = self.similarity_topk(visual_features_topk_bid, phrase_embed_bid, ind_phr_topk)
            pred_similarity_topk_bid = pred_similarity_topk_bid.reshape(num_phrase, self.s2_topk)
            pred_targets_topk_bid = pred_targets_topk_bid.reshape(num_phrase, self.s2_topk, 4)
            batch_topk_pred_targets.append(pred_targets_topk_bid)


            if cfg.MODEL.VG.USING_DET_KNOWLEDGE:
                sim_topk = torch.gather(sim, dim=1, index=atten_ranking_topk.long())
                sim_mask = (sim_topk>0).float()
                pred_similarity_topk_bid = pred_similarity_topk_bid * sim_topk
                atten_topk_bid = numerical_stability_masked_softmax(pred_similarity_topk_bid, sim_mask, dim=1)
            else:
                atten_topk_bid = F.softmax(pred_similarity_topk_bid, dim=1)

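            # Fuse stage-1 and stage-2 attention, then use the fused weights to pool
            # the top-k visual features for phrase reconstruction.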
            atten_fusion = atten_topk_bid * atten_score_topk  ## N*10
            visual_features_topk_bid = visual_features_topk_bid.view(num_phrase, self.s2_topk, -1)
            visual_reconst_topk_bid = (atten_fusion.unsqueeze(2)*visual_features_topk_bid).sum(1) ## N*1024
            decoder_phr_topk_logits = self.phrase_decoder(visual_reconst_topk_bid, batch_decoder_word_embed[bid])
            batch_topk_decoder_logits.append(decoder_phr_topk_logits)


            ## construct the discriminative loss
            batch_ctx_s1_embed.append(self.visual_mlp(visual_reconst_bid.mean(0, keepdim=True)))
            batch_ctx_embed.append(self.visual_mlp(visual_reconst_topk_bid.mean(0, keepdim=True)))


            batch_pred_similarity.append(atten_bid)
            batch_topk_pred_similarity.append(atten_topk_bid)
            batch_topk_fusion_similarity.append(atten_fusion)

            ### transform boxes for stage-1
            num_phrase_indices = torch.arange(num_phrase).long().to(self.device)
            max_box_ind = atten_bid.detach().cpu().numpy().argmax(1)
            precomp_boxes_delta_max = pred_targets_bid[num_phrase_indices, max_box_ind] ## numPhrase*4

            max_topk_id = torch.topk(atten_topk_bid, dim=1, k=1)[1].long().squeeze(1)
            precomp_boxes_delta_max_topk = pred_targets_topk_bid[num_phrase_indices, max_topk_id]  ## num_phrase*4
            precomp_boxes_topk_bid_tensor = precomp_boxes_topk_bid.tensor.reshape(-1, self.s2_topk, 4)

            max_fusion_topk_id = torch.topk(atten_fusion, dim=1, k=1)[1].long().squeeze()
            precomp_boxes_delta_max_topk_fusion = pred_targets_topk_bid[num_phrase_indices, max_fusion_topk_id]  ## num_phrase*4

            phr_index = torch.arange(num_phrase).to(self.device) * self.s2_topk

            if self.storage.iter <= cfg.SOLVER.REG_START_ITER:
                max_select_boxes = precomp_boxes_bid[max_box_ind]
                max_precomp_boxes = precomp_boxes_topk_bid[max_topk_id + phr_index]
                max_fusion_precomp_boxes = precomp_boxes_topk_bid[max_fusion_topk_id + phr_index]
            else:
                max_select_boxes = Boxes(self.box2box_translation.apply_deltas(precomp_boxes_delta_max, precomp_boxes_bid[max_box_ind].tensor), precomp_boxes_bid.size)
                max_precomp_boxes = Boxes(self.box2box_translation.apply_deltas(precomp_boxes_delta_max_topk, precomp_boxes_topk_bid_tensor[num_phrase_indices, max_topk_id]), precomp_boxes_bid.size)
                max_fusion_precomp_boxes = Boxes(self.box2box_translation.apply_deltas(precomp_boxes_delta_max_topk_fusion, precomp_boxes_topk_bid_tensor[num_phrase_indices, max_fusion_topk_id]), precomp_boxes_bid.size)

            batch_pred_boxes.append(max_select_boxes)
            batch_topk_pred_boxes.append(max_precomp_boxes)
            batch_topk_fusion_pred_boxes.append(max_fusion_precomp_boxes)


        batch_ctx_sim, batch_ctx_sim_s1 = self.generate_image_sent_discriminative(batch_sent_embed, batch_ctx_embed, batch_ctx_s1_embed)

        noun_reconst_loss, noun_topk_reconst_loss, disc_img_sent_loss_s1, disc_img_sent_loss_s2,  reg_loss, \
        reg_loss_s1 = self.VGLoss(batch_phrase_mask, batch_decode_logits, batch_topk_decoder_logits, batch_phrase_dec_ids,
                                  batch_ctx_sim, batch_ctx_sim_s1, batch_pred_similarity, batch_topk_pred_similarity, batch_boxes_targets, batch_precomp_boxes,
                                  batch_pred_targets, batch_topk_pred_targets,
                                  batch_topk_precomp_boxes)

        all_loss = dict(noun_reconst_loss=noun_reconst_loss, noun_topk_reconst_loss=noun_topk_reconst_loss, disc_img_sent_loss_s1=disc_img_sent_loss_s1,
                        disc_img_sent_loss_s2=disc_img_sent_loss_s2, reg_loss_s1=reg_loss, reg_loss_s2=reg_loss_s1)


        if self.training:
            return all_loss, None
        else:
            return all_loss, (batch_phrase_ids, batch_phrase_types, move2cpu(batch_pred_boxes), move2cpu(batch_pred_similarity),
                              move2cpu(batch_boxes_targets), move2cpu(batch_precomp_boxes), image_unique_id, move2cpu(batch_topk_pred_similarity),
                              move2cpu(batch_topk_fusion_similarity), move2cpu(batch_topk_pred_boxes), move2cpu(batch_topk_fusion_pred_boxes),
                              move2cpu(batch_topk_precomp_boxes), move2cpu(batch_topk_pred_targets), move2cpu(batch_pred_targets))