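# The methods below are excerpted from larger classes (a Refexp evaluation
# class and a data-provider class). Inferred from usage, they assume these
# module-level imports in the surrounding files:
#
#   import json
#   import os
#   import random
#   import numpy as np
#   import matplotlib.pyplot as plt
#   from scipy import misc
#
# plus toolbox helpers: `cu` (common utils providing iou_bboxes and
# draw_bbox), a module-level iou_bboxes, get_encoded_line, and the
# EOS_IDENTIFIER / UNK_IDENTIFIER vocabulary sentinels.
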
    def visualize_top_predicted_bbox(self, pred_sample, coco_image_dir):
        """Visualize the top predicted bounding box."""
        assert 'annotation_id' in pred_sample, 'Object annotation id missing!'
        assert 'predicted_bounding_boxes' in pred_sample, \
               'list of predicted bounding boxes missing!'
        if not pred_sample['predicted_bounding_boxes']:
            print('Empty predicted bounding boxes.')
            return

        bbox_pred_top = pred_sample['predicted_bounding_boxes'][0]
        ann_id = pred_sample['annotation_id']
        ann = self.refexp_dataset.loadAnns(ids=[ann_id])[0]
        image_id = ann['image_id']
        img_coco = self.refexp_dataset.loadImgs(ids=[image_id])[0]
        iou = cu.iou_bboxes(bbox_pred_top, ann['bbox'])

        if 'refexp' in pred_sample or 'refexp_id' in pred_sample:
            print('The referring expression input to the model is:')
            if 'refexp' in pred_sample:
                print('  ' + pred_sample['refexp'])
            else:
                refexp_tmp = self.refexp_dataset.loadRefexps(
                    ids=pred_sample['refexp_id'])[0]
                print('  ' + refexp_tmp['raw'])

        # Load the image and draw the ground-truth box (green) and the top
        # predicted box (red). Note: scipy.misc.imread was removed in newer
        # SciPy releases; imageio.imread is a drop-in replacement there.
        I = misc.imread(os.path.join(coco_image_dir, img_coco['file_name']))
        plt.imshow(I)
        plt.axis('off')
        plt.title('IoU: %.3f, green bbox: GT, red bbox: predicted' % iou)
        cu.draw_bbox(plt.gca(), ann['bbox'], edge_color='green')
        cu.draw_bbox(plt.gca(), bbox_pred_top, edge_color='red')
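
    # For reference, a minimal self-contained sketch of the IoU computation
    # that cu.iou_bboxes performs on the [x, y, w, h] boxes used above. The
    # toolbox's own implementation may differ in edge-case handling; this
    # helper is illustrative only and not part of the original code.
    def _iou_sketch(self, bbox_a, bbox_b):
        """Intersection-over-union of two [x, y, w, h] boxes."""
        ax1, ay1 = bbox_a[0], bbox_a[1]
        ax2, ay2 = bbox_a[0] + bbox_a[2], bbox_a[1] + bbox_a[3]
        bx1, by1 = bbox_b[0], bbox_b[1]
        bx2, by2 = bbox_b[0] + bbox_b[2], bbox_b[1] + bbox_b[3]
        # Overlap extents clamp to zero when the boxes do not intersect.
        inter_w = max(0.0, min(ax2, bx2) - max(ax1, bx1))
        inter_h = max(0.0, min(ay2, by2) - max(ay1, by1))
        inter = inter_w * inter_h
        union = bbox_a[2] * bbox_a[3] + bbox_b[2] * bbox_b[3] - inter
        return inter / union if union > 0 else 0.0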
    def get_streams(self):
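        """Assemble one training sample for the current (image, object,
        expression) triple: encoded word streams plus fc7 and normalized
        bounding-box features for the full image, the target object, sampled
        context regions, and sampled negative regions (summary inferred from
        the code below)."""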
        ((image_filename, image_id), object_id_list,
         line) = self.image_refexp_pairs[self.index]
        if image_id in self.dataset.imgs_with_errors:
            line = EOS_IDENTIFIER

        stream = get_encoded_line(line, self.vocabulary)
        # Assumes stream has EOS word at the end
        assert (stream[-1] == self.vocabulary[EOS_IDENTIFIER])
        stream = stream[:-1]
        filtered_stream = []
        for word in stream:
            if word != self.vocabulary[UNK_IDENTIFIER]:
                filtered_stream.append(word)
        stream = filtered_stream
        if self.truncate and len(stream) >= self.max_words:
            stream = stream[:self.max_words - 1]
            self.num_truncates += 1

        object_id = object_id_list[1]
        object_ann = self.dataset.loadAnns(object_id)[0]
        object_category = self.dataset.loadCats(
            object_ann['category_id'])[0]['name']
        object_bbox = self.dataset.loadAnns(object_id)[0]['bbox']
        context_anns_of_same_category = []
        context_anns_of_diff_category = []
        if hasattr(self.dataset, 'coco'):
            all_anns = self.dataset.coco.imgToAnns[image_id]
        else:
            all_anns = self.dataset.imgToAnns[image_id]
        for ann in all_anns:
            if ann['id'] != object_id:
                if ann['category_id'] == object_ann['category_id']:
                    context_anns_of_same_category.append(ann)
                else:
                    context_anns_of_diff_category.append(ann)

        neg_anns_of_same_category = []
        neg_anns_of_diff_category = []
        if self.neg_proposal_source != 'gt':
            image_info = self.dataset.loadImgs(image_id)[0]
            all_anns = image_info['region_candidates']
            for ann in all_anns:
                ann['bbox'] = ann['bounding_box']
                ann_box = ann['bbox']
                iou = iou_bboxes(ann_box, object_bbox)
                if (iou < 0.5
                        and ann['predicted_object_name'] == object_category):
                    neg_anns_of_same_category.append(ann)
                elif ann['predicted_object_name'] != object_category:
                    neg_anns_of_diff_category.append(ann)
        else:
            neg_anns_of_same_category = context_anns_of_same_category
            neg_anns_of_diff_category = context_anns_of_diff_category

        # Subtract one because the image itself is reserved as one context region
        if len(context_anns_of_same_category) > self.max_num_context - 1:
            rand_sample = sorted(
                random.sample(range(len(context_anns_of_same_category)),
                              self.max_num_context - 1))
            context_anns_of_same_category = [
                context_anns_of_same_category[idx] for idx in rand_sample
            ]
        elif len(context_anns_of_same_category) < self.max_num_context - 1:
            rand_sample = sorted(
                random.sample(
                    range(len(context_anns_of_diff_category)),
                    min(
                        self.max_num_context - 1 -
                        len(context_anns_of_same_category),
                        len(context_anns_of_diff_category))))
            context_anns_of_same_category += [
                context_anns_of_diff_category[idx] for idx in rand_sample
            ]

        if len(neg_anns_of_same_category) > self.max_num_negatives:
            rand_sample = sorted(
                random.sample(range(len(neg_anns_of_same_category)),
                              self.max_num_negatives))
            neg_anns_of_same_category = [
                neg_anns_of_same_category[idx] for idx in rand_sample
            ]
        elif len(neg_anns_of_same_category) < self.max_num_negatives:
            rand_sample = sorted(
                random.sample(
                    range(len(neg_anns_of_diff_category)),
                    min(
                        self.max_num_negatives -
                        len(neg_anns_of_same_category),
                        len(neg_anns_of_diff_category))))
            neg_anns_of_same_category += [
                neg_anns_of_diff_category[idx] for idx in rand_sample
            ]

            # If we are running short of proposal negatives, sample from gt negatives
            if (len(neg_anns_of_same_category) < self.max_num_negatives
                    and self.neg_proposal_source != 'gt'):
                rand_sample = sorted(
                    random.sample(
                        range(len(context_anns_of_diff_category)),
                        min(
                            self.max_num_negatives -
                            len(neg_anns_of_same_category),
                            len(context_anns_of_diff_category))))
                neg_anns_of_same_category += [
                    context_anns_of_diff_category[idx] for idx in rand_sample
                ]

        pad = self.max_words - (len(stream) + 1) if self.pad else 0
        if pad > 0:
            self.num_pads += 1

        out = {}
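        # Stream layout, illustrated for stream = [w1, w2] with pad = 2
        # (so max_words = 5); -1 presumably marks positions the loss ignores:
        #   timestep_input  = [EOS, w1, w2, -1, -1]  (decoder input)
        #   timestep_cont   = [0,   1,  1,  0,  0]   (continuation markers)
        #   timestep_target = [w1,  w2, EOS, -1, -1] (prediction targets)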
        timestep_input = np.asarray(
            [[self.vocabulary[EOS_IDENTIFIER]] + stream + [-1] * pad],
            np.float16)
        out['timestep_input'] = np.tile(timestep_input.T,
                                        (1, self.max_num_context))
        timestep_cont = np.asarray([[0] + [1] * len(stream) + [0] * pad],
                                   np.float16)
        out['timestep_cont'] = np.tile(timestep_cont.T,
                                       (1, self.max_num_context))
        timestep_target = np.asarray(
            stream + [self.vocabulary[EOS_IDENTIFIER]] + [-1] * pad,
            np.float16)
        out['timestep_target'] = timestep_target
        self.swap_axis_streams.add('timestep_input')
        self.swap_axis_streams.add('timestep_target')
        self.swap_axis_streams.add('timestep_cont')

        # Write image features to batch
        img_info = self.dataset.loadImgs(image_id)[0]
        img_wd = float(img_info['width'])
        img_ht = float(img_info['height'])
        assert (len(object_id_list) <= 2)
        fc7_img = self.dataset.image_features[str(
            (image_id, [0, 0, int(img_wd - 1),
                        int(img_ht - 1)]))][0]
        out['fc7_img'] = np.tile(fc7_img, (self.max_num_context, 1))
        img_bbox_features = np.zeros((self.max_num_context, 5), np.float16)
        img_bbox_features[:] = [0, 0, 1, 1, 1]
        out['img_bbox_features'] = img_bbox_features

        # Write object region features to batch; object_bbox was already
        # loaded above, so there is no need to re-query the dataset.
        fc7_obj = self.dataset.image_features[str((image_id, object_bbox))][0]
        out['fc7_obj'] = np.tile(fc7_obj, (self.max_num_context, 1))

        bbox_area_ratio = (object_bbox[2] * object_bbox[3]) / (img_wd * img_ht)
        bbox_x1y1x2y2 = [
            object_bbox[0] / img_wd, object_bbox[1] / img_ht,
            (object_bbox[0] + object_bbox[2]) / img_wd,
            (object_bbox[1] + object_bbox[3]) / img_ht
        ]
        bbox_features = bbox_x1y1x2y2 + [bbox_area_ratio]
        out['bbox_features'] = np.tile(bbox_features,
                                       (self.max_num_context, 1))

        # Write context features to batch
        context_fc7 = np.tile(fc7_img, (self.max_num_context, 1))
        context_bbox_features = np.zeros((self.max_num_context, 5), np.float16)
        context_bbox_features[:] = [0, 0, 1, 1, 1]
        if len(context_anns_of_same_category) > 0:
            other_bboxes = [
                ann['bbox'] for ann in context_anns_of_same_category
            ]
            for idx, other_bbox in enumerate(other_bboxes):
                other_bbox_area_ratio = (other_bbox[2] *
                                         other_bbox[3]) / (img_wd * img_ht)
                other_bbox_x1y1x2y2 = [
                    other_bbox[0] / img_wd, other_bbox[1] / img_ht,
                    (other_bbox[0] + other_bbox[2]) / img_wd,
                    (other_bbox[1] + other_bbox[3]) / img_ht
                ]
                other_bbox_features = other_bbox_x1y1x2y2 + [
                    other_bbox_area_ratio
                ]
                context_fc7[idx, :] = self.dataset.image_features[str(
                    (image_id, other_bbox))][0]
                context_bbox_features[idx, :] = other_bbox_features
        out['context_fc7'] = context_fc7
        out['context_bbox_features'] = context_bbox_features

        # Write negative features to batch
        negative_fc7 = np.zeros(
            (self.max_num_negatives, self.dataset.image_feature_length),
            np.float16)
        negative_bbox_features = np.zeros((self.max_num_negatives, 5),
                                          np.float16)
        if len(neg_anns_of_same_category) > 0:
            other_bboxes = [ann['bbox'] for ann in neg_anns_of_same_category]
            for idx, other_bbox in enumerate(other_bboxes):
                other_bbox_area_ratio = (other_bbox[2] *
                                         other_bbox[3]) / (img_wd * img_ht)
                other_bbox_x1y1x2y2 = [
                    other_bbox[0] / img_wd, other_bbox[1] / img_ht,
                    (other_bbox[0] + other_bbox[2]) / img_wd,
                    (other_bbox[1] + other_bbox[3]) / img_ht
                ]
                other_bbox_features = other_bbox_x1y1x2y2 + [
                    other_bbox_area_ratio
                ]
                negative_fc7[idx, :] = self.dataset.image_features[str(
                    (image_id, other_bbox))][0]
                negative_bbox_features[idx, :] = other_bbox_features
        out['negative_fc7'] = negative_fc7
        out['negative_bbox_features'] = negative_bbox_features

        pairwise_similarity = np.asarray([[0] * self.max_num_negatives],
                                         np.float16)
        out['pairwise_similarity'] = np.tile(pairwise_similarity,
                                             (self.max_words, 1))
        self.swap_axis_streams.add('pairwise_similarity')

        self.num_outs += 1
        self.next_line()
        return out
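
    # The 5-d spatial encoding above is computed inline three times (object,
    # context, and negative regions). A minimal stand-alone sketch of that
    # encoding for clarity; the helper name is illustrative and not part of
    # the original code.
    def _bbox_features_sketch(self, bbox, img_wd, img_ht):
        """Return [x1/W, y1/H, x2/W, y2/H, area_ratio] for an [x, y, w, h]
        box in an img_wd x img_ht image."""
        x, y, w, h = bbox
        return [x / img_wd, y / img_ht,
                (x + w) / img_wd, (y + h) / img_ht,
                (w * h) / (img_wd * img_ht)]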
    def evaluate(self,
                 pred_results_path,
                 thresh_iou=0.5,
                 thresh_k=1,
                 flag_ignore_non_existed_object=False,
                 flag_ignore_non_existed_gt_refexp=False,
                 flag_missing_objects_verbose=False,
                 flag_missing_refexps_verbose=False):
        """Evaluate the predicted results for the comprehension task.
    
    Args:
      pred_results_path: path for the predicted results with the format
          described in ./cache_evaluation/format_comprehension_eval.md
      thresh_iou: threshold of the IoU ratio of the evaluation
      thresh_k: precision@k
      flag_ignore_non_existed_object: if set True, the evaluation process
          continues with an warning when encountered non existed objects in 
          self.refexp_dataset. Otherwise stops.
      flag_ignore_non_existed_gt_refexp: if set True, the evaluation process  
          continues when encountered non existed GT referring expressions.
          Otherwise stops.
      flag_missing_objects_verbose: if set true, will list the ids of all the 
          missing objects in self.refexp_dataset
      flag_missing_refexps_verbose: if set true, will list the ids of all the 
          missing referring expressions in self.refexp_dataset
          
    Returns:
      A two element tuple. The first element is precision@k. The second
      element is the predicted results (a dictionary) with an added field
      'best_iou' of the best iou for the top k bounding boxes.
    """
        # Load predicted results
        self.reset_eval_state()
        print('Loading predicted result file for the comprehension task.')
        with open(pred_results_path) as fin:
            self.pred_results = json.load(fin)

        # evaluation
        pred_ann_ids_set = set()
        pred_refexp_ids_set = set()
        score = 0.0
        num_valid_pred = 0
        for pred_elem in self.pred_results:
            # validate the predicted results
            assert 'annotation_id' in pred_elem, 'Object annotation id missing!'
            assert 'predicted_bounding_boxes' in pred_elem, \
                   'list of predicted bounding boxes missing!'
            ann_id = pred_elem['annotation_id']
            gt_bbox = self._get_GT_bbox_with_annotation_id(ann_id)
            if gt_bbox is None:
                if flag_ignore_non_existed_object:
                    print(
                        'Ignoring COCO annotation id %d, which does not exist '
                        'in the Refexp dataset file for evaluation' % ann_id)
                    pred_elem['best_iou'] = 0.0
                    continue
                else:
                    raise ValueError(
                        'COCO annotation id %d does not exist in the Refexp '
                        'dataset file for evaluation!' % ann_id)
            if ('refexp_id' in pred_elem and
                    pred_elem['refexp_id'] not in self.gt_refexp_ids_set):
                if flag_ignore_non_existed_gt_refexp:
                    print(
                        'Ignoring refexp id %d, which does not exist in the '
                        'Refexp dataset file for evaluation' %
                        pred_elem['refexp_id'])
                    pred_elem['best_iou'] = 0.0
                    continue
                else:
                    raise ValueError(
                        'Refexp id %d does not exist in the Refexp dataset '
                        'file for evaluation!' % pred_elem['refexp_id'])
            pred_ann_ids_set.add(ann_id)
            if 'refexp_id' in pred_elem:
                pred_refexp_ids_set.add(pred_elem['refexp_id'])
            num_valid_pred += 1

            # check whether it is a correct prediction
            pred_bboxes = pred_elem['predicted_bounding_boxes']
            best_iou = 0.0
            for k in range(min(thresh_k, len(pred_bboxes))):
                iou = cu.iou_bboxes(pred_bboxes[k], gt_bbox)
                best_iou = max(best_iou, iou)
            if best_iou >= thresh_iou:
                score += 1.0
            pred_elem['best_iou'] = best_iou
        # Guard against division by zero when no prediction was valid.
        if num_valid_pred > 0:
            score /= num_valid_pred

        # warning for missing objects and refexps
        gt_ann_ids_left_set = self.gt_ann_ids_set - pred_ann_ids_set
        gt_refexp_ids_left_set = self.gt_refexp_ids_set - pred_refexp_ids_set
        if gt_ann_ids_left_set:
            print(
                '%d objects from the refexp dataset file are missing in the '
                'predicted file' % len(gt_ann_ids_left_set))
            if flag_missing_objects_verbose:
                print('The missing object annotation ids are:')
                print(gt_ann_ids_left_set)  # TODO pretty print format
        if gt_refexp_ids_left_set:
            print(
                '%d refexps from the refexp dataset file are missing in the '
                'predicted file' % len(gt_refexp_ids_left_set))
            if flag_missing_refexps_verbose:
                print('The missing refexp ids are:')
                print(gt_refexp_ids_left_set)  # TODO pretty print format

        # summarize the results
        print('The average prec@%d score is %.3f' % (thresh_k, score))
        return (score, self.pred_results)
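
    # A minimal usage sketch (hypothetical file name and ids; assumes an
    # `evaluator` instance of the enclosing class with its Refexp dataset
    # already loaded):
    #
    #   prec_at_1, preds = evaluator.evaluate('predictions.json', thresh_k=1)
    #
    # where predictions.json holds a list of dicts of the form
    #   {"annotation_id": 123456, "refexp_id": 789,
    #    "predicted_bounding_boxes": [[x, y, w, h], ...]}
    # with boxes ordered by decreasing confidence, per the format described
    # in ./cache_evaluation/format_comprehension_eval.md.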