Example #1
    def fetch_ann_spa_feat(self, ann_id, min_size=600, max_size=1000):
        # returns the 5-d spatial feature [x1, y1, x2, y2, area] for this annotation
        image_id = self.ann_to_image[ann_id]
        W, H = self.images[image_id]['width'], self.images[image_id]['height']
        x1, y1, w, h = self.ann_to_box[ann_id]
        x2 = max(x1 + 1, x1 + w - 1)
        y2 = max(y1 + 1, y1 + h - 1)
        area = w * h

        # scale
        scale = min(max(min_size / H, min_size / W), max_size / H,
                    max_size / W)
        new_h, new_w = int(scale * H), int(scale * W)
        region_bboxes = np.array([[x1, y1, x2, y2]], np.float32) * scale
        region_bboxes = im_processing.rectify_bboxes(region_bboxes,
                                                     height=new_h,
                                                     width=new_w)

        # bbox_batch is assembled here but unused; only the spatial
        # feature of the (single) box is returned below
        bbox_batch = np.zeros((len(region_bboxes), 5), np.float32)
        bbox_batch[:, 1:5] = region_bboxes
        spatial_batch = spatial_feature_from_bbox(region_bboxes,
                                                  im_h=new_h,
                                                  im_w=new_w)

        return spatial_batch[0]
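
A note on spatial_feature_from_bbox, which this page never defines: below is a minimal sketch of what such a helper plausibly computes, assuming the [x1, y1, x2, y2, area] layout named in the comment above and normalization by the resized image size (both are assumptions, and the _sketch suffix marks the name as hypothetical):

import numpy as np

def spatial_feature_from_bbox_sketch(bboxes, im_h, im_w):
    # Hypothetical re-implementation: normalize box corners by the image
    # size and append the relative box area -> [x1, y1, x2, y2, area].
    bboxes = np.asarray(bboxes, np.float32).reshape((-1, 4))
    feats = np.zeros((bboxes.shape[0], 5), np.float32)
    feats[:, 0] = bboxes[:, 0] / im_w   # x1, normalized
    feats[:, 1] = bboxes[:, 1] / im_h   # y1, normalized
    feats[:, 2] = bboxes[:, 2] / im_w   # x2, normalized
    feats[:, 3] = bboxes[:, 3] / im_h   # y2, normalized
    feats[:, 4] = ((bboxes[:, 2] - bboxes[:, 0] + 1) *
                   (bboxes[:, 3] - bboxes[:, 1] + 1)) / (im_h * im_w)
    return feats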
Example #2
    def img_box_modify(self, im, box, min_size=600, max_size=1000):
        # box[x1, y1, w, h] -> [batch_index(0), x1, y1, x2, y2]

        # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
        bbox_batch = np.zeros((len(box), 5), np.float32)
        bbox_batch[:, 1:5] = box
        # convert [x1, y1, w, h] to corners: x2 = x1 + w - 1, y2 = y1 + h - 1
        bbox_batch[:, 3:5] = bbox_batch[:, 3:5] + bbox_batch[:, 1:3] - 1

        # calculate the resize scaling factor
        im_h, im_w = im.shape[:2]
        # make the short size equal to min_size but also the long size no bigger than max_size
        scale = min(max(min_size / im_h, min_size / im_w), max_size / im_h,
                    max_size / im_w)

        # resize and process the image
        new_h, new_w = int(scale * im_h), int(scale * im_w)
        im_resized = skimage.img_as_float(
            skimage.transform.resize(im, [new_h, new_w]))
        im_processed = im_resized * 255 - im_mean  # im_mean: presumably a module-level mean-pixel constant (a parameter in the later examples)
        im_batch = im_processed[np.newaxis, ...].astype(np.float32)

        # resize and process the box
        bbox_batch[:, 1:] = im_processing.rectify_bboxes(bbox_batch[:, 1:] * scale,
                                                         height=new_h, width=new_w)

        return im_batch, bbox_batch
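
Every example funnels its scaled boxes through im_processing.rectify_bboxes before use. Its body is not shown on this page; here is a minimal sketch under the assumption that it simply clamps [x1, y1, x2, y2] boxes to the resized image bounds (the name and the call sites suggest no more than that):

import numpy as np

def rectify_bboxes_sketch(bboxes, height, width):
    # Hypothetical stand-in for im_processing.rectify_bboxes: clamp each
    # corner into the image so boxes scaled past the border become valid.
    bboxes = np.array(bboxes, np.float32).reshape((-1, 4))
    bboxes[:, 0] = np.clip(bboxes[:, 0], 0, width - 1)   # x1
    bboxes[:, 1] = np.clip(bboxes[:, 1], 0, height - 1)  # y1
    bboxes[:, 2] = np.clip(bboxes[:, 2], 0, width - 1)   # x2
    bboxes[:, 3] = np.clip(bboxes[:, 3], 0, height - 1)  # y2
    return bboxes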
Example #3
def load_one_batch(iminfo, im_mean, min_size, max_size, vocab_dict, T):
    im_path = iminfo['im_path']
    im = skimage.io.imread(im_path)
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))

    # calculate the resize scaling factor
    im_h, im_w = im.shape[:2]
    # make the short size equal to min_size but also the long size no bigger than max_size
    scale = min(max(min_size/im_h, min_size/im_w), max_size/im_h, max_size/im_w)

    # resize and process the image
    new_h, new_w = int(scale*im_h), int(scale*im_w)
    im_resized = skimage.img_as_float(skimage.transform.resize(im, [new_h, new_w]))
    im_processed = im_resized*255 - im_mean
    im_batch = im_processed[np.newaxis, ...].astype(np.float32)

    # annotate regions
    regions = iminfo['regions']
    if len(regions) == 0:
        raise IOError('no region annotations for image ' + im_path)
    region_bboxes = np.array([ann[0] for ann in regions], np.float32)
    # save coco_bboxes, needed for evaluation code
    coco_bboxes = region_bboxes.copy()
    # back to [x, y, w, h]
    coco_bboxes[:, 2:4] = coco_bboxes[:, 2:4] - coco_bboxes[:, 0:2] + 1
    region_bboxes *= scale
    region_bboxes = im_processing.rectify_bboxes(region_bboxes, height=new_h, width=new_w)

    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
    bbox_batch = np.zeros((len(region_bboxes), 5), np.float32)
    bbox_batch[:, 1:5] = region_bboxes
    spatial_batch = spatial_feature_from_bbox(region_bboxes, im_h=new_h, im_w=new_w)

    # a region may have zero, one or more sentence annotations
    # align language sequences with regions
    text_seq_batch = []
    label_batch = []
    coco_ann_ids = []  # needed for evaluation code
    questions = []  # needed for evaluation code
    for n in range(len(regions)):
        for n_s in range(len(regions[n][1])):
            s = regions[n][1][n_s]
            text_seq_batch.append(text_processing.preprocess_sentence(s, vocab_dict, T))
            label_batch.append(n)
            coco_ann_ids.append(regions[n][2])
            questions.append(s)

    text_seq_batch = np.array(text_seq_batch, dtype=np.int32).T
    label_batch = np.array(label_batch, dtype=np.int32)

    batch = dict(text_seq_batch=text_seq_batch, im_batch=im_batch,
               bbox_batch=bbox_batch, spatial_batch=spatial_batch,
               label_batch=label_batch, coco_ann_ids=coco_ann_ids,
               questions=questions, coco_bboxes=coco_bboxes)

    return batch
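
text_processing.preprocess_sentence turns each referring sentence into a fixed-length column of vocabulary indices; the transpose afterwards makes the batch time-major, shape (T, num_sentences), matching the (T, N) arrays in Examples #4 and #5. A minimal sketch, assuming whitespace tokenization and left-padding with zeros (left-padding is suggested by the tail-aligned component masks in Example #5; both details are assumptions):

import numpy as np

def preprocess_sentence_sketch(sentence, vocab_dict, T, unk='<unk>'):
    # Hypothetical stand-in for text_processing.preprocess_sentence:
    # map words to vocab indices, truncate to T, and left-pad with zeros
    # so real tokens sit at the tail of the sequence.
    words = sentence.lower().split()
    idxs = [vocab_dict.get(w, vocab_dict.get(unk, 0)) for w in words][:T]
    seq = np.zeros(T, np.int32)
    if idxs:
        seq[T - len(idxs):] = idxs
    return seq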
Example #4
def load_one_batch(iminfo, im_mean, min_size, max_size, vocab_dict, T):
    im_path = iminfo['im_path']
    im = skimage.io.imread(im_path)
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))

    # calculate the resize scaling factor
    im_h, im_w = im.shape[:2]
    # make the short size equal to min_size but also the long size no bigger than max_size
    scale = min(max(min_size / im_h, min_size / im_w), max_size / im_h,
                max_size / im_w)

    # resize and process the image
    new_h, new_w = int(scale * im_h), int(scale * im_w)
    im_resized = skimage.img_as_float(
        skimage.transform.resize(im, [new_h, new_w]))
    im_processed = im_resized * 255 - im_mean
    im_batch = im_processed[np.newaxis, ...].astype(np.float32)

    # Build the batch over all QA pairs (each question has num_choices candidate boxes)
    qa_pairs = iminfo['processed_qa_pairs']
    num_questions = len(qa_pairs)
    num_choices = 4
    text_seq_batch = np.zeros((T, num_questions * num_choices), dtype=np.int32)
    label_batch = np.zeros(num_questions, dtype=np.int32)
    bboxes = np.zeros((num_questions * num_choices, 4), np.float32)
    for n_q in range(num_questions):
        this_bboxes, question, label = qa_pairs[n_q]
        bboxes[n_q * num_choices:(n_q + 1) * num_choices, :] = this_bboxes
        text_seq_batch[:, n_q*num_choices:(n_q+1)*num_choices] = \
            np.array(text_processing.preprocess_sentence(question, vocab_dict, T)).reshape((T, 1))
        label_batch[n_q] = label

    # annotate regions
    bboxes *= scale
    bboxes = im_processing.rectify_bboxes(bboxes, height=new_h, width=new_w)
    spatial_batch = spatial_feature_from_bbox(bboxes, im_h=new_h, im_w=new_w)

    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
    bbox_batch = np.zeros((len(bboxes), 5), np.float32)
    bbox_batch[:, 1:5] = bboxes

    batch = dict(im_batch=im_batch,
                 bbox_batch=bbox_batch,
                 spatial_batch=spatial_batch,
                 text_seq_batch=text_seq_batch,
                 label_batch=label_batch)

    return batch
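
The scale expression shared by all five examples deserves a worked check: the inner max scales the short side up to min_size, and the two outer terms cap the scale so the long side never exceeds max_size. With the default min_size=600, max_size=1000:

def compute_scale(im_h, im_w, min_size=600, max_size=1000):
    # Same expression as in the examples above.
    return min(max(min_size / im_h, min_size / im_w),
               max_size / im_h, max_size / im_w)

# 480x640 image: short side -> 600 at scale 1.25; long side 800 <= 1000.
print(compute_scale(480, 640))   # 1.25
# 480x1200 image: scale 1.25 would push the long side to 1500, so the
# max_size cap wins: 1000/1200 = 0.8333..., giving a 400x1000 image.
print(compute_scale(480, 1200))  # 0.8333...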
Example #5
def load_one_batch(iminfo, im_mean, min_size, max_size, vocab_dict, T,
                   max_bbox_num, max_rel_num):
    im_path = iminfo['im_path']
    im = skimage.io.imread(im_path)
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))

    # calculate the resize scaling factor
    im_h, im_w = im.shape[:2]
    # make the short size equal to min_size but also the long size no bigger than max_size
    scale = min(max(min_size / im_h, min_size / im_w), max_size / im_h,
                max_size / im_w)

    # resize and process the image
    new_h, new_w = int(scale * im_h), int(scale * im_w)
    im_resized = skimage.img_as_float(
        skimage.transform.resize(im, [new_h, new_w]))
    im_processed = im_resized * 255 - im_mean
    im_batch = im_processed[np.newaxis, ...].astype(np.float32)

    # annotate regions
    bboxes = np.array(iminfo['bboxes'], np.float32)
    bboxes = bboxes[:max_bbox_num]
    if len(bboxes) == 0:
        raise IOError('no object annotations for image ' + im_path)
    bboxes *= scale
    bboxes = im_processing.rectify_bboxes(bboxes, height=new_h, width=new_w)
    spatial_batch = spatial_feature_from_bbox(bboxes, im_h=new_h, im_w=new_w)

    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
    bbox_batch = np.zeros((len(bboxes), 5), np.float32)
    bbox_batch[:, 1:5] = bboxes

    # Randomly pick one relationship from all the relationships in this image
    mapped_rels = iminfo['mapped_rels']
    if len(mapped_rels) == 0:
        raise IOError('no relationship annotations for image ' + im_path)
    # Prune the relationships to avoid objects out of max_bbox_num
    if len(iminfo['bboxes']) > max_bbox_num:
        mapped_rels = [
            rel for rel in mapped_rels if max(rel[0], rel[1]) < max_bbox_num
        ]
    if len(mapped_rels) > max_rel_num:
        mapped_rels = [
            mapped_rels[n]
            for n in np.random.choice(len(mapped_rels), max_rel_num)
        ]
    num_rels = len(mapped_rels)
    if num_rels == 0:
        raise IOError('no relationship annotations for image ' + im_path)

    expr_obj1_batch = np.zeros((T, num_rels), dtype=np.int32)
    expr_obj2_batch = np.zeros((T, num_rels), dtype=np.int32)
    expr_relation_batch = np.zeros((T, num_rels), dtype=np.int32)
    text_seq_batch = np.zeros((T, num_rels), dtype=np.int32)
    label_batch = np.zeros(num_rels, dtype=np.int32)
    label_weak_batch = np.zeros(num_rels, dtype=np.int32)
    label_weak_obj2_batch = np.zeros(num_rels, dtype=np.int32)
    questions = [None for _ in range(num_rels)]
    obj1_component_idx = np.zeros((T, num_rels), bool)  # np.bool was removed in NumPy 1.24
    obj2_component_idx = np.zeros((T, num_rels), bool)
    rel_component_idx = np.zeros((T, num_rels), bool)
    for n_rel in range(num_rels):
        obj1_idx, obj2_idx, obj1_name, predicate_name, obj2_name = \
            mapped_rels[n_rel]
        question = obj1_name + ' ' + predicate_name + ' ' + obj2_name

        vocabidx_obj1 = text_processing.sentence2vocab_indices(
            obj1_name, vocab_dict)
        vocabidx_obj2 = text_processing.sentence2vocab_indices(
            obj2_name, vocab_dict)
        vocabidx_predicate = text_processing.sentence2vocab_indices(
            predicate_name, vocab_dict)

        expr_obj1_batch[:, n_rel] = text_processing.preprocess_vocab_indices(
            vocabidx_obj1, vocab_dict, T)
        expr_obj2_batch[:, n_rel] = text_processing.preprocess_vocab_indices(
            vocabidx_obj2, vocab_dict, T)
        expr_relation_batch[:, n_rel] = \
            text_processing.preprocess_vocab_indices(
                vocabidx_predicate, vocab_dict, T)
        text_seq_batch[:, n_rel] = text_processing.preprocess_vocab_indices(
            vocabidx_obj1 + vocabidx_predicate + vocabidx_obj2, vocab_dict, T)

        # sequences are left-padded, so each component occupies a slice
        # at the tail of the T-length sequence
        l_obj1, l_obj2, l_rel = (len(vocabidx_obj1), len(vocabidx_obj2),
                                 len(vocabidx_predicate))
        obj1_component_idx[-l_obj1 - l_rel - l_obj2:-l_rel - l_obj2,
                           n_rel] = True
        rel_component_idx[-l_rel - l_obj2:-l_obj2, n_rel] = True
        obj2_component_idx[-l_obj2:, n_rel] = True

        label_batch[n_rel] = obj1_idx * bbox_batch.shape[0] + obj2_idx
        label_weak_batch[n_rel] = obj1_idx
        label_weak_obj2_batch[n_rel] = obj2_idx
        questions[n_rel] = question

    batch = dict(im_batch=im_batch,
                 bbox_batch=bbox_batch,
                 spatial_batch=spatial_batch,
                 expr_obj1_batch=expr_obj1_batch,
                 expr_obj2_batch=expr_obj2_batch,
                 expr_relation_batch=expr_relation_batch,
                 text_seq_batch=text_seq_batch,
                 label_weak_batch=label_weak_batch,
                 label_weak_obj2_batch=label_weak_obj2_batch,
                 label_batch=label_batch,
                 questions=questions,
                 obj1_component_idx=obj1_component_idx,
                 obj2_component_idx=obj2_component_idx,
                 rel_component_idx=rel_component_idx)

    return batch
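
The pair label built above flattens a (subject, object) box pair into a single index, label = obj1_idx * num_boxes + obj2_idx, where num_boxes is bbox_batch.shape[0]. Decoding it back is a divmod, shown here as a small self-contained check (the helper name is ours, not the repo's):

def decode_pair_label(label, num_boxes):
    # Inverse of label = obj1_idx * num_boxes + obj2_idx.
    return divmod(label, num_boxes)

# e.g. 10 boxes, subject box 3, object box 7 -> label 37 -> (3, 7)
assert decode_pair_label(3 * 10 + 7, 10) == (3, 7)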