from __future__ import division  # float division under Python 2 as well

import numpy as np
import skimage
import skimage.io
import skimage.transform

# Project-local helpers (import paths assumed; adjust to the repo layout).
from util import im_processing, text_processing
from util.processing_tools import spatial_feature_from_bbox


def fetch_ann_spa_feat(self, ann_id, min_size=600, max_size=1000):
    # Returns the spatial feature of one annotated box,
    # derived from x1, y1, x2, y2 and area.
    image_id = self.ann_to_image[ann_id]
    W, H = self.images[image_id]['width'], self.images[image_id]['height']
    x1, y1, w, h = self.ann_to_box[ann_id]
    x2 = max(x1 + 1, x1 + w - 1)
    y2 = max(y1 + 1, y1 + h - 1)

    # Scale so the short side becomes min_size while the long side
    # stays no bigger than max_size.
    scale = min(max(min_size / H, min_size / W), max_size / H, max_size / W)
    new_h, new_w = int(scale * H), int(scale * W)

    # Scale the box to the resized image and clip it to the image bounds.
    region_bboxes = np.array([[x1, y1, x2, y2]], np.float32) * scale
    region_bboxes = im_processing.rectify_bboxes(region_bboxes,
                                                 height=new_h, width=new_w)
    spatial_batch = spatial_feature_from_bbox(region_bboxes,
                                              im_h=new_h, im_w=new_w)
    return spatial_batch[0]
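
# A hypothetical sketch of what spatial_feature_from_bbox is assumed to
# compute, based on the comment above: a 5-d descriptor [x1, y1, x2, y2, area]
# per box, with corners normalized to [-1, 1] and area normalized by the
# image area. The real helper in the project's util module may differ.
def spatial_feature_from_bbox_sketch(bboxes, im_h, im_w):
    bboxes = np.asarray(bboxes, np.float32).reshape(-1, 4)
    x1, y1, x2, y2 = bboxes.T
    return np.stack([
        x1 / im_w * 2 - 1,                              # x1 in [-1, 1]
        y1 / im_h * 2 - 1,                              # y1 in [-1, 1]
        x2 / im_w * 2 - 1,                              # x2 in [-1, 1]
        y2 / im_h * 2 - 1,                              # y2 in [-1, 1]
        (x2 - x1 + 1) * (y2 - y1 + 1) / (im_w * im_h),  # area ratio
    ], axis=1).astype(np.float32)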
def img_box_modify(self, im, box, min_size=600, max_size=1000):
    # box is [x1, y1, w, h]; convert to [batch_index(0), x1, y1, x2, y2].
    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R.
    bbox_batch = np.zeros((len(box), 5), np.float32)
    bbox_batch[:, 1:5] = box
    bbox_batch[:, 3:5] = bbox_batch[:, 3:5] + bbox_batch[:, 1:3] - 1  # x2 = x1 + w - 1

    # Calculate the resize scaling factor: make the short side equal to
    # min_size while keeping the long side no bigger than max_size.
    im_h, im_w = im.shape[:2]
    scale = min(max(min_size / im_h, min_size / im_w),
                max_size / im_h, max_size / im_w)

    # Resize and process the image. im_mean is assumed to be a
    # module-level channel mean (e.g. the VGG pixel mean).
    new_h, new_w = int(scale * im_h), int(scale * im_w)
    im_resized = skimage.img_as_float(
        skimage.transform.resize(im, [new_h, new_w]))
    im_processed = im_resized * 255 - im_mean
    im_batch = im_processed[np.newaxis, ...].astype(np.float32)

    # Scale the boxes to the resized image and clip them to its bounds.
    bbox_batch[:, 1:] = im_processing.rectify_bboxes(bbox_batch[:, 1:] * scale,
                                                     height=new_h, width=new_w)
    return im_batch, bbox_batch
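
# A minimal sketch of what im_processing.rectify_bboxes is assumed to do:
# clip [x1, y1, x2, y2] boxes to the resized image bounds and keep them
# non-degenerate. The actual helper may handle rounding differently.
def rectify_bboxes_sketch(bboxes, height, width):
    bboxes = np.asarray(bboxes, np.float32).copy()
    bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, width - 1)   # x1, x2
    bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, height - 1)  # y1, y2
    bboxes[:, 2] = np.maximum(bboxes[:, 2], bboxes[:, 0])      # x2 >= x1
    bboxes[:, 3] = np.maximum(bboxes[:, 3], bboxes[:, 1])      # y2 >= y1
    return bboxes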
def load_one_batch(iminfo, im_mean, min_size, max_size, vocab_dict, T):
    im_path = iminfo['im_path']
    im = skimage.io.imread(im_path)
    if im.ndim == 2:  # grayscale -> 3-channel
        im = np.tile(im[..., np.newaxis], (1, 1, 3))

    # Calculate the resize scaling factor: make the short side equal to
    # min_size while keeping the long side no bigger than max_size
    # (see the worked example after this function).
    im_h, im_w = im.shape[:2]
    scale = min(max(min_size / im_h, min_size / im_w),
                max_size / im_h, max_size / im_w)

    # Resize and process the image.
    new_h, new_w = int(scale * im_h), int(scale * im_w)
    im_resized = skimage.img_as_float(
        skimage.transform.resize(im, [new_h, new_w]))
    im_processed = im_resized * 255 - im_mean
    im_batch = im_processed[np.newaxis, ...].astype(np.float32)

    # Annotated regions.
    regions = iminfo['regions']
    if len(regions) == 0:
        raise IOError('no region annotations for image ' + im_path)
    region_bboxes = np.array([ann[0] for ann in regions], np.float32)
    # Save coco_bboxes, needed for the evaluation code;
    # convert [x1, y1, x2, y2] back to [x, y, w, h].
    coco_bboxes = region_bboxes.copy()
    coco_bboxes[:, 2:4] = coco_bboxes[:, 2:4] - coco_bboxes[:, 0:2] + 1
    region_bboxes *= scale
    region_bboxes = im_processing.rectify_bboxes(region_bboxes,
                                                 height=new_h, width=new_w)

    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R.
    bbox_batch = np.zeros((len(region_bboxes), 5), np.float32)
    bbox_batch[:, 1:5] = region_bboxes
    spatial_batch = spatial_feature_from_bbox(region_bboxes,
                                              im_h=new_h, im_w=new_w)

    # A region may have zero, one or more sentence annotations;
    # align language sequences with regions.
    text_seq_batch = []
    label_batch = []
    coco_ann_ids = []  # needed for the evaluation code
    questions = []     # needed for the evaluation code
    for n in range(len(regions)):
        for n_s in range(len(regions[n][1])):
            s = regions[n][1][n_s]
            text_seq_batch.append(
                text_processing.preprocess_sentence(s, vocab_dict, T))
            label_batch.append(n)
            coco_ann_ids.append(regions[n][2])
            questions.append(s)
    text_seq_batch = np.array(text_seq_batch, dtype=np.int32).T  # (T, num_sentences)
    label_batch = np.array(label_batch, dtype=np.int32)

    batch = dict(text_seq_batch=text_seq_batch,
                 im_batch=im_batch,
                 bbox_batch=bbox_batch,
                 spatial_batch=spatial_batch,
                 label_batch=label_batch,
                 coco_ann_ids=coco_ann_ids,
                 questions=questions,
                 coco_bboxes=coco_bboxes)
    return batch
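
# Worked example of the resize rule used above: the short side is scaled
# to min_size unless that would push the long side past max_size, in
# which case the long-side cap wins. (Standalone check with made-up sizes.)
def compute_resize_scale(im_h, im_w, min_size=600, max_size=1000):
    return min(max(min_size / im_h, min_size / im_w),
               max_size / im_h, max_size / im_w)

# A 480x640 image: short side 480 -> 600 needs scale 1.25, and the long
# side becomes 640 * 1.25 = 800 <= 1000, so scale = 1.25.
assert abs(compute_resize_scale(480, 640) - 1.25) < 1e-9
# A 400x1200 image: scaling the short side to 600 would make the long
# side 1800 > 1000, so the cap 1000/1200 wins and scale = 5/6.
assert abs(compute_resize_scale(400, 1200) - 1000 / 1200) < 1e-9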
def load_one_batch(iminfo, im_mean, min_size, max_size, vocab_dict, T):
    im_path = iminfo['im_path']
    im = skimage.io.imread(im_path)
    if im.ndim == 2:  # grayscale -> 3-channel
        im = np.tile(im[..., np.newaxis], (1, 1, 3))

    # Calculate the resize scaling factor: make the short side equal to
    # min_size while keeping the long side no bigger than max_size.
    im_h, im_w = im.shape[:2]
    scale = min(max(min_size / im_h, min_size / im_w),
                max_size / im_h, max_size / im_w)

    # Resize and process the image.
    new_h, new_w = int(scale * im_h), int(scale * im_w)
    im_resized = skimage.img_as_float(
        skimage.transform.resize(im, [new_h, new_w]))
    im_processed = im_resized * 255 - im_mean
    im_batch = im_processed[np.newaxis, ...].astype(np.float32)

    # Build one entry per QA pair: each question comes with num_choices
    # candidate boxes and the index of the correct choice.
    qa_pairs = iminfo['processed_qa_pairs']
    num_questions = len(qa_pairs)
    num_choices = 4
    text_seq_batch = np.zeros((T, num_questions * num_choices), dtype=np.int32)
    label_batch = np.zeros(num_questions, dtype=np.int32)
    bboxes = np.zeros((num_questions * num_choices, 4), np.float32)
    for n_q in range(num_questions):
        this_bboxes, question, label = qa_pairs[n_q]
        bboxes[n_q * num_choices:(n_q + 1) * num_choices, :] = this_bboxes
        # The same question sequence is broadcast across its num_choices columns.
        text_seq_batch[:, n_q * num_choices:(n_q + 1) * num_choices] = \
            np.array(text_processing.preprocess_sentence(
                question, vocab_dict, T)).reshape((T, 1))
        label_batch[n_q] = label

    # Scale the candidate boxes to the resized image and clip them.
    bboxes *= scale
    bboxes = im_processing.rectify_bboxes(bboxes, height=new_h, width=new_w)
    spatial_batch = spatial_feature_from_bbox(bboxes, im_h=new_h, im_w=new_w)

    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R.
    bbox_batch = np.zeros((len(bboxes), 5), np.float32)
    bbox_batch[:, 1:5] = bboxes

    batch = dict(im_batch=im_batch,
                 bbox_batch=bbox_batch,
                 spatial_batch=spatial_batch,
                 text_seq_batch=text_seq_batch,
                 label_batch=label_batch)
    return batch
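
# A hypothetical sketch of text_processing.preprocess_sentence: tokenize,
# map words to vocab indices, then front-pad (or left-truncate) to length T
# so sequences are right-aligned, which matches the negative-index slicing
# used for the component masks in the relationship loader below. The real
# helper's tokenization and padding convention may differ.
def preprocess_sentence_sketch(sentence, vocab_dict, T, unk='<unk>'):
    words = sentence.lower().split()
    indices = [vocab_dict.get(w, vocab_dict.get(unk, 0)) for w in words]
    indices = indices[-T:]              # keep at most the last T tokens
    seq = np.zeros(T, np.int32)         # index 0 assumed to be the pad token
    seq[T - len(indices):] = indices    # right-align the sequence
    return seq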
def load_one_batch(iminfo, im_mean, min_size, max_size, vocab_dict, T,
                   max_bbox_num, max_rel_num):
    im_path = iminfo['im_path']
    im = skimage.io.imread(im_path)
    if im.ndim == 2:  # grayscale -> 3-channel
        im = np.tile(im[..., np.newaxis], (1, 1, 3))

    # Calculate the resize scaling factor: make the short side equal to
    # min_size while keeping the long side no bigger than max_size.
    im_h, im_w = im.shape[:2]
    scale = min(max(min_size / im_h, min_size / im_w),
                max_size / im_h, max_size / im_w)

    # Resize and process the image.
    new_h, new_w = int(scale * im_h), int(scale * im_w)
    im_resized = skimage.img_as_float(
        skimage.transform.resize(im, [new_h, new_w]))
    im_processed = im_resized * 255 - im_mean
    im_batch = im_processed[np.newaxis, ...].astype(np.float32)

    # Annotated object boxes.
    bboxes = np.array(iminfo['bboxes'], np.float32)
    bboxes = bboxes[:max_bbox_num]
    if len(bboxes) == 0:
        raise IOError('no object annotations for image ' + im_path)
    bboxes *= scale
    bboxes = im_processing.rectify_bboxes(bboxes, height=new_h, width=new_w)
    spatial_batch = spatial_feature_from_bbox(bboxes, im_h=new_h, im_w=new_w)

    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R.
    bbox_batch = np.zeros((len(bboxes), 5), np.float32)
    bbox_batch[:, 1:5] = bboxes

    # Collect the relationships annotated in this image.
    mapped_rels = iminfo['mapped_rels']
    if len(mapped_rels) == 0:
        raise IOError('no relationship annotations for image ' + im_path)
    # Prune relationships that refer to objects beyond max_bbox_num.
    if len(iminfo['bboxes']) > max_bbox_num:
        mapped_rels = [rel for rel in mapped_rels
                       if max(rel[0], rel[1]) < max_bbox_num]
    # Subsample to at most max_rel_num relationships, without replacement
    # so no relationship is drawn twice.
    if len(mapped_rels) > max_rel_num:
        mapped_rels = [mapped_rels[n] for n in
                       np.random.choice(len(mapped_rels), max_rel_num,
                                        replace=False)]
    num_rels = len(mapped_rels)
    if num_rels == 0:
        raise IOError('no relationship annotations for image ' + im_path)

    expr_obj1_batch = np.zeros((T, num_rels), dtype=np.int32)
    expr_obj2_batch = np.zeros((T, num_rels), dtype=np.int32)
    expr_relation_batch = np.zeros((T, num_rels), dtype=np.int32)
    text_seq_batch = np.zeros((T, num_rels), dtype=np.int32)
    label_batch = np.zeros(num_rels, dtype=np.int32)
    label_weak_batch = np.zeros(num_rels, dtype=np.int32)
    label_weak_obj2_batch = np.zeros(num_rels, dtype=np.int32)
    questions = [None for _ in range(num_rels)]
    # Boolean masks marking which time steps of each (right-aligned)
    # sequence belong to the subject, predicate and object.
    obj1_component_idx = np.zeros((T, num_rels), bool)
    obj2_component_idx = np.zeros((T, num_rels), bool)
    rel_component_idx = np.zeros((T, num_rels), bool)
    for n_rel in range(num_rels):
        obj1_idx, obj2_idx, obj1_name, predicate_name, obj2_name = \
            mapped_rels[n_rel]
        question = obj1_name + ' ' + predicate_name + ' ' + obj2_name
        vocabidx_obj1 = text_processing.sentence2vocab_indices(
            obj1_name, vocab_dict)
        vocabidx_obj2 = text_processing.sentence2vocab_indices(
            obj2_name, vocab_dict)
        vocabidx_predicate = text_processing.sentence2vocab_indices(
            predicate_name, vocab_dict)
        expr_obj1_batch[:, n_rel] = text_processing.preprocess_vocab_indices(
            vocabidx_obj1, vocab_dict, T)
        expr_obj2_batch[:, n_rel] = text_processing.preprocess_vocab_indices(
            vocabidx_obj2, vocab_dict, T)
        expr_relation_batch[:, n_rel] = text_processing.preprocess_vocab_indices(
            vocabidx_predicate, vocab_dict, T)
        text_seq_batch[:, n_rel] = text_processing.preprocess_vocab_indices(
            vocabidx_obj1 + vocabidx_predicate + vocabidx_obj2, vocab_dict, T)
        # Sequences are right-aligned after padding, so each component can
        # be located by counting back from the end.
        l_obj1, l_obj2, l_rel = (len(vocabidx_obj1), len(vocabidx_obj2),
                                 len(vocabidx_predicate))
        obj1_component_idx[-l_obj1 - l_rel - l_obj2:-l_rel - l_obj2, n_rel] = True
        rel_component_idx[-l_rel - l_obj2:-l_obj2, n_rel] = True
        obj2_component_idx[-l_obj2:, n_rel] = True
        # Flatten the (subject, object) pair into a single index; the weak
        # labels keep the subject and object indices separately.
        label_batch[n_rel] = obj1_idx * bbox_batch.shape[0] + obj2_idx
        label_weak_batch[n_rel] = obj1_idx
        label_weak_obj2_batch[n_rel] = obj2_idx
        questions[n_rel] = question

    batch = dict(im_batch=im_batch,
                 bbox_batch=bbox_batch,
                 spatial_batch=spatial_batch,
                 expr_obj1_batch=expr_obj1_batch,
                 expr_obj2_batch=expr_obj2_batch,
                 expr_relation_batch=expr_relation_batch,
                 text_seq_batch=text_seq_batch,
                 label_weak_batch=label_weak_batch,
                 label_weak_obj2_batch=label_weak_obj2_batch,
                 label_batch=label_batch,
                 questions=questions,
                 obj1_component_idx=obj1_component_idx,
                 obj2_component_idx=obj2_component_idx,
                 rel_component_idx=rel_component_idx)
    return batch
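
# The dense label above flattens a (subject, object) pair into a single
# index: label = obj1_idx * num_boxes + obj2_idx. A small helper to invert
# it (hypothetical, for illustration):
def decode_pair_labels(label_batch, num_boxes):
    obj1_idx = label_batch // num_boxes  # subject box index
    obj2_idx = label_batch % num_boxes   # object box index
    return obj1_idx, obj2_idx

# e.g. with 10 boxes, the pair (subject 3, object 7) flattens to 37
s, o = decode_pair_labels(np.array([37], np.int32), 10)
assert s[0] == 3 and o[0] == 7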