def get_streams(self):
    ((image_filename, image_id), object_id_list, line) = self.image_refexp_pairs[self.index]
    if image_id in self.dataset.imgs_with_errors:
        line = EOS_IDENTIFIER

    stream = get_encoded_line(line, self.vocabulary)
    # Assumes stream has EOS word at the end
    assert stream[-1] == self.vocabulary[EOS_IDENTIFIER]
    stream = stream[:-1]
    filtered_stream = []
    for word in stream:
        if word != self.vocabulary[UNK_IDENTIFIER]:
            filtered_stream.append(word)
    stream = filtered_stream
    if self.truncate and len(stream) >= self.max_words:
        stream = stream[:self.max_words - 1]
        self.num_truncates += 1

    pad = self.max_words - (len(stream) + 1) if self.pad else 0
    if pad > 0:
        self.num_pads += 1

    out = {}
    out['timestep_input'] = np.asarray(
        [self.vocabulary[EOS_IDENTIFIER]] + stream + [-1] * pad, float)
    out['timestep_cont'] = np.asarray([0] + [1] * len(stream) + [0] * pad, float)
    out['timestep_target'] = np.asarray(
        stream + [self.vocabulary[EOS_IDENTIFIER]] + [-1] * pad, float)

    # Write image features to batch
    img_info = self.dataset.loadImgs(image_id)[0]
    img_wd = float(img_info['width'])
    img_ht = float(img_info['height'])
    out['fc7_img'] = self.dataset.image_features[
        str((image_id, [0, 0, int(img_wd - 1), int(img_ht - 1)]))][0]

    assert object_id_list[0] == -1
    object_id = object_id_list[1]
    bbox = self.dataset.loadAnns(object_id)[0]['bbox']
    out['fc7_obj'] = self.dataset.image_features[str((image_id, bbox))][0]

    bbox_area_ratio = (bbox[2] * bbox[3]) / (img_wd * img_ht)
    bbox_x1y1x2y2 = [bbox[0] / img_wd, bbox[1] / img_ht,
                     (bbox[0] + bbox[2]) / img_wd, (bbox[1] + bbox[3]) / img_ht]
    bbox_features = bbox_x1y1x2y2 + [bbox_area_ratio]
    out['bbox_features'] = bbox_features

    self.num_outs += 1
    self.next_line()
    return out
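# Illustration (not part of the loader): how the three timestep streams above line up
# for a 3-word expression with max_words = 6, assuming EOS encodes to index 0 and the
# words encode to [7, 4, 9]; the vocabulary indices are invented for this example.
#   timestep_input  = [0, 7, 4, 9, -1, -1]   # EOS-shifted input sequence
#   timestep_cont   = [0, 1, 1, 1,  0,  0]   # continuation mask, 0 at start and padding
#   timestep_target = [7, 4, 9, 0, -1, -1]   # words followed by EOS, -1 at padding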
def comprehension_experiment(self, experiment_paths, proposal_source='gt',
                             visualize=False, eval_method=None):
    output_h5_file = '%s/COCO_region_features.h5' % experiment_paths.precomputed_image_features
    self.extract_image_features(experiment_paths, proposal_source, output_h5_file)
    h5file = h5py.File(output_h5_file, 'r')

    num_images = len(self.images)
    random.seed()
    random.shuffle(self.images)
    results = []
    for (i, image_id) in enumerate(self.images):
        image = self.dataset.loadImgs(image_id)[0]
        if proposal_source != 'gt':
            bboxes = [cand['bounding_box'] for cand in image['region_candidates']]
        else:
            obj_anns = self.dataset.coco.imgToAnns[image_id]
            bboxes = [ann['bbox'] for ann in obj_anns]
        if len(bboxes) == 0:
            print("No region candidates for %d" % image_id)
            anns = self.dataset.img_to_refexps[image_id]
            for ann in anns:
                gt_obj = ann['object_id_list'][1] if len(ann['object_id_list']) == 2 else -1
                result = {'annotation_id': gt_obj,
                          'predicted_bounding_boxes': [],
                          'refexp': ann['refexp'][0]}
                results.append(result)
            continue

        # Object region features
        for obj_i in range(len(bboxes)):
            feats = h5file[str((image_id, bboxes[obj_i]))][:]
            if obj_i == 0:
                obj_feats = feats
            else:
                obj_feats = np.vstack((obj_feats, feats))

        # Image region features
        img_wd = int(image['width'])
        img_ht = int(image['height'])
        img_feats = h5file[str((image_id, [0, 0, img_wd - 1, img_ht - 1]))][:]
        img_feats = np.tile(img_feats, (len(obj_feats), 1))

        # Bounding box features
        bbox_features = []
        for bbox in bboxes:
            img_wd = float(img_wd)
            img_ht = float(img_ht)
            bbox_area_ratio = (bbox[2] * bbox[3]) / (img_wd * img_ht)
            bbox_x1y1x2y2 = [bbox[0] / img_wd, bbox[1] / img_ht,
                             min(1., (bbox[0] + bbox[2]) / img_wd),
                             min(1., (bbox[1] + bbox[3]) / img_ht)]
            bbox_features.append(bbox_x1y1x2y2 + [bbox_area_ratio])

        anns = self.dataset.img_to_refexps[image_id]
        for ann in anns:
            prefix_words_unfiltered = get_encoded_line(ann['refexp'], self.lang_model.vocab)
            prefix_words = []
            for word in prefix_words_unfiltered:
                if word != self.lang_model.vocab[UNK_IDENTIFIER]:
                    prefix_words.append(word)
            prefix_words = [prefix_words] * len(bboxes)

            output_captions, output_probs = self.lang_model.sample_captions(
                obj_feats, img_feats, bbox_features, prefix_words=prefix_words)
            stats = [gen_stats(output_prob) for output_prob in output_probs]
            stats = [stat['log_p_word'] for stat in stats]

            (sort_keys, sorted_stats) = zip(*sorted(enumerate(stats), key=lambda x: -x[1]))
            top_k = 10 if len(sort_keys) > 10 else len(sort_keys)
            top_bboxes = [bboxes[k] for k in sort_keys[:top_k]]
            gt_obj = ann['object_id_list'][1] if len(ann['object_id_list']) == 2 else -1
            result = {'annotation_id': gt_obj,
                      'predicted_bounding_boxes': top_bboxes,
                      'refexp': ann['refexp']}

            if visualize:
                img_filename = '%s/%s' % (self.dataset.image_root,
                                          self.dataset.loadImgs(image_id)[0]['file_name'])
                im = mpimg.imread(img_filename)
                plt.cla()
                plt.imshow(im)
                plt.axis('off')
                plt.title(ann['refexp'])
                if gt_obj != -1:
                    gt_box = self.dataset.coco.loadAnns(gt_obj)[0]['bbox']
                    plt.gca().add_patch(plt.Rectangle((gt_box[0], gt_box[1]), gt_box[2], gt_box[3],
                                                      fill=False, edgecolor='g', linewidth=3))
                top_box = top_bboxes[0]
                plt.gca().add_patch(plt.Rectangle((top_box[0], top_box[1]), top_box[2], top_box[3],
                                                  fill=False, edgecolor='r', linewidth=3))
                #top_box_score = stats[bboxes.index(top_box)]
                #plt.text(top_box[0], top_box[1], str(top_box_score), fontsize=12, bbox=dict(facecolor='red', alpha=1))
                ipdb.set_trace()

            results.append(result)
        sys.stdout.write("\rDone with %d/%d images" % (i + 1, num_images))
        sys.stdout.flush()
    sys.stdout.write("\n")
    h5file.close()
    return results
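# gen_stats() is defined elsewhere in the codebase; candidates above are ranked by its
# 'log_p_word' value (and by 'p_word' in the context variant below). A minimal,
# hypothetical re-implementation consistent with that usage is sketched here; the key
# names and the exp(log_p_word) definition of 'p_word' are assumptions, not the
# authoritative implementation.
def gen_stats_sketch(prob_seq):
    """Aggregate per-timestep word probabilities of one sampled sequence."""
    probs = np.asarray(prob_seq, dtype=np.float64)
    log_p = np.sum(np.log(probs))                  # joint log-probability of the sequence
    log_p_word = log_p / max(len(probs), 1)        # mean log-probability per word
    return {'log_p': log_p,
            'log_p_word': log_p_word,
            'p_word': np.exp(log_p_word)}          # geometric-mean word probability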
def comprehension_experiment(self, experiment_paths, proposal_source='gt',
                             visualize=False, eval_method=None):
    output_h5_file = '%s/COCO_region_features.h5' % experiment_paths.precomputed_image_features
    self.extract_image_features(experiment_paths, proposal_source, output_h5_file)
    h5file = h5py.File(output_h5_file, 'r')

    if eval_method is None:
        eval_methods = ['noisy_or', 'max', 'image_context_only']
    else:
        eval_methods = [eval_method]
    results = defaultdict(list)

    num_images = len(self.images)
    random.seed()
    random.shuffle(self.images)
    for (i, image_id) in enumerate(self.images):
        image = self.dataset.loadImgs(image_id)[0]
        if proposal_source != 'gt':
            bboxes = [cand['bounding_box'] for cand in image['region_candidates']]
        else:
            anns = self.dataset.coco.imgToAnns[image_id]
            bboxes = [ann['bbox'] for ann in anns]
        if len(bboxes) == 0:
            print("No region candidates for %d" % image_id)
            anns = self.dataset.img_to_refexps[image_id]
            for ann in anns:
                gt_obj = ann['object_id_list'][1] if len(ann['object_id_list']) == 2 else -1
                result = {'annotation_id': gt_obj,
                          'predicted_bounding_boxes': [],
                          'refexp': ann['refexp']}
                for method in eval_methods:
                    results[method].append(result)
            continue

        # Image region features
        batch_size = len(bboxes)
        img_wd = int(image['width'])
        img_ht = int(image['height'])
        fc7_img = h5file[str((image_id, [0, 0, img_wd - 1, img_ht - 1]))][:]
        img_wd = float(img_wd)
        img_ht = float(img_ht)
        image_feature_length = len(fc7_img[0])
        # Any change to context_length value will also require a change in the deploy prototxt
        context_length = 10
        fc7_obj = np.zeros((batch_size, context_length, image_feature_length))
        context_fc7 = np.tile(fc7_img, (batch_size, context_length, 1))
        bbox_features = np.zeros((batch_size, context_length, 5))
        context_bbox_features = np.zeros((batch_size, context_length, 5), np.float16)
        context_bboxes = []
        for (bbox_idx, bbox) in enumerate(bboxes):
            # Object region features
            fc7_obj[bbox_idx, :] = h5file[str((image_id, bbox))][:]

            # Bounding box features
            bbox_area_ratio = (bbox[2] * bbox[3]) / (img_wd * img_ht)
            bbox_x1y1x2y2 = [bbox[0] / img_wd, bbox[1] / img_ht,
                             min(1., (bbox[0] + bbox[2]) / img_wd),
                             min(1., (bbox[1] + bbox[3]) / img_ht)]
            obj_bbox_features = bbox_x1y1x2y2 + [bbox_area_ratio]
            bbox_features[bbox_idx, :] = obj_bbox_features
            context_bbox_features[bbox_idx, :] = [0, 0, 1, 1, 1]

            # Context features
            other_bboxes = list(bboxes)  # make a copy
            other_bboxes.remove(bbox)
            if len(other_bboxes) > context_length - 1:
                rand_sample = sorted(random.sample(range(len(other_bboxes)), context_length - 1))
                other_bboxes = [other_bboxes[idx] for idx in rand_sample]
            context_bboxes.append(other_bboxes)
            for (other_bbox_idx, other_bbox) in enumerate(other_bboxes):
                other_bbox_area_ratio = (other_bbox[2] * other_bbox[3]) / (img_wd * img_ht)
                other_bbox_x1y1x2y2 = [other_bbox[0] / img_wd, other_bbox[1] / img_ht,
                                       (other_bbox[0] + other_bbox[2]) / img_wd,
                                       (other_bbox[1] + other_bbox[3]) / img_ht]
                other_bbox_features = other_bbox_x1y1x2y2 + [other_bbox_area_ratio]
                feats = h5file[str((image_id, other_bbox))][:]
                context_fc7[bbox_idx, other_bbox_idx, :] = feats
                context_bbox_features[bbox_idx, other_bbox_idx, :] = other_bbox_features

        # The whole image is appended as the last context box for each candidate
        # (used when visualizing the top-scoring context region)
        for elem in context_bboxes:
            elem.append([0, 0, img_wd - 1, img_ht - 1])

        anns = self.dataset.img_to_refexps[image_id]
        for ann in anns:
            prefix_words_unfiltered = get_encoded_line(ann['refexp'], self.lang_model.vocab)
            prefix_words = []
            for word in prefix_words_unfiltered:
                if word != self.lang_model.vocab[UNK_IDENTIFIER]:
                    prefix_words.append(word)
            prefix_words = [prefix_words] * batch_size

            output_captions, output_probs, output_all_probs = \
                self.lang_model.sample_captions_with_context(fc7_obj, bbox_features, context_fc7,
                                                             context_bbox_features,
                                                             prefix_words=prefix_words)
            all_stats = [gen_stats(output_prob) for output_prob in output_all_probs]
            all_stats_p_word = [stat['p_word'] for stat in all_stats]
            all_stats_p_word = np.reshape(all_stats_p_word, (batch_size, context_length))

            # Initialize so the visualization below does not hit a NameError when a
            # method is not among eval_methods
            noisy_or_top_box = None
            image_top_bbox = None
            for method in eval_methods:
                if method == 'noisy_or':
                    num_context_objs = min(context_length - 1, len(bboxes) - 1)
                    sort_all_stats_p_word = -np.sort(-all_stats_p_word[:, 0:num_context_objs])
                    top_all_stats_p_word = np.hstack((sort_all_stats_p_word,
                                                      all_stats_p_word[:, -1:]))
                    stats = (1 - np.product(1 - top_all_stats_p_word, axis=1))
                elif method == 'image_context_only':
                    stats = all_stats_p_word[:, -1]
                elif method == 'max':
                    stats = np.max(all_stats_p_word, axis=1)
                else:
                    raise StandardError("Unknown eval method %s" % method)

                (sort_keys, sorted_stats) = zip(*sorted(enumerate(stats), key=lambda x: -x[1]))
                top_k = 10 if len(sort_keys) > 10 else len(sort_keys)
                top_bboxes = [bboxes[k] for k in sort_keys[:top_k]]
                gt_obj = ann['object_id_list'][1] if len(ann['object_id_list']) == 2 else -1
                result = {'annotation_id': gt_obj,
                          'predicted_bounding_boxes': top_bboxes,
                          'refexp': ann['refexp']}
                results[method].append(result)
                gt_box = self.dataset.coco.loadAnns(gt_obj)[0]['bbox']
                if method == 'noisy_or':
                    noisy_or_top_box = top_bboxes[0]
                elif method == 'image_context_only':
                    image_top_bbox = top_bboxes[0]

            if visualize:
                print("Image id: %d" % image_id)
                img_filename = '%s/%s' % (self.dataset.image_root,
                                          self.dataset.loadImgs(image_id)[0]['file_name'])
                im = mpimg.imread(img_filename)
                if noisy_or_top_box:
                    plt.figure(1)
                    plt.cla()
                    plt.imshow(im)
                    plt.title(ann['refexp'])
                    top_box = noisy_or_top_box
                    top_box_ind = bboxes.index(top_box)
                    plt.gca().add_patch(plt.Rectangle((top_box[0], top_box[1]), top_box[2], top_box[3],
                                                      fill=False, edgecolor='b', linewidth=6))
                    top_context_box_ind = np.argmax(all_stats_p_word[top_box_ind])
                    top_context_box = context_bboxes[top_box_ind][top_context_box_ind]
                    plt.gca().add_patch(plt.Rectangle((top_context_box[0], top_context_box[1]),
                                                      top_context_box[2], top_context_box[3],
                                                      fill=False, edgecolor='b', linewidth=6,
                                                      linestyle='dashed'))
                    plt.axis('off')
                if image_top_bbox:
                    plt.figure(2)
                    plt.cla()
                    plt.imshow(im)
                    plt.title(ann['refexp'])
                    top_box = image_top_bbox
                    plt.gca().add_patch(plt.Rectangle((top_box[0], top_box[1]), top_box[2], top_box[3],
                                                      fill=False, edgecolor='b', linewidth=6))
                    plt.axis('off')
                plt.figure(3)
                plt.cla()
                plt.imshow(im)
                plt.title(ann['refexp'])
                plt.gca().add_patch(plt.Rectangle((gt_box[0], gt_box[1]), gt_box[2], gt_box[3],
                                                  fill=False, edgecolor='g', linewidth=6))
                plt.axis('off')
                while True:
                    sys.stdout.write('Do you want to save? (y/n): ')
                    choice = raw_input().lower()
                    if choice.startswith('y'):
                        plt.figure(1)
                        plt.savefig('%s/%d_nor.png' % (experiment_paths.coco_path, image_id),
                                    bbox_inches='tight')
                        plt.figure(2)
                        plt.savefig('%s/%d_image_context.png' % (experiment_paths.coco_path, image_id),
                                    bbox_inches='tight')
                        plt.figure(3)
                        plt.savefig('%s/%d_gt.png' % (experiment_paths.coco_path, image_id),
                                    bbox_inches='tight')
                        break
                    elif choice.startswith('n'):
                        break
                ipdb.set_trace()

        sys.stdout.write("\rDone with %d/%d images" % (i + 1, num_images))
        sys.stdout.flush()
    sys.stdout.write("\n")
    h5file.close()
    return results
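# Illustration (not part of the experiment code) of the three scoring rules used above,
# on a toy example with invented probabilities. Each row is one candidate box; each
# column is the probability of the referring expression under one context region, with
# the whole image as the last column.
#
#   p_word = np.array([[0.10, 0.30, 0.20],
#                      [0.05, 0.02, 0.60]])
#
#   noisy_or   = 1 - np.prod(1 - p_word, axis=1)   # ~[0.496, 0.628]; high if ANY context explains the box
#   image_only = p_word[:, -1]                     #  [0.20, 0.60];   whole-image context only
#   max_score  = np.max(p_word, axis=1)            #  [0.30, 0.60];   best single context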
def get_streams(self):
    ((image_filename, image_id), object_id_list, line) = self.image_refexp_pairs[self.index]
    if image_id in self.dataset.imgs_with_errors:
        line = EOS_IDENTIFIER

    stream = get_encoded_line(line, self.vocabulary)
    # Assumes stream has EOS word at the end
    assert stream[-1] == self.vocabulary[EOS_IDENTIFIER]
    stream = stream[:-1]
    filtered_stream = []
    for word in stream:
        if word != self.vocabulary[UNK_IDENTIFIER]:
            filtered_stream.append(word)
    stream = filtered_stream
    if self.truncate and len(stream) >= self.max_words:
        stream = stream[:self.max_words - 1]
        self.num_truncates += 1

    object_id = object_id_list[1]
    object_ann = self.dataset.loadAnns(object_id)[0]
    object_category = self.dataset.loadCats(object_ann['category_id'])[0]['name']
    object_bbox = self.dataset.loadAnns(object_id)[0]['bbox']

    context_anns_of_same_category = []
    context_anns_of_diff_category = []
    if hasattr(self.dataset, 'coco'):
        all_anns = self.dataset.coco.imgToAnns[image_id]
    else:
        all_anns = self.dataset.imgToAnns[image_id]
    for ann in all_anns:
        if ann['id'] != object_id:
            if ann['category_id'] == object_ann['category_id']:
                context_anns_of_same_category.append(ann)
            else:
                context_anns_of_diff_category.append(ann)

    neg_anns_of_same_category = []
    neg_anns_of_diff_category = []
    if self.neg_proposal_source != 'gt':
        image_info = self.dataset.loadImgs(image_id)[0]
        all_anns = image_info['region_candidates']
        for ann in all_anns:
            ann['bbox'] = ann['bounding_box']
            ann_box = ann['bbox']
            iou = iou_bboxes(ann_box, object_bbox)
            if iou < 0.5 and ann['predicted_object_name'] == object_category:
                neg_anns_of_same_category.append(ann)
            elif ann['predicted_object_name'] != object_category:
                neg_anns_of_diff_category.append(ann)
    else:
        neg_anns_of_same_category = context_anns_of_same_category
        neg_anns_of_diff_category = context_anns_of_diff_category

    # Subtract one because the image is reserved as one context region
    if len(context_anns_of_same_category) > self.max_num_context - 1:
        rand_sample = sorted(random.sample(range(len(context_anns_of_same_category)),
                                           self.max_num_context - 1))
        context_anns_of_same_category = [context_anns_of_same_category[idx] for idx in rand_sample]
    elif len(context_anns_of_same_category) < self.max_num_context - 1:
        rand_sample = sorted(random.sample(range(len(context_anns_of_diff_category)),
                                           min(self.max_num_context - 1 - len(context_anns_of_same_category),
                                               len(context_anns_of_diff_category))))
        context_anns_of_same_category += [context_anns_of_diff_category[idx] for idx in rand_sample]

    if len(neg_anns_of_same_category) > self.max_num_negatives:
        rand_sample = sorted(random.sample(range(len(neg_anns_of_same_category)),
                                           self.max_num_negatives))
        neg_anns_of_same_category = [neg_anns_of_same_category[idx] for idx in rand_sample]
    elif len(neg_anns_of_same_category) < self.max_num_negatives:
        rand_sample = sorted(random.sample(range(len(neg_anns_of_diff_category)),
                                           min(self.max_num_negatives - len(neg_anns_of_same_category),
                                               len(neg_anns_of_diff_category))))
        neg_anns_of_same_category += [neg_anns_of_diff_category[idx] for idx in rand_sample]

    # If we are running short of proposal negatives, sample from gt negatives
    if len(neg_anns_of_same_category) < self.max_num_negatives and self.neg_proposal_source != 'gt':
        rand_sample = sorted(random.sample(range(len(context_anns_of_diff_category)),
                                           min(self.max_num_negatives - len(neg_anns_of_same_category),
                                               len(context_anns_of_diff_category))))
        neg_anns_of_same_category += [context_anns_of_diff_category[idx] for idx in rand_sample]

    pad = self.max_words - (len(stream) + 1) if self.pad else 0
    if pad > 0:
        self.num_pads += 1

    out = {}
    timestep_input = np.asarray([[self.vocabulary[EOS_IDENTIFIER]] + stream + [-1] * pad], np.float16)
    out['timestep_input'] = np.tile(timestep_input.T, (1, self.max_num_context))
    timestep_cont = np.asarray([[0] + [1] * len(stream) + [0] * pad], np.float16)
    out['timestep_cont'] = np.tile(timestep_cont.T, (1, self.max_num_context))
    timestep_target = np.asarray(stream + [self.vocabulary[EOS_IDENTIFIER]] + [-1] * pad, np.float16)
    out['timestep_target'] = timestep_target
    self.swap_axis_streams.add('timestep_input')
    self.swap_axis_streams.add('timestep_target')
    self.swap_axis_streams.add('timestep_cont')

    # Write image features to batch
    img_info = self.dataset.loadImgs(image_id)[0]
    img_wd = float(img_info['width'])
    img_ht = float(img_info['height'])
    assert len(object_id_list) <= 2
    fc7_img = self.dataset.image_features[
        str((image_id, [0, 0, int(img_wd - 1), int(img_ht - 1)]))][0]
    out['fc7_img'] = np.tile(fc7_img, (self.max_num_context, 1))
    img_bbox_features = np.zeros((self.max_num_context, 5), np.float16)
    img_bbox_features[:] = [0, 0, 1, 1, 1]
    out['img_bbox_features'] = img_bbox_features

    # Write object region features to batch
    object_bbox = self.dataset.loadAnns(object_id)[0]['bbox']
    fc7_obj = self.dataset.image_features[str((image_id, object_bbox))][0]
    out['fc7_obj'] = np.tile(fc7_obj, (self.max_num_context, 1))
    bbox_area_ratio = (object_bbox[2] * object_bbox[3]) / (img_wd * img_ht)
    bbox_x1y1x2y2 = [object_bbox[0] / img_wd, object_bbox[1] / img_ht,
                     (object_bbox[0] + object_bbox[2]) / img_wd,
                     (object_bbox[1] + object_bbox[3]) / img_ht]
    bbox_features = bbox_x1y1x2y2 + [bbox_area_ratio]
    out['bbox_features'] = np.tile(bbox_features, (self.max_num_context, 1))

    # Write context features to batch
    context_fc7 = np.tile(fc7_img, (self.max_num_context, 1))
    context_bbox_features = np.zeros((self.max_num_context, 5), np.float16)
    context_bbox_features[:] = [0, 0, 1, 1, 1]
    if len(context_anns_of_same_category) > 0:
        other_bboxes = [ann['bbox'] for ann in context_anns_of_same_category]
        for idx, other_bbox in enumerate(other_bboxes):
            other_bbox_area_ratio = (other_bbox[2] * other_bbox[3]) / (img_wd * img_ht)
            other_bbox_x1y1x2y2 = [other_bbox[0] / img_wd, other_bbox[1] / img_ht,
                                   (other_bbox[0] + other_bbox[2]) / img_wd,
                                   (other_bbox[1] + other_bbox[3]) / img_ht]
            other_bbox_features = other_bbox_x1y1x2y2 + [other_bbox_area_ratio]
            context_fc7[idx, :] = self.dataset.image_features[str((image_id, other_bbox))][0]
            context_bbox_features[idx, :] = other_bbox_features
    out['context_fc7'] = context_fc7
    out['context_bbox_features'] = context_bbox_features

    # Write negative features to batch
    negative_fc7 = np.zeros((self.max_num_negatives, self.dataset.image_feature_length), np.float16)
    negative_bbox_features = np.zeros((self.max_num_negatives, 5), np.float16)
    if len(neg_anns_of_same_category) > 0:
        other_bboxes = [ann['bbox'] for ann in neg_anns_of_same_category]
        for idx, other_bbox in enumerate(other_bboxes):
            other_bbox_area_ratio = (other_bbox[2] * other_bbox[3]) / (img_wd * img_ht)
            other_bbox_x1y1x2y2 = [other_bbox[0] / img_wd, other_bbox[1] / img_ht,
                                   (other_bbox[0] + other_bbox[2]) / img_wd,
                                   (other_bbox[1] + other_bbox[3]) / img_ht]
            other_bbox_features = other_bbox_x1y1x2y2 + [other_bbox_area_ratio]
            negative_fc7[idx, :] = self.dataset.image_features[str((image_id, other_bbox))][0]
            negative_bbox_features[idx, :] = other_bbox_features
    out['negative_fc7'] = negative_fc7
    out['negative_bbox_features'] = negative_bbox_features

    pairwise_similarity = np.asarray([[0] * self.max_num_negatives], np.float16)
    out['pairwise_similarity'] = np.tile(pairwise_similarity, (self.max_words, 1))
    self.swap_axis_streams.add('pairwise_similarity')

    self.num_outs += 1
    self.next_line()
    return out
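# iou_bboxes() is used above to reject proposal negatives that overlap the target object,
# but is defined elsewhere in the codebase. A minimal, hypothetical sketch for COCO-style
# [x, y, w, h] boxes; the name iou_bboxes_sketch and this exact formulation are assumptions.
def iou_bboxes_sketch(box_a, box_b):
    """Intersection-over-union of two [x, y, w, h] boxes."""
    ax2, ay2 = box_a[0] + box_a[2], box_a[1] + box_a[3]
    bx2, by2 = box_b[0] + box_b[2], box_b[1] + box_b[3]
    inter_w = max(0.0, min(ax2, bx2) - max(box_a[0], box_b[0]))
    inter_h = max(0.0, min(ay2, by2) - max(box_a[1], box_b[1]))
    inter = inter_w * inter_h
    union = box_a[2] * box_a[3] + box_b[2] * box_b[3] - inter
    return inter / union if union > 0 else 0.0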