def __init__(self, experiment_paths, experiment_config, dataset, vocab=None, include_all_boxes=False):
    """Build a sequence generator over ``dataset``'s (image, refexp) pairs.

    Reads batching/padding options from ``experiment_config``, extracts fc7
    region features when they are not already cached, builds or adopts the
    vocabulary, then pads and shuffles the pair list so it divides evenly
    into batch streams.
    """
    SequenceGenerator.__init__(self)
    cfg = experiment_config
    self.exp_name = cfg.exp_name
    self.batch_num_streams = cfg.train.batch_size
    self.max_words = cfg.train.max_words
    # Optional config flags default to True when absent.
    self.pad = getattr(cfg, 'pad', True)
    self.truncate = getattr(cfg, 'truncate', True)
    self.swap_axis_streams = frozenset(('timestep_input', 'timestep_cont', 'timestep_target'))
    # Bookkeeping counters.
    self.index = 0
    self.num_resets = 0
    self.num_truncates = 0
    self.num_pads = 0
    self.num_outs = 0
    self.dataset = dataset
    self.image_refexp_pairs = self.dataset.image_refexp_pairs
    # Extract fc7 region features only if the dataset has none cached.
    if self.dataset.image_features is None:
        features_filename = "%s/COCO_region_features.h5" % experiment_paths.precomputed_image_features
        self.dataset.extract_image_object_features(features_filename, feature_layer='fc7', include_all_boxes=include_all_boxes)
    # Adopt a caller-supplied inverted vocabulary, otherwise build one.
    if vocab is None:
        self.init_vocabulary()
    else:
        self.vocabulary_inverted = vocab
        self.vocabulary = dict((word, index) for index, word in enumerate(vocab))
    # Make the number of image/refexp pairs a multiple of the buffer size
    # so each timestep of each batch is useful and we can align the images.
    if getattr(cfg, 'aligned', True):
        num_pairs = len(self.image_refexp_pairs)
        remainder = num_pairs % self.batch_num_streams
        if remainder > 0:
            for _ in range(self.batch_num_streams - remainder):
                duplicate = self.image_refexp_pairs[random.randint(0, num_pairs - 1)]
                self.image_refexp_pairs.append(duplicate)
        assert len(self.image_refexp_pairs) % self.batch_num_streams == 0
    if getattr(cfg, 'shuffle', True):
        random.shuffle(self.image_refexp_pairs)
def __init__(self, fsg_lines, vocab_filename, batch_num_streams=8, max_words=MAX_WORDS, pad=True, truncate=True):
    """Sequence generator over pre-split FSG lines.

    Args:
      fsg_lines: the lines to generate sequences from.
      vocab_filename: vocabulary file path forwarded to init_vocabulary.
      batch_num_streams: number of parallel streams per batch.
      max_words: maximum sentence length in words.
      pad: whether to pad short streams.
      truncate: whether to truncate over-long streams.
    """
    self.max_words = max_words
    self.lines = fsg_lines
    # Bookkeeping counters.
    self.line_index = 0
    self.num_resets = 0
    self.num_truncates = 0
    self.num_pads = 0
    self.num_outs = 0
    self.vocabulary = {}
    self.vocabulary_inverted = []
    self.vocab_counts = []
    # Initialize vocabulary from the given vocabulary file.
    self.init_vocabulary(vocab_filename)
    SequenceGenerator.__init__(self)
    self.batch_num_streams = batch_num_streams
    self.pad = pad
    self.truncate = truncate
    # BUG FIX: the original frozenset(('target_sentence')) wrapped a plain
    # parenthesized string, so the set contained the string's individual
    # characters. A one-element tuple needs a trailing comma; sibling
    # generators in this file use the same stream-name convention.
    self.negative_one_padded_streams = frozenset(('target_sentence',))
def __init__(self, coco, split_name, batch_num_streams, image_root, vocab=None, max_words=MAX_WORDS, align=True, shuffle=True, gt_captions=True, pad=True, truncate=True, split_ids=None): #split_ids: image list e.g. '2801146217_03a0b59ccb.jpg\n', '1321723162_9d4c78b8af.jpg\n' # # self.max_words = max_words num_empty_lines = 0 self.images = [] num_total = 0 num_missing = 0 num_captions = 0 known_images = {} #self.coco = coco self.image_path_to_id = {} self.image_sentence_pairs = [] ### ## generate image_id list, which will be used in retrieval experiments ### if split_ids is None: split_ids = coco.imgs.keys() self.image_path_to_id = {} # pdb.set_trace() # print split_ids for image_id in split_ids: image_path = '%s/%s.jpg' % (image_root, image_id) # print 'image_info ',image_info #print 'image_path ',image_path self.image_path_to_id[image_path] = image_id #print self.image_path_to_id ###'./data/flickr8K/images/train/143688895_e837c3bc76.jpg': '143688895_e837c3bc76', self.image_sentence_pairs = split_image_captions( split_name, image_root) #print image_sentence_pairs #generate word vocabulary based on image caption sentences if vocab is None: self.init_vocabulary(self.image_sentence_pairs) else: self.vocabulary_inverted = vocab self.vocabulary = {} for index, word in enumerate(self.vocabulary_inverted): self.vocabulary[word] = index self.index = 0 self.num_resets = 0 self.num_truncates = 0 self.num_pads = 0 self.num_outs = 0 self.image_list = [] SequenceGenerator.__init__(self) self.batch_num_streams = batch_num_streams # make the number of image/sentence pairs a multiple of the buffer size # so each timestep of each batch is useful and we can align the images if align: num_pairs = len(self.image_sentence_pairs) #pdb.set_trace() print 'number of pairs: ', num_pairs remainder = num_pairs % batch_num_streams if remainder > 0: num_needed = batch_num_streams - remainder for i in range(num_needed): choice = random.randint(0, num_pairs - 1) self.image_sentence_pairs.append( 
self.image_sentence_pairs[choice]) assert len(self.image_sentence_pairs) % batch_num_streams == 0 if shuffle: random.shuffle(self.image_sentence_pairs) self.pad = pad self.truncate = truncate self.negative_one_padded_streams = frozenset( ('input_sentence', 'target_sentence'))
def __init__(self, experiment_paths, experiment_config, dataset, vocab=None, include_all_boxes=False):
    """Sequence generator for the (image, referring-expression) pairs of
    ``dataset``.

    Pulls batch size, max words, and optional pad/truncate/aligned/shuffle
    flags from ``experiment_config``; computes fc7 region features if the
    dataset lacks them; then aligns the pair list to the batch stream count
    and optionally shuffles it.
    """
    SequenceGenerator.__init__(self)
    self.exp_name = experiment_config.exp_name
    self.batch_num_streams = experiment_config.train.batch_size
    self.max_words = experiment_config.train.max_words
    # Flags missing from the config default to True.
    self.pad = getattr(experiment_config, 'pad', True)
    self.truncate = getattr(experiment_config, 'truncate', True)
    self.swap_axis_streams = frozenset(
        ('timestep_input', 'timestep_cont', 'timestep_target'))
    # Bookkeeping counters.
    self.index = self.num_resets = 0
    self.num_truncates = self.num_pads = self.num_outs = 0
    self.dataset = dataset
    self.image_refexp_pairs = self.dataset.image_refexp_pairs
    if self.dataset.image_features is None:
        # Features were not precomputed; extract fc7 region features now.
        h5_path = "%s/COCO_region_features.h5" % experiment_paths.precomputed_image_features
        self.dataset.extract_image_object_features(
            h5_path, feature_layer='fc7', include_all_boxes=include_all_boxes)
    if vocab is not None:
        # Adopt the supplied inverted vocabulary and index it.
        self.vocabulary_inverted = vocab
        self.vocabulary = {}
        position = 0
        for token in self.vocabulary_inverted:
            self.vocabulary[token] = position
            position += 1
    else:
        self.init_vocabulary()
    # Pad the pair list to a multiple of the batch stream count so every
    # timestep of every batch is useful and the images stay aligned.
    if getattr(experiment_config, 'aligned', True):
        original_count = len(self.image_refexp_pairs)
        shortfall = original_count % self.batch_num_streams
        if shortfall > 0:
            extras = self.batch_num_streams - shortfall
            while extras > 0:
                pick = random.randint(0, original_count - 1)
                self.image_refexp_pairs.append(self.image_refexp_pairs[pick])
                extras -= 1
        assert len(self.image_refexp_pairs) % self.batch_num_streams == 0
    if getattr(experiment_config, 'shuffle', True):
        random.shuffle(self.image_refexp_pairs)
def __init__(self, coco, batch_num_streams, vocab=None, max_words=MAX_WORDS, align=True, shuffle=True, gt_captions=True, pad=True, truncate=True, split_ids=None):
    """Sequence generator over COCO (image path, caption sentence) pairs.

    Scans the images listed in ``split_ids`` (default: all of
    ``coco.images``), keeps only those whose file exists on disk, collects
    their ground-truth caption sentences when ``gt_captions`` is True,
    builds or adopts the vocabulary, then pads the pair list to a multiple
    of ``batch_num_streams`` (when ``align``) and optionally shuffles it.
    """
    self.max_words = max_words
    num_empty_lines = 0
    self.images = []
    # Counters for the scan-progress warnings printed below.
    num_total = 0
    num_missing = 0
    num_captions = 0
    known_images = {}  # image_id -> {'path': ..., 'sentences': [...]}
    image_root = '%s/%s' % (COCO_PATH, coco.image_folder)
    if split_ids is None:
        split_ids = coco.images.keys()
    for image_id in split_ids:
        image_info = coco.images[image_id]
        image_path = '%s/%s/%s' % \
            (image_root, image_info['file_path'], image_info['file_name'])
        if os.path.isfile(image_path):
            assert image_id not in known_images  # no duplicates allowed
            known_images[image_id] = {}
            known_images[image_id]['path'] = image_path
            if gt_captions:
                # One tokenized sentence per annotation of this image.
                known_images[image_id]['sentences'] = [split_sentence(anno['sentence'])
                                                       for anno in coco.image_to_annotations[image_id]]
                num_captions += len(known_images[image_id]['sentences'])
            else:
                known_images[image_id]['sentences'] = []
        else:
            # Image listed in the split but absent on disk: warn and skip.
            num_missing += 1
            print 'Warning (#%d): image not found: %s' % (num_missing, image_path)
        num_total += 1
    print '%d/%d images missing' % (num_missing, num_total)
    # Build the vocabulary from the collected captions, or adopt the
    # caller-supplied inverted vocabulary.
    if vocab is None:
        self.init_vocabulary(known_images)
    else:
        self.vocabulary_inverted = vocab
        self.vocabulary = {}
        for index, word in enumerate(self.vocabulary_inverted):
            self.vocabulary[word] = index
    # Flatten into one (path, sentence) pair per caption.
    self.image_sentence_pairs = []
    num_no_sentences = 0
    for image_filename, metadata in known_images.iteritems():
        if not metadata['sentences']:
            num_no_sentences += 1
            print 'Warning (#%d): image with no sentences: %s' % (num_no_sentences, image_filename)
        for sentence in metadata['sentences']:
            self.image_sentence_pairs.append((metadata['path'], sentence))
    # Bookkeeping counters.
    self.index = 0
    self.num_resets = 0
    self.num_truncates = 0
    self.num_pads = 0
    self.num_outs = 0
    self.image_list = []
    SequenceGenerator.__init__(self)
    self.batch_num_streams = batch_num_streams
    # make the number of image/sentence pairs a multiple of the buffer size
    # so each timestep of each batch is useful and we can align the images
    if align:
        num_pairs = len(self.image_sentence_pairs)
        remainder = num_pairs % batch_num_streams
        if remainder > 0:
            # Duplicate randomly chosen pairs to fill out the last batch.
            num_needed = batch_num_streams - remainder
            for i in range(num_needed):
                choice = random.randint(0, num_pairs - 1)
                self.image_sentence_pairs.append(self.image_sentence_pairs[choice])
        assert len(self.image_sentence_pairs) % batch_num_streams == 0
    if shuffle:
        random.shuffle(self.image_sentence_pairs)
    self.pad = pad
    self.truncate = truncate
    # Streams padded with -1 rather than 0.
    self.negative_one_padded_streams = frozenset(('input_sentence', 'target_sentence'))
def __init__(self, filenames, dset, batch_num_streams=1, max_frames=MAX_FRAMES, align=True, shuffle=True, pad=True, truncate=True):
    """Sequence generator over videos' pooled fc7 features and sentences.

    ``filenames`` is an iterable of (pooled-feature file, sentence file)
    pairs. Feature lines are assigned sequential ids 'vidN' whose starting
    N depends on the split name ``dset`` (train: 1, val: 1201, test: 1301).
    Sentence files are tab-separated '<id>\\t<sentence>' lines.
    """
    self.max_frames = max_frames
    self.lines = []
    num_empty_lines = 0
    self.vid_poolfeats = {}  # video id -> list of pooled-feature lines
    for poolfeatfile, sentfile in filenames:
        print 'Reading pooled features from file: %s' % poolfeatfile
        # First video number for this split; ids are global across splits.
        if dset == 'train':
            vid_num = 1
        elif dset == 'val':
            vid_num = 1201
        elif dset == 'test':
            vid_num = 1301
        else:
            raise Exception('Unknown video data split name: %s' % dset)
        with open(poolfeatfile, 'rb') as poolfd:
            # each line has the fc7 mean of 1 video
            for line in poolfd:
                line = line.strip()
                video_id = 'vid%d' % vid_num
                if video_id not in self.vid_poolfeats:
                    self.vid_poolfeats[video_id] = []
                self.vid_poolfeats[video_id].append(line)
                vid_num += 1
        # reset max_words based on maximum frames in the video
        print 'Reading sentences in: %s' % sentfile
        with open(sentfile, 'r') as sentfd:
            for line in sentfd:
                line = line.strip()
                id_sent = line.split('\t')
                # Lines without both an id and a sentence are skipped.
                if len(id_sent) < 2:
                    num_empty_lines += 1
                    continue
                self.lines.append((id_sent[0], id_sent[1]))
    if num_empty_lines > 0:
        print 'Warning: ignoring %d empty lines.' % num_empty_lines
    # Bookkeeping counters.
    self.line_index = 0
    self.num_resets = 0
    self.num_truncates = 0
    self.num_pads = 0
    self.num_outs = 0
    self.frame_list = []
    SequenceGenerator.__init__(self)
    self.batch_num_streams = batch_num_streams  # needed in hdf5 to seq
    # make the number of image/sentence pairs a multiple of the buffer size
    # so each timestep of each batch is useful and we can align the images
    # NOTE(review): alignment here uses module-level BUFFER_SIZE, while the
    # sibling generators align to batch_num_streams -- confirm intentional.
    if align:
        num_pairs = len(self.lines)
        remainder = num_pairs % BUFFER_SIZE
        if remainder > 0:
            num_needed = BUFFER_SIZE - remainder
            for i in range(num_needed):
                choice = random.randint(0, num_pairs - 1)
                self.lines.append(self.lines[choice])
        assert len(self.lines) % BUFFER_SIZE == 0
    if shuffle:
        random.shuffle(self.lines)
    self.pad = pad
    self.truncate = truncate
def __init__(self, coco,split_name,batch_num_streams, image_root, vocab=None, max_words=MAX_WORDS, align=True, shuffle=True, gt_captions=True, pad=True, truncate=True, split_ids=None): #split_ids: image list e.g. '2801146217_03a0b59ccb.jpg\n', '1321723162_9d4c78b8af.jpg\n' # # self.max_words = max_words num_empty_lines = 0 self.images = [] num_total = 0 num_missing = 0 num_captions = 0 known_images = {} #self.coco = coco self.image_path_to_id = {} self.image_sentence_pairs = [] ### ## generate image_id list, which will be used in retrieval experiments ### if split_ids is None: split_ids = coco.imgs.keys() self.image_path_to_id = {} # pdb.set_trace() # print split_ids for image_id in split_ids: image_path = '%s/%s.jpg' % (image_root, image_id) # print 'image_info ',image_info #print 'image_path ',image_path self.image_path_to_id[image_path] = image_id #print self.image_path_to_id ###'./data/flickr8K/images/train/143688895_e837c3bc76.jpg': '143688895_e837c3bc76', self.image_sentence_pairs=split_image_captions(split_name,image_root) #print image_sentence_pairs #generate word vocabulary based on image caption sentences if vocab is None: self.init_vocabulary(self.image_sentence_pairs) else: self.vocabulary_inverted = vocab self.vocabulary = {} for index, word in enumerate(self.vocabulary_inverted): self.vocabulary[word] = index self.index = 0 self.num_resets = 0 self.num_truncates = 0 self.num_pads = 0 self.num_outs = 0 self.image_list = [] SequenceGenerator.__init__(self) self.batch_num_streams = batch_num_streams # make the number of image/sentence pairs a multiple of the buffer size # so each timestep of each batch is useful and we can align the images if align: num_pairs = len(self.image_sentence_pairs) #pdb.set_trace() print 'number of pairs: ', num_pairs remainder = num_pairs % batch_num_streams if remainder > 0: num_needed = batch_num_streams - remainder for i in range(num_needed): choice = random.randint(0, num_pairs - 1) 
self.image_sentence_pairs.append(self.image_sentence_pairs[choice]) assert len(self.image_sentence_pairs) % batch_num_streams == 0 if shuffle: random.shuffle(self.image_sentence_pairs) self.pad = pad self.truncate = truncate self.negative_one_padded_streams = frozenset(('input_sentence', 'target_sentence'))