def build_vocabulary(config):
    """Create the vocabulary from the training captions and write it to disk.

    The caption set loaded from ``config.train_caption_file`` is passed
    through ``filter_by_cap_len`` with ``config.max_caption_length`` before
    the vocabulary is built. The result is saved to
    ``config.vocabulary_file`` and returned.
    """
    caption_data = COCO(config.train_caption_file)
    caption_data.filter_by_cap_len(config.max_caption_length)

    vocab = Vocabulary(config.vocabulary_size)
    all_captions = caption_data.all_captions()
    vocab.build(all_captions)
    vocab.save(config.vocabulary_file)
    return vocab
def get_vocabulary(cls, vocab_path, captions_path, tokenized_captions, threshold=1):
    """Return the vocabulary stored at ``vocab_path``, building it if absent.

    When ``vocab_path`` exists it is loaded and returned directly.
    Otherwise the vocabulary is constructed with ``cls.build_vocab`` from
    ``captions_path`` / ``tokenized_captions`` using ``threshold``, saved to
    ``vocab_path`` and then returned.
    """
    # Fast path: a previously saved vocabulary is reused as-is.
    if os.path.exists(vocab_path):
        return Vocabulary.load(vocab_path)

    built = cls.build_vocab(captions_path, tokenized_captions, threshold)
    # TODO: check if saving is safe
    Vocabulary.save(built, vocab_path)
    print("Saved the vocabulary to '%s'" % vocab_path)
    return built
def prepare_train_data(config):
    """Prepare the data for training the model.

    Reads the VQA questions and answers, builds (or loads) the vocabulary,
    converts questions and answers to index arrays (cached on disk) and
    wraps the result in a ``DataSet``.

    Args:
        config: configuration object. This function reads the
            ``train_answer_file``, ``train_question_file``,
            ``max_question_length``, ``train_image_dir``,
            ``temp_train_annotation_file``, ``vocabulary_file``,
            ``temp_train_data_file`` and ``batch_size`` attributes and
            writes ``vocabulary_size``.

    Returns:
        A ``(dataset, config)`` tuple; ``config.vocabulary_size`` has been
        set to the size of the vocabulary.
    """
    vqa = VQA(config.train_answer_file, config.train_question_file)
    vqa.filter_by_ques_len(config.max_question_length)
    vqa.filter_by_ans_len(1)

    print("Reading the questions and answers...")
    annotations = process_vqa(vqa,
                              'COCO_train2014',
                              config.train_image_dir,
                              config.temp_train_annotation_file)
    image_files = annotations['image_file'].values
    questions = annotations['question'].values
    question_ids = annotations['question_id'].values
    answers = annotations['answer'].values
    print("Questions and answers read.")
    print("Number of questions = %d" % (len(question_ids)))

    print("Building the vocabulary...")
    vocabulary = Vocabulary()
    if not os.path.exists(config.vocabulary_file):
        for question in tqdm(questions):
            vocabulary.add_words(word_tokenize(question))
        for answer in tqdm(answers):
            vocabulary.add_words(word_tokenize(answer))
        vocabulary.compute_frequency()
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    config.vocabulary_size = vocabulary.size

    print("Processing the questions and answers...")
    if not os.path.exists(config.temp_train_data_file):
        question_word_idxs, question_lens = process_questions(
            questions, vocabulary, config)
        answer_idxs = process_answers(answers, vocabulary)
        data = {
            'question_word_idxs': question_word_idxs,
            'question_lens': question_lens,
            'answer_idxs': answer_idxs,
        }
        np.save(config.temp_train_data_file, data)
    else:
        # The cache is a pickled dict written by np.save above. NumPy
        # >= 1.16.3 refuses to unpickle unless allow_pickle=True is given.
        # Only load cache files produced by this function.
        data = np.load(config.temp_train_data_file,
                       allow_pickle=True).item()
        question_word_idxs = data['question_word_idxs']
        question_lens = data['question_lens']
        answer_idxs = data['answer_idxs']
    print("Questions and answers processed.")

    print("Building the dataset...")
    dataset = DataSet(image_files,
                      question_word_idxs,
                      question_lens,
                      question_ids,
                      config.batch_size,
                      answer_idxs,
                      True,
                      True)
    print("Dataset built.")
    return dataset, config
def build_vocabulary(config, max_ann_num=None):
    """Build the vocabulary from the training data and save it to a file.

    Args:
        config: configuration object. This function reads the
            ``train_caption_file``, ``max_train_ann_num``,
            ``max_caption_length``, ``vocabulary_size`` and
            ``vocabulary_file`` attributes.
        max_ann_num: optional cap on the number of annotations/captions to
            use. When ``None`` (the default) ``config.max_train_ann_num``
            is used, which matches the previous behaviour. A falsy limit
            means "use every caption".

    Returns:
        The built ``Vocabulary``, which has also been saved to
        ``config.vocabulary_file``.
    """
    # The parameter used to be accepted but silently ignored; honour it
    # when the caller supplies one and fall back to the config otherwise.
    limit = max_ann_num if max_ann_num is not None else config.max_train_ann_num

    coco = COCO(config.train_caption_file, limit)
    coco.filter_by_cap_len(config.max_caption_length)

    captions = coco.all_captions()
    if limit:
        captions = captions[:limit]

    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.build(captions)
    vocabulary.save(config.vocabulary_file)
    return vocabulary
def build_vocabulary(config, captions, oracle_file):
    """Build the vocabulary from ``captions`` and encode them as index rows.

    The vocabulary is built and saved to ``config.vocabulary_file`` on every
    call. The captions are then converted to rows of
    ``[_START_] + word indices + [_END_] + [_PAD_]...`` (cached in
    ``config.temp_data_file``) and, if requested, dumped as text.

    Args:
        config: configuration object. This function reads the
            ``vocabulary_size``, ``ctrl_symbols``, ``vocabulary_file``,
            ``temp_data_file``, ``max_caption_length``, ``_START_``,
            ``_END_`` and ``_PAD_`` attributes.
        captions: sized iterable of captions accepted by
            ``Vocabulary.build`` and ``Vocabulary.process_sentence``.
        oracle_file: path of a text file to write the index rows to (one
            row per line, every index followed by a space), or ``None`` to
            skip the dump.

    Returns:
        The built ``Vocabulary``.
    """
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    # NOTE: the "load if config.vocabulary_file exists" shortcut was
    # deliberately disabled (the guard was hard-wired to True), so the
    # vocabulary is always rebuilt. That behaviour is kept.
    vocabulary.build(captions)
    vocabulary.save(config.vocabulary_file)
    print("Number of words = %d" % (vocabulary.size))

    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        sent_lens = []
        for caption in captions:
            current_word_idxs, current_length = vocabulary.process_sentence(
                caption)
            # Two slots are reserved for the start and end symbols.
            current_num_words = min(config.max_caption_length - 2,
                                    current_length)
            # Negative for over-long captions; list * negative is simply [].
            pad_length = config.max_caption_length - current_length - 2
            current_word_idxs = ([config._START_]
                                 + current_word_idxs[:current_num_words]
                                 + [config._END_]
                                 + [config._PAD_] * pad_length)
            word_idxs.append(current_word_idxs)
            sent_lens.append(current_num_words + 2)
        word_idxs = np.array(word_idxs)
        data = {'word_idxs': word_idxs, 'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict written by np.save above. NumPy
        # >= 1.16.3 refuses to unpickle unless allow_pickle=True is given.
        # Only load cache files produced by this function.
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        sent_lens = data['sentence_len']

    if oracle_file is not None:
        with open(oracle_file, 'w') as outfile:
            # join() avoids the quadratic cost of repeated string +=.
            outfile.write(''.join(
                ''.join(str(word) + ' ' for word in line) + '\n'
                for line in word_idxs))
    return vocabulary
def main():
    """Count words over every file listed in ``args.input_files``.

    ``args.input_files`` names a text file containing one input path per
    line. Each listed file is split on whitespace and every token is fed to
    ``Vocabulary.observe_word``. A snapshot is saved to
    ``<output_prefix>_partial`` every 1000 files (including before the first
    one) and the final counts are saved to ``<output_prefix>``.
    """
    word2cnt = Vocabulary()
    args = get_args()
    output_prefix = args.output_prefix.strip()

    # Context managers make sure both the list file and every input file
    # are closed, even if processing one of them raises.
    with open(args.input_files.strip(), 'r') as file_list:
        for i, path_line in enumerate(file_list):
            if i % 100 == 0:
                print(i)
            if i % 1000 == 0:
                word2cnt.save(output_prefix + "_partial")
            with open(path_line.rstrip(), 'r') as f:
                for text_line in f:
                    for word in text_line.strip().split():
                        word2cnt.observe_word(word)

    word2cnt.save(output_prefix)
def build_vocabulary(config):
    """Create the vocabulary from the training questions and answers.

    The VQA data is passed through ``filter_by_ques_len`` (with
    ``config.max_question_length``) and ``filter_by_ans_len(1)``. Every
    remaining question and its ``best_answer`` is then tokenized and added
    to the vocabulary, which is saved to ``config.vocabulary_file`` and
    returned.
    """
    vqa = VQA(config.train_answer_file, config.train_question_file)
    vqa.filter_by_ques_len(config.max_question_length)
    vqa.filter_by_ans_len(1)

    kept_ids = list(vqa.qa.keys())
    question_texts = [vqa.qqa[qid]['question'] for qid in kept_ids]
    answer_texts = [vqa.qa[qid]['best_answer'] for qid in kept_ids]

    vocab = Vocabulary()
    # Questions first, then answers; each pass gets its own progress bar.
    for texts in (question_texts, answer_texts):
        for text in tqdm(texts):
            tokens = word_tokenize(text)
            vocab.add_words(tokens)

    vocab.compute_frequency()
    vocab.save(config.vocabulary_file)
    return vocab
def prepare_train_data(config):
    """Build or load the vocabulary and the caption annotation table.

    Creates ``config.prepare_annotation_dir`` if needed, builds the
    vocabulary from the combined train/val captions (or loads it from
    ``config.vocabulary_file``), and builds the caption/image table (or
    loads it from ``config.train_csv_file``).

    NOTE(review): the visible body ends after the annotation table is
    loaded and returns nothing -- confirm whether the function continues
    beyond this point in the full file.
    """
    annotation_dir = config.prepare_annotation_dir
    if not os.path.exists(annotation_dir):
        os.mkdir(annotation_dir)

    coco = COCO(config, config.train_caption_file, config.val_caption_file)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if os.path.exists(config.vocabulary_file):
        vocabulary.load(config.vocabulary_file)
    else:
        coco.filter_by_cap_len(config.max_caption_length)
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
        vocabulary.save_counts(config.word_count_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")
    if os.path.exists(config.train_csv_file):
        annotations = pd.read_csv(config.train_csv_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values
    else:
        coco.filter_by_words(set(vocabulary.words))
        captions = []
        image_ids = []
        for ann_id in coco.anns:
            record = coco.anns[ann_id]
            captions.append(record['caption'])
            image_ids.append(record['image_id'])

        image_files = []
        for image_id in image_ids:
            file_name = coco.imgs[image_id]['file_name']
            # The split sub-directory is picked from the file name itself.
            if 'train2014' in file_name:
                split_dir = 'train'
            else:
                split_dir = 'val'
            image_files.append(
                os.path.join(config.dataset_image_dir, split_dir, file_name))

        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.train_csv_file)
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds (or loads) the vocabulary, the caption annotation table and the
    fixed-length word-index/mask arrays, caching each stage at the path
    named in ``config``, then wraps everything in a ``DataSet``.

    Args:
        config: configuration object. This function reads the
            ``train_caption_file``, ``max_caption_length``,
            ``vocabulary_size``, ``vocabulary_file``, ``train_image_dir``,
            ``temp_annotation_file``, ``temp_data_file`` and ``batch_size``
            attributes.

    Returns:
        The ``DataSet`` built from the image ids, image files, word indices
        and masks.
    """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions,
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            # Zero-padded row of indices plus a mask that is 1.0 on the
            # positions holding real words.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict written by np.save above. NumPy
        # >= 1.16.3 refuses to unpickle unless allow_pickle=True is given.
        # Only load cache files produced by this function.
        data = np.load(config.temp_data_file,
                       allow_pickle=True,
                       encoding="latin1").item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      True,
                      True)
    print("Dataset built.")
    return dataset
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds (or loads) the vocabulary, the caption annotation table and the
    fixed-length word-index/mask arrays, caching each stage at the path
    named in ``config``. When ``config.train_data_count_limit`` is positive
    only that many leading examples are kept and their image paths are
    written to ``train_images.csv`` in the working directory.

    Args:
        config: configuration object. This function reads the
            ``train_caption_file``, ``max_caption_length``,
            ``vocabulary_size``, ``vocabulary_file``, ``train_image_dir``,
            ``temp_annotation_file``, ``temp_data_file``,
            ``train_data_count_limit`` and ``batch_size`` attributes.

    Returns:
        The ``DataSet`` built from the (possibly truncated) image ids,
        image files, word indices and masks.
    """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions,
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            # Zero-padded row of indices plus a mask that is 1.0 on the
            # positions holding real words.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict written by np.save above. NumPy
        # >= 1.16.3 refuses to unpickle unless allow_pickle=True is given.
        # Only load cache files produced by this function.
        data = np.load(config.temp_data_file,
                       allow_pickle=True,
                       encoding='latin1').item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    if (config.train_data_count_limit > 0):
        print("-----------------------------------------------")
        print("Restricting Sz:\t", config.train_data_count_limit)
        print("Batch Sz:\t", config.batch_size)
        image_ids = image_ids[0:config.train_data_count_limit]
        image_files = image_files[0:config.train_data_count_limit]
        word_idxs = word_idxs[0:config.train_data_count_limit]
        masks = masks[0:config.train_data_count_limit]

        # Dump the image paths to a file. Iterate over the already
        # truncated list instead of range(limit): indexing by the limit
        # raised IndexError whenever the limit exceeded the data size.
        filepath = 'train_images.csv'
        with open(filepath, 'w') as file_handler:
            for image_file in image_files:
                file_handler.write("{}\n".format(image_file))
        print("-----------------------------------------------")

    dataset = DataSet(image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      True,
                      True)
    print("Dataset built.")
    return dataset
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds (or loads) the vocabulary, the caption annotation table and the
    ``[_START_] + words + [_END_] + padding`` index rows, caching each stage
    at the path named in ``config``, then wraps everything in a ``DataSet``.

    Args:
        config: configuration object. This function reads the
            ``train_caption_file``, ``ignore_file``, ``vocabulary_size``,
            ``ctrl_symbols``, ``vocabulary_file``, ``train_image_dir``,
            ``temp_annotation_file``, ``temp_data_file``,
            ``max_caption_length``, ``_START_``, ``_END_``, ``_PAD_`` and
            ``batch_size`` attributes.

    Returns:
        The ``DataSet`` built from the COCO object, vocabulary, image ids,
        image files, word indices, masks and sentence lengths. ``masks`` is
        ``None`` when the encoded data came from the cache file.
    """
    coco = COCO(config.train_caption_file, config.ignore_file)

    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)

    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = []
        image_ids = []
        image_files = []
        # NOTE(review): rows are unpacked positionally as four columns and
        # the third one is used as the image file -- confirm this matches
        # the column order of the CSV actually on disk.
        for row_id, _unused, feat, cap in annotations.values:
            image_ids.append(row_id)
            image_files.append(feat)
            captions.append(cap)

    print("NUM CAPTIONS: " + str(len(captions)))

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        sent_lens = []
        for caption in tqdm(captions):
            current_word_idxs, current_length = vocabulary.process_sentence(
                caption)
            # Two slots are reserved for the start and end symbols.
            current_num_words = min(config.max_caption_length - 2,
                                    current_length)
            current_word_idxs = ([config._START_]
                                 + current_word_idxs[:current_num_words]
                                 + [config._END_])
            pad_length = config.max_caption_length - current_num_words - 2
            if pad_length > 0:
                current_word_idxs += [config._PAD_] * (pad_length)
            current_masks = np.zeros(config.max_caption_length)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
            sent_lens.append(current_num_words + 2)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs,
                'masks': masks,
                'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict written by np.save above. NumPy
        # >= 1.16.3 refuses to unpickle unless allow_pickle=True is given.
        # Only load cache files produced by this function.
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = None  # the cached masks are intentionally not used
        sent_lens = data['sentence_len']

    dataset = DataSet(coco,
                      vocabulary,
                      image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      sent_lens,
                      True,
                      True)
    return dataset
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds (or loads) the vocabulary, the caption annotation table and the
    fixed-length word-index/mask arrays, then wraps everything in a
    ``DataSet``. When ``config.distributed`` is set, the captions are
    restricted to the images present in ``config.train_image_dir`` and
    neither the annotation table nor the encoded data is cached to disk.

    Args:
        config: configuration object. This function reads the
            ``train_caption_file``, ``max_caption_length``, ``distributed``,
            ``train_image_dir``, ``vocabulary_size``, ``vocabulary_file``,
            ``temp_annotation_file``, ``temp_data_file`` and ``batch_size``
            attributes.

    Returns:
        The ``DataSet`` built from the image ids, image files, word indices
        and masks.
    """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    if config.distributed:
        images = os.listdir(config.train_image_dir)
        # Characters 15..26 of each file name are parsed as the numeric
        # image id.
        ids = [int(x[15:27]) for x in images]
        # Written as a call with a single parenthesised argument so the
        # line is valid on both Python 2 and Python 3, like the other
        # prints in this function.
        print('Input Path: ' + config.train_image_dir +
              ' Number of files in input path: ' + str(int(len(ids))))

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    if config.distributed:
        print('Filter captions by images')
        coco.filter_by_images(ids)

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions,
        })
        annotations.set_index('image_id', inplace=True)
        if config.distributed:
            # `ids` only exists in distributed mode; selecting by it
            # unconditionally raised NameError otherwise.
            annotations = annotations.loc[ids]
        else:
            annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            # Zero-padded row of indices plus a mask that is 1.0 on the
            # positions holding real words.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        if not config.distributed:
            np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict written by np.save above. NumPy
        # >= 1.16.3 refuses to unpickle unless allow_pickle=True is given.
        # Only load cache files produced by this function.
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      True,
                      True)
    print("Dataset built.")
    return dataset
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds (or loads) the vocabulary, the caption annotation table and the
    fixed-length word-index/mask arrays, caching each stage at the path
    named in ``config``, then wraps everything in a ``DataSet``.

    Captions are not filtered by length in this function, so any caption
    with more than ``config.max_caption_length`` tokens is truncated to
    that length when it is encoded.

    Args:
        config: configuration object. This function reads the
            ``train_caption_file``, ``vocabulary_size``,
            ``vocabulary_file``, ``train_image_dir``,
            ``temp_annotation_file``, ``temp_data_file``,
            ``max_caption_length`` and ``batch_size`` attributes.

    Returns:
        The ``DataSet`` built from the image ids, image files, word indices
        and masks.
    """
    coco = COCO(config.train_caption_file)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % vocabulary.size)

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            # Clamp to the row width: assigning a longer sequence into the
            # fixed-size row raised a broadcast ValueError.
            current_num_words = min(len(current_word_idxs_),
                                    config.max_caption_length)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_[:current_num_words])
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # allow_pickle=True is required because the cache is a pickled
        # dict written by np.save above. Only load cache files produced by
        # this function.
        data = np.load(config.temp_data_file,
                       allow_pickle=True,
                       encoding="latin1").item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      True,
                      True)
    print("Dataset built.")
    return dataset
def prepare_train_data(config):
    """Prepare the data for training the model.

    Builds (or loads) the caption annotation table, the vocabulary and the
    fixed-length word-index/mask arrays, caching each stage at the path
    named in ``config``, then wraps everything in a ``DataSet``.

    When the annotation table has to be built, only the first 999
    annotations are considered, and only those whose image file exists
    under ``config.train_image_dir`` are kept. Captions with more than
    ``config.max_caption_length`` tokens are truncated to that length when
    they are encoded.

    Args:
        config: configuration object. This function reads the
            ``vocabulary_size``, ``temp_annotation_file``,
            ``train_image_dir``, ``vocabulary_file``, ``temp_data_file``,
            ``max_caption_length`` and ``batch_size`` attributes.

    Returns:
        The ``DataSet`` built from the image ids, image files, word indices
        and masks.
    """
    vocabulary = Vocabulary(config.vocabulary_size)
    print("Vocabulary complete.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        # NOTE(review): `coco` is not defined inside this function and the
        # vocabulary has not been built or loaded yet at this point --
        # confirm that `coco` is a module-level object and that filtering
        # by the vocabulary's current words is intended here.
        coco.filter_by_words(set(vocabulary.words))
        print("Filtering the captions to those that exist")
        captions = []
        image_ids = []
        for position, ann_id in enumerate(coco.anns, start=1):
            # Only the first 999 annotations are examined; stop scanning
            # once that cap is reached.
            if position >= 1000:
                break
            entry = coco.anns[ann_id]
            should_add = entry['image_id'] in coco.imgs and os.path.exists(
                os.path.join(config.train_image_dir,
                             coco.imgs[entry['image_id']]['file_name']))
            if should_add:
                captions.append(entry['caption'])
                image_ids.append(entry['image_id'])
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions,
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        print("Loading the captions from ", config.temp_annotation_file)
        annotations = pd.read_csv(config.temp_annotation_file,
                                  encoding='latin-1')
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.vocabulary_file):
        print("Building the vocabulary...")
        vocabulary.build(captions)
        vocabulary.save(config.vocabulary_file)
    else:
        print("Loading the vocabulary from ", config.vocabulary_file)
        vocabulary.load(config.vocabulary_file)

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            # Clamp to the row width: assigning a longer sequence into the
            # fixed-size row raised a broadcast ValueError.
            current_num_words = min(len(current_word_idxs_),
                                    config.max_caption_length)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_[:current_num_words])
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # allow_pickle=True is required because the cache is a pickled
        # dict written by np.save above. Only load cache files produced by
        # this function.
        loaded = np.load(config.temp_data_file, allow_pickle=True)
        data = loaded.item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      True,
                      True)
    print("Dataset built.")
    return dataset