Example #1
def build_vocabulary(config):
    """ Build the vocabulary from the training data and save it to a file. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.build(coco.all_captions())
    vocabulary.save(config.vocabulary_file)
    return vocabulary
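All of these examples lean on a Vocabulary helper with build, save, load, process_sentence, size, and words members, which none of the snippets define. A minimal sketch of that assumed interface follows (hypothetical, for orientation only; note that Examples #3 and #7 construct it with an extra ctrl_symbols argument and their process_sentence returns an (indices, length) pair instead):

import json
from collections import Counter

class Vocabulary(object):
    """ Hypothetical sketch of the helper the snippets in this collection assume. """

    def __init__(self, max_size):
        self.max_size = max_size   # the configured vocabulary_size
        self.words = []
        self.word2idx = {}
        self.size = 0

    def build(self, captions):
        # Keep the most frequent words, up to max_size
        counts = Counter(w for c in captions for w in c.lower().split())
        self.words = [w for w, _ in counts.most_common(self.max_size)]
        self.word2idx = {w: i for i, w in enumerate(self.words)}
        self.size = len(self.words)

    def process_sentence(self, sentence):
        # Map a caption to word indices, dropping out-of-vocabulary words
        return [self.word2idx[w] for w in sentence.lower().split()
                if w in self.word2idx]

    def save(self, path):
        with open(path, 'w') as f:
            json.dump(self.words, f)

    def load(self, path):
        with open(path) as f:
            self.words = json.load(f)
        self.word2idx = {w: i for i, w in enumerate(self.words)}
        self.size = len(self.words)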
Example #2
def build_vocabulary(config, max_ann_num=None):
    """ Build the vocabulary from the training data and save it to a file. """
    # Note: the max_ann_num parameter is never used; the cap is read from
    # config.max_train_ann_num instead.
    coco = COCO(config.train_caption_file, config.max_train_ann_num)
    coco.filter_by_cap_len(config.max_caption_length)

    vocabulary = Vocabulary(config.vocabulary_size)
    if config.max_train_ann_num:
        # Build from a capped number of captions
        vocabulary.build(coco.all_captions()[:config.max_train_ann_num])
    else:
        vocabulary.build(coco.all_captions())
    vocabulary.save(config.vocabulary_file)
    return vocabulary
Example #3
def build_vocabulary(config, captions, oracle_file):
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(captions)
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Number of words = %d" % (vocabulary.size))

    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        sent_lens = []
        for caption in captions:
            current_word_idxs, current_length = vocabulary.process_sentence(
                caption)
            # Reserve two slots for the <START> and <END> control symbols
            current_num_words = min(config.max_caption_length - 2,
                                    current_length)

            # Pad relative to the truncated length so pad_length is never negative
            pad_length = config.max_caption_length - current_num_words - 2
            current_word_idxs = ([config._START_]
                                 + current_word_idxs[:current_num_words]
                                 + [config._END_]
                                 + [config._PAD_] * pad_length)

            word_idxs.append(current_word_idxs)
            sent_lens.append(current_num_words + 2)
        word_idxs = np.array(word_idxs)
        data = {'word_idxs': word_idxs, 'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        sent_lens = data['sentence_len']

    if oracle_file is not None:
        with open(oracle_file, 'w') as outfile:
            # Write one space-separated row of word ids per caption
            for line in word_idxs:
                outfile.write(' '.join(str(word) for word in line) + '\n')

    return vocabulary
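A small worked illustration of the encoding above, with assumed control-symbol ids _START_ = 1, _END_ = 2, _PAD_ = 0 and max_caption_length = 8 (the real values come from the config):

max_caption_length = 8
_START_, _END_, _PAD_ = 1, 2, 0          # assumed ids, for illustration only

word_idxs, length = [17, 5, 93], 3       # indices for, say, "a brown dog"
num_words = min(max_caption_length - 2, length)
pad_length = max_caption_length - num_words - 2
encoded = [_START_] + word_idxs[:num_words] + [_END_] + [_PAD_] * pad_length
print(encoded)         # [1, 17, 5, 93, 2, 0, 0, 0]
print(num_words + 2)   # 5 -- the stored sentence length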
Example #4
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    if not os.path.exists(config.prepare_annotation_dir):
        os.mkdir(config.prepare_annotation_dir)
    coco = COCO(config, config.train_caption_file, config.val_caption_file)
    
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        coco.filter_by_cap_len(config.max_caption_length)
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
        vocabulary.save_counts(config.word_count_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" %(vocabulary.size))

    
    print("Processing the captions...")
    if not os.path.exists(config.train_csv_file):
        coco.filter_by_words(set(vocabulary.words))
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(
                config.dataset_image_dir,
                # Route to the train or val folder based on the file name
                'train' if 'train2014' in coco.imgs[image_id]['file_name'] else 'val',
                coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.train_csv_file)
    else:
        annotations = pd.read_csv(config.train_csv_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values
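One pitfall in this caching pattern: DataFrame.to_csv writes the row index as an extra leading column by default, so a later read_csv sees four columns rather than three. Reading by column name, as the else branch does, is robust to that; passing index=False on save (or index_col=0 on load) makes the round trip exact:

import pandas as pd

df = pd.DataFrame({'image_id': [9], 'image_file': ['img/9.jpg'], 'caption': ['a cat']})
df.to_csv('anns.csv', index=False)   # without index=False, a stray index column appears
back = pd.read_csv('anns.csv')
assert list(back.columns) == ['image_id', 'image_file', 'caption']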
Example #5
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding="latin1").item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset
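The 0/1 masks built above mark which positions of each fixed-length row hold real words. How DataSet consumes them isn't shown; a typical use (an assumption) is to zero out the loss on padded timesteps:

import numpy as np

per_step_loss = np.random.rand(2, 5)            # stand-in for per-word losses
masks = np.array([[1, 1, 1, 0, 0],
                  [1, 1, 1, 1, 1]], dtype=np.float32)
loss = (per_step_loss * masks).sum() / masks.sum()   # average over real words only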
Example #6
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding='latin1').item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    if config.train_data_count_limit > 0:
        print("-----------------------------------------------")
        print("Restricting Sz:\t", config.train_data_count_limit)
        print("Batch Sz:\t", config.batch_size)
        image_ids = image_ids[:config.train_data_count_limit]
        image_files = image_files[:config.train_data_count_limit]
        word_idxs = word_idxs[:config.train_data_count_limit]
        masks = masks[:config.train_data_count_limit]
        # Dump the retained image paths to a file
        filepath = 'train_images.csv'
        with open(filepath, 'w') as file_handler:
            for image_file in image_files:
                file_handler.write("{}\n".format(image_file))
        print("-----------------------------------------------")

    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset
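A caveat that applies to every temp_data_file cache in these examples: np.save appends a .npy extension when the target name lacks one, while the os.path.exists check and np.load use the literal name, so a suffix-less config value means the cache is written but never found again. Giving temp_data_file a .npy suffix keeps the three calls in agreement:

import os
import numpy as np

temp_data_file = 'train_data.npy'   # keep the .npy suffix so save/exists/load agree
np.save(temp_data_file, {'word_idxs': [], 'masks': []})
assert os.path.exists(temp_data_file)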
Example #7
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file, config.ignore_file)
    #coco.filter_by_cap_len(config.max_caption_length)

    #print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    #print("Vocabulary built.")
    #print("Number of words = %d" %(vocabulary.size))

    #coco.filter_by_words(set(vocabulary.words))

    #print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                                    for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        # Read columns by name; positional unpacking of annotations.values
        # silently depends on column order and picks up the CSV's saved index.
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        sent_lens = []
        for caption in tqdm(captions):
            current_word_idxs, current_length = vocabulary.process_sentence(caption)
            # Reserve two slots for the <START> and <END> control symbols
            current_num_words = min(config.max_caption_length - 2, current_length)

            current_word_idxs = [config._START_] + current_word_idxs[:current_num_words] + [config._END_]
            pad_length = config.max_caption_length - current_num_words - 2
            if pad_length > 0:
                current_word_idxs += [config._PAD_] * pad_length
            current_masks = np.zeros(config.max_caption_length)
            current_masks[:current_num_words] = 1.0

            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
            sent_lens.append(current_num_words+2)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks, 'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = None #data['masks']
        sent_lens = data['sentence_len']
    #print("Captions processed.")
    #print("Number of captions = %d" %(len(captions)))
    #print("Number of word_idxs = %d" %(len(word_idxs)))
    #print("Number of sent_lens = %d" %(len(sent_lens)))
    dataset = DataSet(coco,
                      vocabulary,
                      image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      sent_lens,
                      True,
                      True)
    return dataset
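This variant stores explicit sentence lengths next to the padded rows; the usual point of sent_lens (an assumption, since DataSet isn't shown) is to recover or process only the real tokens of each caption:

import numpy as np

word_idxs = np.array([[1, 17, 5, 93, 2, 0, 0, 0]])   # one padded row, as above
sent_lens = [5]

for row, n in zip(word_idxs, sent_lens):
    real_tokens = row[:n]    # [1, 17, 5, 93, 2] -- <START> words <END>, no padding
    print(real_tokens)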
Example #8
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)
    if config.distributed:
        # COCO 2014 file names look like COCO_train2014_000000391895.jpg;
        # the characters at [15:27] are the zero-padded 12-digit image id.
        images = os.listdir(config.train_image_dir)
        ids = [int(x[15:27]) for x in images]
        print('Input Path: ' + config.train_image_dir +
              ' Number of files in input path: ' + str(len(ids)))

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    if config.distributed:
        print('Filter captions by images')
        coco.filter_by_images(ids)
        #print(coco.getImgIds(ids))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        if config.distributed:
            # 'ids' is only defined in distributed mode; restrict the frame
            # to the images present on this worker.
            annotations.set_index('image_id', inplace=True)
            annotations = annotations.loc[ids]
        else:
            annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        if not config.distributed:
            np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    #print "Input Path: " + config.train_image_dir + " Number of files after data preparation: " + str(len(image_files))
    #print "Images IDs to be used on this server: " + str(image_ids)
    return dataset
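The fixed slice x[15:27] above assumes every directory entry is named exactly like COCO_train2014_000000391895.jpg; any stray file makes the int() call raise. A more defensive sketch (hypothetical helper) extracts the 12-digit id with a regular expression and skips non-matching names:

import os
import re

COCO_ID = re.compile(r'_(\d{12})\.jpg$')

def image_ids_in(directory):
    """ Extract 12-digit COCO image ids, skipping files that don't match. """
    ids = []
    for name in os.listdir(directory):
        match = COCO_ID.search(name)
        if match:
            ids.append(int(match.group(1)))
    return ids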
Example #9
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    coco = COCO(config.train_caption_file)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % vocabulary.size)

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # allow_pickle=True is required on NumPy >= 1.16.3 to load the
        # pickled dict saved above.
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding="latin1").item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    # RV
    # Select the first 30000 captions from the shuffled set
    # num_examples = 5000
    # word_idxs = word_idxs[:num_examples]
    # image_files = image_files[:num_examples]

    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      True,
                      True)
    print("Dataset built.")
    return dataset
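The allow_pickle=True here reflects a real change: NumPy 1.16.3 flipped np.load's default to allow_pickle=False for security, which breaks loading the pickled dict these caches store. Passing the flag at the call site is cleaner than monkey-patching np.load; a guarded helper:

import numpy as np

def load_cached(path):
    # Only enable pickle loading for caches this code wrote itself;
    # pickled files from untrusted sources can execute arbitrary code.
    return np.load(path, allow_pickle=True, encoding='latin1').item()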
Example #10
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    # Construct the COCO wrapper used below (as in the other examples)
    coco = COCO(config.train_caption_file)
    vocabulary = Vocabulary(config.vocabulary_size)

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        coco.filter_by_words(set(vocabulary.words))
        print("Filtering the captions to those that exist")
        captions = []
        image_ids = []
        count = 0
        for ann_id in coco.anns:
            count += 1
            if count < 1000:  # only scan the first annotations (quick subset)
                entry = coco.anns[ann_id]
                # Keep the caption only if its image record and file both exist
                should_add = entry['image_id'] in coco.imgs and os.path.exists(
                    os.path.join(config.train_image_dir,
                                 coco.imgs[entry['image_id']]['file_name']))
                if should_add:
                    captions.append(entry['caption'])
                    image_ids.append(entry['image_id'])

        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        print("Loading the captions from ", config.temp_annotation_file)
        annotations = pd.read_csv(config.temp_annotation_file,
                                  encoding='latin-1')
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.vocabulary_file):
        print("Building the vocabulary...")
        vocabulary.build(captions)
        vocabulary.save(config.vocabulary_file)
    else:
        print("Loading the vocabulary from ", config.vocabulary_file)
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary complete.")
    print("Number of words = %d" % (vocabulary.size))

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        loaded = np.load(config.temp_data_file, allow_pickle=True)
        data = loaded.item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset
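Putting one of these preparers to work, a training script would call it once and then iterate over batches. The DataSet interface below is inferred from the constructor calls above, not documented in these snippets, so every method name here is an assumption:

config = Config()                        # hypothetical config with the fields used above
dataset = prepare_train_data(config)

for epoch in range(config.num_epochs):
    dataset.reset()                      # assumed: reshuffle between epochs
    for _ in range(dataset.num_batches): # assumed batch count attribute
        batch = dataset.next_batch()     # assumed accessor
        # feed the batch to the captioning model's training step here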