Python Vocabulary.save примеры, utils.vocabulary.Vocabulary.save Python примеры использования

Пример #1

0

Показать файл

Файл: dataset.py Проект: noandrea/image_captioning

def build_vocabulary(config):
    """Create the vocabulary from the training captions and write it to disk.

    The captions are read through ``COCO(config.train_caption_file)`` and
    filtered with ``filter_by_cap_len(config.max_caption_length)``. A
    ``Vocabulary(config.vocabulary_size)`` is then built from
    ``all_captions()`` and saved to ``config.vocabulary_file``.

    Returns:
        The ``Vocabulary`` instance that was built and saved.
    """
    caption_data = COCO(config.train_caption_file)
    caption_data.filter_by_cap_len(config.max_caption_length)

    vocab = Vocabulary(config.vocabulary_size)
    training_captions = caption_data.all_captions()
    vocab.build(training_captions)
    vocab.save(config.vocabulary_file)
    return vocab

Пример #2

0

Показать файл

Файл: coco_dataset.py Проект: samarthbhargav/pytorch-vision-language

 def get_vocabulary(cls, vocab_path, captions_path, tokenized_captions, threshold=1):
     """Return the vocabulary at ``vocab_path``, building and saving it if absent.

     When ``vocab_path`` exists the vocabulary is loaded from it. Otherwise it
     is constructed with ``cls.build_vocab(captions_path, tokenized_captions,
     threshold)``, saved to ``vocab_path`` and a confirmation is printed.
     """
     # Fast path: a vocabulary file is already on disk.
     if os.path.exists(vocab_path):
         return Vocabulary.load(vocab_path)

     # Slow path: construct the vocabulary, then persist it for later calls.
     built = cls.build_vocab(captions_path, tokenized_captions, threshold)
     # TODO: check if saving is safe
     Vocabulary.save(built, vocab_path)
     print("Saved the vocabulary to '%s'" % vocab_path)
     return built

Пример #3

0

Показать файл

def prepare_train_data(config):
    """Prepare the data for training the model.

    Steps, in order:
      1. Load the VQA annotations and filter them by question/answer length.
      2. Build the vocabulary from all questions and answers, or load it from
         ``config.vocabulary_file`` when that file already exists.
      3. Convert questions and answers to index arrays, cached in
         ``config.temp_train_data_file``.
      4. Wrap everything in a ``DataSet``.

    Side effect: ``config.vocabulary_size`` is overwritten with the size of
    the vocabulary that was built or loaded.

    Returns:
        A ``(dataset, config)`` tuple.
    """
    vqa = VQA(config.train_answer_file, config.train_question_file)
    vqa.filter_by_ques_len(config.max_question_length)
    vqa.filter_by_ans_len(1)

    print("Reading the questions and answers...")
    annotations = process_vqa(vqa, 'COCO_train2014', config.train_image_dir,
                              config.temp_train_annotation_file)

    image_files = annotations['image_file'].values
    questions = annotations['question'].values
    question_ids = annotations['question_id'].values
    answers = annotations['answer'].values
    print("Questions and answers read.")
    print("Number of questions = %d" % (len(question_ids)))

    print("Building the vocabulary...")
    vocabulary = Vocabulary()
    if not os.path.exists(config.vocabulary_file):
        for question in tqdm(questions):
            vocabulary.add_words(word_tokenize(question))
        for answer in tqdm(answers):
            vocabulary.add_words(word_tokenize(answer))
        vocabulary.compute_frequency()
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    config.vocabulary_size = vocabulary.size

    print("Processing the questions and answers...")
    if not os.path.exists(config.temp_train_data_file):
        question_word_idxs, question_lens = process_questions(
            questions, vocabulary, config)
        answer_idxs = process_answers(answers, vocabulary)
        data = {
            'question_word_idxs': question_word_idxs,
            'question_lens': question_lens,
            'answer_idxs': answer_idxs
        }
        np.save(config.temp_train_data_file, data)
    else:
        # The cache is a pickled dict wrapped in a 0-d object array, so it
        # can only be read back with allow_pickle=True (NumPy >= 1.16.3
        # defaults to False and raises ValueError otherwise).
        # NOTE: pickle is only safe because this file is produced by the
        # np.save call above; never point it at untrusted data.
        data = np.load(config.temp_train_data_file, allow_pickle=True).item()
        question_word_idxs = data['question_word_idxs']
        question_lens = data['question_lens']
        answer_idxs = data['answer_idxs']
    print("Questions and answers processed.")

    print("Building the dataset...")
    dataset = DataSet(image_files, question_word_idxs, question_lens,
                      question_ids, config.batch_size, answer_idxs, True, True)
    print("Dataset built.")
    return dataset, config

Пример #4

0

Показать файл

def build_vocabulary(config, max_ann_num=None):
    """Create the vocabulary from the training captions and write it to disk.

    The captions come from ``COCO(config.train_caption_file,
    config.max_train_ann_num)`` after ``filter_by_cap_len``. When
    ``config.max_train_ann_num`` is truthy only the first that many captions
    are used; otherwise all of them are.

    Note: the ``max_ann_num`` argument is accepted but not read by this
    function; the limit is taken from ``config.max_train_ann_num``.

    Returns:
        The ``Vocabulary`` that was built and saved to
        ``config.vocabulary_file``.
    """
    caption_data = COCO(config.train_caption_file, config.max_train_ann_num)
    caption_data.filter_by_cap_len(config.max_caption_length)

    vocab = Vocabulary(config.vocabulary_size)
    if config.max_train_ann_num:
        # Truncate the caption list to the configured annotation budget.
        selected = (caption_data.all_captions())[:config.max_train_ann_num]
    else:
        selected = caption_data.all_captions()
    vocab.build(selected)
    vocab.save(config.vocabulary_file)
    return vocab

Пример #5

0

Показать файл

Файл: text_process.py Проект: audreycui/Texygen

def build_vocabulary(config, captions, oracle_file):
    """Build the vocabulary, index the captions and optionally dump them.

    Args:
        config: configuration object; this function reads
            ``vocabulary_size``, ``ctrl_symbols``, ``vocabulary_file``,
            ``temp_data_file``, ``max_caption_length`` and the ``_START_``,
            ``_END_`` and ``_PAD_`` symbol indices from it.
        captions: sized iterable of captions passed to ``Vocabulary.build``
            and ``Vocabulary.process_sentence``.
        oracle_file: path of a text file to write the indexed captions to
            (one caption per line, indices separated by spaces), or ``None``
            to skip writing.

    Returns:
        The ``Vocabulary`` that was built.
    """
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    # The vocabulary is rebuilt and re-saved on every call: the original code
    # had the "load if the file exists" path disabled behind `if True`.
    vocabulary.build(captions)
    vocabulary.save(config.vocabulary_file)
    print("Number of words = %d" % (vocabulary.size))

    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        sent_lens = []
        for caption in captions:
            current_word_idxs, current_length = vocabulary.process_sentence(
                caption)
            # Two slots are reserved for the start and end symbols.
            current_num_words = min(config.max_caption_length - 2,
                                    current_length)

            # Never negative, so truncated captions simply get no padding.
            pad_length = config.max_caption_length - current_num_words - 2
            current_word_idxs = (
                [config._START_]
                + current_word_idxs[:current_num_words]
                + [config._END_]
                + [config._PAD_] * pad_length
            )

            word_idxs.append(current_word_idxs)
            sent_lens.append(current_num_words + 2)
        word_idxs = np.array(word_idxs)
        data = {'word_idxs': word_idxs, 'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict, which NumPy >= 1.16.3 refuses to load
        # unless allow_pickle=True is passed explicitly.
        # NOTE: only safe because the file is written by np.save above.
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        sent_lens = data['sentence_len']

    if oracle_file is not None:
        with open(oracle_file, 'w') as outfile:
            # One caption per line. The previous version kept appending to a
            # single buffer and wrote that whole buffer on every iteration,
            # so earlier captions were duplicated in the output.
            for line in word_idxs:
                outfile.write(
                    ''.join(str(word) + ' ' for word in line) + '\n')

    return vocabulary

Пример #6

0

Показать файл

Файл: estimate_vocab.py Проект: ryanai3/TooMuchData

def main():
    """Count the words of every file named in a list file and save the counts.

    ``args.input_files`` names a text file containing one input path per
    line. Each listed file is read and every whitespace-separated token is
    passed to ``Vocabulary.observe_word``. Progress is printed every 100
    files, a partial result is saved to ``<output_prefix>_partial`` every
    1000 files (including before the first one), and the final result is
    saved to ``<output_prefix>``.
    """
    word2cnt = Vocabulary()
    args = get_args()
    output_prefix = args.output_prefix.strip()
    # `with` guarantees the list file is closed; the handle used to leak.
    with open(args.input_files.strip(), 'r') as file_list:
        for i, path_line in enumerate(file_list):
            if i % 100 == 0:
                print(i)
                if i % 1000 == 0:
                    word2cnt.save(output_prefix + "_partial")
            with open(path_line.rstrip(), 'r') as f:
                # Distinct name: the inner loop used to rebind the outer
                # loop variable.
                for text_line in f:
                    for word in text_line.strip().split():
                        word2cnt.observe_word(word)
    word2cnt.save(output_prefix)
Пример #7

0

Показать файл

def build_vocabulary(config):
    """Create the vocabulary from the VQA questions and answers and save it.

    The annotations are loaded with ``VQA(config.train_answer_file,
    config.train_question_file)`` and filtered by question and answer length.
    Every question and every ``best_answer`` is tokenized with
    ``word_tokenize`` and added to a fresh ``Vocabulary``. Its frequencies
    are then computed and it is saved to ``config.vocabulary_file``.

    Returns:
        The ``Vocabulary`` that was built and saved.
    """
    vqa = VQA(config.train_answer_file, config.train_question_file)
    vqa.filter_by_ques_len(config.max_question_length)
    vqa.filter_by_ans_len(1)

    qids = list(vqa.qa.keys())
    question_texts = [vqa.qqa[qid]['question'] for qid in qids]
    answer_texts = [vqa.qa[qid]['best_answer'] for qid in qids]

    vocab = Vocabulary()
    # Questions first, then answers - each pass gets its own progress bar.
    for texts in (question_texts, answer_texts):
        for text in tqdm(texts):
            vocab.add_words(word_tokenize(text))
    vocab.compute_frequency()
    vocab.save(config.vocabulary_file)
    return vocab

Пример #8

0

Показать файл

Файл: prepare.py Проект: Wanger-SJTU/tensorflow_know_when_to

def prepare_train_data(config):
    """Prepare the data for training the model.

    Ensures ``config.prepare_annotation_dir`` exists, builds the vocabulary
    (or loads it from ``config.vocabulary_file``), then builds the
    caption/image annotation table (or loads it from
    ``config.train_csv_file``).
    """
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
    # check-then-create race and also creates missing parent directories.
    os.makedirs(config.prepare_annotation_dir, exist_ok=True)
    coco = COCO(config, config.train_caption_file, config.val_caption_file)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        coco.filter_by_cap_len(config.max_caption_length)
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
        vocabulary.save_counts(config.word_count_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")
    if not os.path.exists(config.train_csv_file):
        coco.filter_by_words(set(vocabulary.words))
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        # Images whose file name mentions 'train2014' live in the 'train'
        # sub-directory; every other image is looked up under 'val'.
        image_files = [
            os.path.join(
                config.dataset_image_dir,
                'train' if 'train2014' in coco.imgs[image_id]['file_name']
                else 'val',
                coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.train_csv_file)
    else:
        annotations = pd.read_csv(config.train_csv_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

Пример #9

0

Показать файл

Файл: dataset.py Проект: noandrea/image_captioning

def prepare_train_data(config):
    """Prepare the data for training the model.

    Steps, in order:
      1. Load the training captions and filter them by caption length.
      2. Build the vocabulary, or load it from ``config.vocabulary_file``.
      3. Filter the captions with ``filter_by_words`` and build the
         annotation table, cached as CSV in ``config.temp_annotation_file``.
      4. Encode every caption as a fixed-length index row plus a 0/1 mask,
         cached in ``config.temp_data_file``.
      5. Wrap everything in a ``DataSet``.

    Returns:
        The ``DataSet`` built from the prepared arrays.
    """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            # Zero-filled rows of length max_caption_length; the mask is 1.0
            # on the positions that hold real word indices.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict; NumPy >= 1.16.3 raises ValueError on
        # it unless allow_pickle=True is passed explicitly.
        # NOTE: only safe because the file is written by np.save above.
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding="latin1").item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset

Пример #10

0

Показать файл

Файл: dataset.py Проект: vishwakarmarhl/image_captioning

def prepare_train_data(config):
    """Prepare the data for training the model.

    Steps, in order:
      1. Load the training captions and filter them by caption length.
      2. Build the vocabulary, or load it from ``config.vocabulary_file``.
      3. Filter the captions with ``filter_by_words`` and build the
         annotation table, cached as CSV in ``config.temp_annotation_file``.
      4. Encode every caption as a fixed-length index row plus a 0/1 mask,
         cached in ``config.temp_data_file``.
      5. When ``config.train_data_count_limit`` is positive, keep only that
         many leading examples and write their image paths to
         ``train_images.csv`` in the current working directory.
      6. Wrap everything in a ``DataSet``.

    Returns:
        The ``DataSet`` built from the prepared arrays.
    """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            # Zero-filled rows of length max_caption_length; the mask is 1.0
            # on the positions that hold real word indices.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict; NumPy >= 1.16.3 raises ValueError on
        # it unless allow_pickle=True is passed explicitly.
        # NOTE: only safe because the file is written by np.save above.
        data = np.load(config.temp_data_file, allow_pickle=True,
                       encoding='latin1').item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    if config.train_data_count_limit > 0:
        print("-----------------------------------------------")
        print("Restricting Sz:\t", config.train_data_count_limit)
        print("Batch Sz:\t", config.batch_size)
        image_ids = image_ids[0:config.train_data_count_limit]
        image_files = image_files[0:config.train_data_count_limit]
        word_idxs = word_idxs[0:config.train_data_count_limit]
        masks = masks[0:config.train_data_count_limit]
        # Dump the image paths to a file.
        filepath = 'train_images.csv'
        with open(filepath, 'w') as file_handler:
            # Iterate the truncated sequence itself: slicing clamps to the
            # available length, so indexing by range(limit) raised IndexError
            # whenever the limit exceeded the number of examples.
            for image_file in image_files:
                file_handler.write("{}\n".format(image_file))
        print("-----------------------------------------------")

    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset

Пример #11

0

Показать файл

def prepare_train_data(config):
    """Prepare the data for training the model.

    Steps, in order:
      1. Load the captions via ``COCO(config.train_caption_file,
         config.ignore_file)``.
      2. Build the vocabulary, or load it from ``config.vocabulary_file``.
      3. Build the annotation table, cached as CSV in
         ``config.temp_annotation_file``.
      4. Encode each caption as ``[_START_] + words + [_END_]`` padded with
         ``_PAD_`` up to ``config.max_caption_length``, together with a mask
         and the sentence length; cached in ``config.temp_data_file``.
      5. Wrap everything in a ``DataSet``.

    Note: when the encoded data is loaded from the cache, ``masks`` is
    ``None`` (the cached masks are not read back).

    Returns:
        The ``DataSet`` built from the prepared data.
    """
    coco = COCO(config.train_caption_file, config.ignore_file)

    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)

    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)

        captions = []
        image_ids = []
        image_files = []

        # Rows are unpacked by position into exactly four columns; the first
        # is used as the image id, the third as the image file and the fourth
        # as the caption. Locals no longer shadow the builtin `id`.
        # NOTE(review): for a CSV written by the to_csv call above, the first
        # column would be the DataFrame index rather than `image_id` -
        # confirm the expected column layout of the cached file.
        for row_id, _second, image_file, caption in annotations.values:
            image_ids.append(row_id)
            image_files.append(image_file)
            captions.append(caption)

    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        sent_lens = []
        for caption in tqdm(captions):
            current_word_idxs, current_length = vocabulary.process_sentence(caption)
            # Two slots are reserved for the start and end symbols.
            current_num_words = min(config.max_caption_length - 2, current_length)

            current_word_idxs = ([config._START_]
                                 + current_word_idxs[:current_num_words]
                                 + [config._END_])
            pad_length = config.max_caption_length - current_num_words - 2
            if pad_length > 0:
                current_word_idxs += [config._PAD_] * pad_length
            current_masks = np.zeros(config.max_caption_length)
            current_masks[:current_num_words] = 1.0

            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
            sent_lens.append(current_num_words + 2)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks, 'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict; NumPy >= 1.16.3 raises ValueError on
        # it unless allow_pickle=True is passed explicitly.
        # NOTE: only safe because the file is written by np.save above.
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = None  # cached data['masks'] is intentionally not read back
        sent_lens = data['sentence_len']

    dataset = DataSet(coco,
                      vocabulary,
                      image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      sent_lens,
                      True,
                      True)
    return dataset

Пример #12

0

Показать файл

def prepare_train_data(config):
    """Prepare the data for training the model.

    Steps, in order:
      1. Load the training captions and filter them by caption length.
      2. In distributed mode (``config.distributed``), derive the image ids
         available in ``config.train_image_dir`` from the file names.
      3. Build the vocabulary, or load it from ``config.vocabulary_file``.
      4. Filter the captions by vocabulary words and, in distributed mode,
         by the locally available images.
      5. Build the annotation table and the encoded caption arrays. Both
         are cached on disk only when not running distributed.
      6. Wrap everything in a ``DataSet``.

    Returns:
        The ``DataSet`` built from the prepared arrays.
    """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)
    if config.distributed:
        images = os.listdir(config.train_image_dir)
        # Characters 15..26 of each file name are parsed as the image id.
        # NOTE(review): presumably names look like
        # 'COCO_train2014_<12 digits>.jpg' - confirm against the directory.
        ids = [int(x[15:27]) for x in images]
        # Parenthesised single-argument form is valid on Python 2 and 3.
        print('Input Path: ' + config.train_image_dir +
              ' Number of files in input path: ' + str(int(len(ids))))

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    if config.distributed:
        print('Filter captions by images')
        coco.filter_by_images(ids)

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(config.train_image_dir,
                         coco.imgs[image_id]['file_name'])
            for image_id in image_ids
        ]
        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.set_index('image_id', inplace=True)
        if config.distributed:
            # `ids` is only defined in distributed mode; selecting by it
            # unconditionally raised NameError in the non-distributed case.
            annotations = annotations.loc[ids]
        else:
            annotations.to_csv(config.temp_annotation_file)
    else:
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            # Zero-filled rows of length max_caption_length; the mask is 1.0
            # on the positions that hold real word indices.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        if not config.distributed:
            np.save(config.temp_data_file, data)
    else:
        # The cache is a pickled dict; NumPy >= 1.16.3 raises ValueError on
        # it unless allow_pickle=True is passed explicitly.
        # NOTE: only safe because the file is written by np.save above.
        data = np.load(config.temp_data_file, allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset

Пример #13

0

Показать файл

Файл: dataset.py Проект: spsagar13/deep_learning.cv.and.nlp

def prepare_train_data(config):
    """Assemble the training ``DataSet`` from captions, images and vocabulary.

    Each intermediate product is cached on disk and reused when present:
    the vocabulary (``config.vocabulary_file``), the annotation table
    (``config.temp_annotation_file``, CSV) and the encoded captions with
    their masks (``config.temp_data_file``).

    Returns:
        The ``DataSet`` built from image ids, image files, encoded captions
        and masks.
    """
    caption_source = COCO(config.train_caption_file)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if os.path.exists(config.vocabulary_file):
        vocabulary.load(config.vocabulary_file)
    else:
        vocabulary.build(caption_source.all_captions())
        vocabulary.save(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % vocabulary.size)

    caption_source.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if os.path.exists(config.temp_annotation_file):
        # Reuse the cached annotation table.
        table = pd.read_csv(config.temp_annotation_file)
        captions = table['caption'].values
        image_ids = table['image_id'].values
        image_files = table['image_file'].values
    else:
        captions = [caption_source.anns[key]['caption']
                    for key in caption_source.anns]
        image_ids = [caption_source.anns[key]['image_id']
                     for key in caption_source.anns]
        image_files = []
        for image_id in image_ids:
            file_name = caption_source.imgs[image_id]['file_name']
            image_files.append(os.path.join(config.train_image_dir, file_name))
        table = pd.DataFrame({'image_id': image_ids,
                              'image_file': image_files,
                              'caption': captions})
        table.to_csv(config.temp_annotation_file)

    if os.path.exists(config.temp_data_file):
        # The cache holds a pickled dict, hence allow_pickle=True.
        cached = np.load(config.temp_data_file,
                         allow_pickle=True,
                         encoding="latin1").item()
        word_idxs = cached['word_idxs']
        masks = cached['masks']
    else:
        index_rows = []
        mask_rows = []
        for caption in tqdm(captions):
            tokens = vocabulary.process_sentence(caption)
            token_count = len(tokens)
            # Fixed-length, zero-padded index row and its matching 0/1 mask.
            index_row = np.zeros(config.max_caption_length, dtype=np.int32)
            mask_row = np.zeros(config.max_caption_length)
            index_row[:token_count] = np.array(tokens)
            mask_row[:token_count] = 1.0
            index_rows.append(index_row)
            mask_rows.append(mask_row)
        word_idxs = np.array(index_rows)
        masks = np.array(mask_rows)
        np.save(config.temp_data_file,
                {'word_idxs': word_idxs, 'masks': masks})
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    # The caption count is reported a second time, as in the original flow.
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size,
                      word_idxs, masks, True, True)
    print("Dataset built.")
    return dataset

Пример #14

0

Показать файл

Файл: dataset.py Проект: car4p17/Show_and_Tell_Social

def prepare_train_data(config):
    """Prepare the data for training the model.

    Steps, in order:
      1. Create an empty ``Vocabulary(config.vocabulary_size)``.
      2. Build the annotation table from the first 999 annotations whose
         image is known and present on disk, or load the table from
         ``config.temp_annotation_file``.
      3. Build the vocabulary from those captions, or load it from
         ``config.vocabulary_file``.
      4. Encode every caption as a fixed-length index row plus a 0/1 mask,
         cached in ``config.temp_data_file``.
      5. Wrap everything in a ``DataSet``.

    NOTE(review): ``coco`` is not defined inside this function; it is
    presumably a module-level object - confirm it is in scope at call time.

    Returns:
        The ``DataSet`` built from the prepared arrays.
    """
    vocabulary = Vocabulary(config.vocabulary_size)

    print("Vocabulary complete.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        coco.filter_by_words(set(vocabulary.words))
        print("Filtering the captions to those that exist")
        captions = []
        image_ids = []
        image_files = []
        # Only the first 999 annotations are considered. Stop iterating once
        # that budget is used up instead of walking every remaining entry.
        max_annotations = 999
        for position, ann_id in enumerate(coco.anns, start=1):
            if position > max_annotations:
                break
            entry = coco.anns[ann_id]
            image_id = entry['image_id']
            if image_id not in coco.imgs:
                continue
            # Compute the path once and reuse it for both the existence
            # check and the stored image_files entry.
            image_file = os.path.join(config.train_image_dir,
                                      coco.imgs[image_id]['file_name'])
            if os.path.exists(image_file):
                captions.append(entry['caption'])
                image_ids.append(image_id)
                image_files.append(image_file)

        annotations = pd.DataFrame({
            'image_id': image_ids,
            'image_file': image_files,
            'caption': captions
        })
        annotations.to_csv(config.temp_annotation_file)
    else:
        print("Loading the captions from ", config.temp_annotation_file)
        annotations = pd.read_csv(config.temp_annotation_file,
                                  encoding='latin-1')
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.vocabulary_file):
        print("Building the vocabulary...")
        vocabulary.build(captions)
        vocabulary.save(config.vocabulary_file)
    else:
        print("Loading the vocabulary from ", config.vocabulary_file)
        vocabulary.load(config.vocabulary_file)

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            current_num_words = len(current_word_idxs_)
            # Zero-filled rows of length max_caption_length; the mask is 1.0
            # on the positions that hold real word indices.
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # The cache holds a pickled dict, hence allow_pickle=True.
        # NOTE: only safe because the file is written by np.save above.
        loaded = np.load(config.temp_data_file, allow_pickle=True)
        data = loaded.item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset

Python Vocabulary.save примеры использования