Example #1
    def _process_input_sind_lmdb(self, data_dir, split='train'):

        # Some lmdb configuration.
        lmdb_dir_fc = os.path.join(data_dir, split,
                                   'fea_vgg16_fc7_lmdb_lmdb')
        lmdb_dir_conv = os.path.join(data_dir, split,
                                     'imgs_resized_vgg16_conv5_3_lmdb_lmdb')

        lmdb_env_fc = lmdb.open(lmdb_dir_fc, readonly=True)
        lmdb_env_conv = lmdb.open(lmdb_dir_conv, readonly=True)
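        # fc7 gives one 4096-d vector per image; conv5_3 keeps the spatial
        # 512 x 14 x 14 map, later flattened into 196 patches.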

        split_dir = os.path.join(data_dir, split)
        anno_fn = os.path.join(split_dir, 'annotions_filtered.txt')

        # Now load the stories.
        dict_story = {}
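        # Each annotation line is whitespace-separated:
        #   parts[0] = flickr id, parts[2] = story id (sid),
        #   parts[3] = sentence position (slid), parts[4:] = caption tokens;
        # parts[1] is not used here.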
        with open(anno_fn, 'r') as fid:
            for aline in fid:
                parts = aline.strip().split()
                flickr_id = parts[0]
                sid = int(parts[2])
                slid = int(parts[3])
                if sid not in dict_story:
                    dict_story[sid] = {}
                dict_story[sid][slid] = [flickr_id]
                # inp_v (per-word word2vec vectors) is intentionally left
                # empty; only the word indices in inp_y are used downstream.
                inp_v = []

                inp_y = [
                    utils.process_word2(word=w,
                                        word2vec=self.word2vec,
                                        vocab=self.vocab,
                                        word_vector_size=self.word_vector_size,
                                        to_return='index',
                                        silent=True) for w in parts[4:]
                ]

                dict_story[sid][slid].append(inp_v)
                dict_story[sid][slid].append(inp_y)

        # Just in case, we sort all the stories by sentence position.
        for sid in dict_story:
            story = sorted(dict_story[sid].items(), key=lambda x: x[0])
            story = story[::-1]  # keep sentences in descending slid order.
            dict_story[sid] = story

        return dict_story, lmdb_env_fc, lmdb_env_conv
    def _process_batch_sind(self, batch_index, split='train'):
        # Select a contiguous batch of stories starting at batch_index.

        start_index = self.batch_size * batch_index

        if split == 'train':
            split_lmdb_env_fc = self.train_lmdb_env_fc
            split_lmdb_env_conv = self.train_lmdb_env_conv
            split_story = self.train_story
            split_dict_story = self.train_dict_story
        else:
            split_lmdb_env_fc = self.test_lmdb_env_fc
            split_lmdb_env_conv = self.test_lmdb_env_conv
            split_story = self.test_story
            split_dict_story = self.test_dict_story

        # make sure it's smaller than the number of stories.
        start_index = start_index % len(split_story)
        # make sure there is enough for a batch.
        start_index = min(start_index, len(split_story) - self.batch_size)
        # Now, we select the stories.
        stories = split_story[start_index:start_index + self.batch_size]

        max_inp_len = 0
        max_q_len = 1  # always 1 in this setting.
        max_ans_len = 0
        for sid in stories:
            max_inp_len = max(max_inp_len, len(split_dict_story[sid]) - 1)
            for slid, anno in split_dict_story[sid]:
                max_ans_len = max(max_ans_len, len(anno[-1]))

        max_ans_len += 1  # this is for the start token.
        # In our case this is pretty similar to the word-level DMN.

        questions = []
        # batch x story_len x fea
        inputs = []
        answers = []
        answers_inp = []
        answers_mask = []
        max_key_len = 12
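        # LMDB keys are flickr ids left-padded with zeros to max_key_len chars.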

        with split_lmdb_env_fc.begin() as txn_fc:
            with split_lmdb_env_conv.begin() as txn_conv:
                for sid in stories:
                    inp = []  # story_len x patches x fea.
                    question = []
                    answer = []
                    answer_mask = []
                    answer_inp = []

                    for slid, anno in split_dict_story[sid]:
                        img_id = anno[0].zfill(max_key_len)

                        fc_raw = txn_fc.get(img_id.encode('ascii'))
                        fc_fea = caffe.proto.caffe_pb2.Datum()
                        fc_fea.ParseFromString(fc_raw)
                        question.append(
                            np.frombuffer(fc_fea.data, dtype=np.float32))
                        # Now the inputs; images other than the current one
                        # could be used here as well.
                        conv_raw = txn_conv.get(img_id.encode('ascii'))
                        conv_datum = caffe.proto.caffe_pb2.Datum()
                        conv_datum.ParseFromString(conv_raw)
                        conv_fea = np.frombuffer(conv_datum.data,
                                                 dtype=np.float32)
                        # channels x (height * width), e.g. 512 x 196.
                        x = conv_fea.reshape(conv_datum.channels,
                                             conv_datum.height * conv_datum.width)
                        x = x.swapaxes(0, 1)  # patches x channels.
                        inp.append(x)

                        # Now for the answer.

                        a = []
                        a.append(self.vocab_size)  # start token.
                        a.extend(anno[2])  # word indices for the caption.
                        a_inp = np.zeros((max_ans_len, self.word_vector_size),
                                         dtype=floatX)
                        # Add the start token first.
                        a_inp[0, :] = utils.process_word2(
                            word="#START#",
                            word2vec=self.word2vec,
                            vocab=self.vocab,
                            word_vector_size=self.word_vector_size,
                            to_return='word2vec')

                        for ans_idx, w_idx in enumerate(a[1:]):
                            a_inp[ans_idx + 1, :] = utils.process_word2(
                                word=self.ivocab[w_idx],
                                word2vec=self.word2vec,
                                vocab=self.vocab,
                                word_vector_size=self.word_vector_size,
                                to_return='word2vec')

                        a_mask = [1] * (len(a) - 1)
                        while len(a) < max_ans_len:
                            a.append(-1)  # padding; zeroed out by the mask.
                            a_mask.append(0)

                        a = a[1:]  # drop the start token from the targets.
                        answer.append(np.array(a).astype(np.int32))
                        answer_mask.append(np.array(a_mask).astype(np.int32))
                        answer_inp.append(a_inp)
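                        # At this point `answer` holds target word indices
                        # (start token dropped) and `answer_inp` the embedded
                        # inputs shifted right by #START#, i.e. teacher forcing.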

                    question = np.stack(question, axis=0)
                    questions.append(question)
                    inp = np.stack(inp, axis=0)  # story_len x patches x fea
                    inputs.append(inp)
                    answer = np.stack(answer,
                                      axis=0)  # story_len x max_answer_len
                    answers.append(answer)
                    answer_mask = np.stack(
                        answer_mask, axis=0)  # story_len x max_answer_len -1
                    answers_mask.append(answer_mask)
                    answer_inp = np.stack(answer_inp,
                                          axis=0)  # story_len x max_answer_len
                    answers_inp.append(answer_inp)

        # Finally, we transform them into numpy arrays.
        inputs = np.stack(inputs, axis=0).astype(floatX)
        questions = np.stack(questions, axis=0).astype(floatX)
        answers = np.array(answers).astype(np.int32)
        answers_mask = np.array(answers_mask).astype(floatX)
        answers_inp = np.stack(answers_inp, axis=0)
        # Collapse the batch and story_len dimensions into one.
        questions = np.reshape(
            questions,
            (questions.shape[0] * questions.shape[1], questions.shape[2]))
        answers = np.reshape(
            answers, (answers.shape[0] * answers.shape[1], answers.shape[2]))
        answers_inp = np.reshape(answers_inp,
                                 (answers_inp.shape[0] * answers_inp.shape[1],
                                  answers_inp.shape[2], answers_inp.shape[3]))
        answers_mask = np.reshape(
            answers_mask, (answers_mask.shape[0] * answers_mask.shape[1],
                           answers_mask.shape[2]))

        return inputs, questions, answers, answers_inp, answers_mask
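
A minimal sketch of how these two methods might be driven; the wrapper class
and attribute names (SindLoader, train_story, and so on) are assumptions for
illustration, not taken from the original source:

    loader = SindLoader(...)  # hypothetical class exposing the methods above
    (loader.train_dict_story, loader.train_lmdb_env_fc,
     loader.train_lmdb_env_conv) = loader._process_input_sind_lmdb(
         data_dir='data/sind', split='train')
    # assuming train_story is simply the ordered list of story ids
    loader.train_story = sorted(loader.train_dict_story.keys())
    inputs, questions, answers, answers_inp, answers_mask = \
        loader._process_batch_sind(batch_index=0, split='train')
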
Example #3
    def _process_input_sind(self, data_dir, split='train'):

        split_dir = os.path.join(data_dir, split)
        fea_dir = os.path.join(split_dir, 'fea_vgg16_fc7')
        anno_fn = os.path.join(split_dir, 'annotions_filtered_fixed.txt')
        # Now load the stories.
        dict_story = {}
        with open(anno_fn, 'r') as fid:
            for aline in fid:
                parts = aline.strip().split()
                flickr_id = parts[0]
                sid = int(parts[2])
                slid = int(parts[3])
                if sid not in dict_story:
                    dict_story[sid] = {}
                dict_story[sid][slid] = [flickr_id]
                # inp_v (per-word word2vec vectors) is intentionally left
                # empty; only the word indices in inp_y are kept.
                inp_v = []

                inp_y = [utils.process_word2(word=w,
                                             word2vec=self.word2vec,
                                             vocab=self.vocab,
                                             word_vector_size=self.word_vector_size,
                                             to_return='index',
                                             silent=True) for w in parts[4:]]

                dict_story[sid][slid].append(inp_v)
                dict_story[sid][slid].append(inp_y)
        # Just in case, we sort all the stories by sentence position.
        for sid in dict_story:
            story = sorted(dict_story[sid].items(), key=lambda x: x[0])
            dict_story[sid] = story

        # Load all features into memory.
        features = None
        num_imgs = 0
        fns_dict = {}

        total_fea = 0
        total_fns = 0

        # First pass: count feature rows so `features` can be preallocated.
        for root, dirs, fns in os.walk(fea_dir, followlinks=True):
            for fn in fns:
                full_fn = os.path.join(root, fn)
                hdf_f = h5py.File(full_fn, 'r')
                fea = hdf_f['fea'][:]
                h5_fns = hdf_f['fns'][:]
                total_fea += fea.shape[0]
                total_fns += h5_fns.shape[0]
                assert fea.shape[0] == h5_fns.shape[0], \
                    "Should not happen; re-run the feature extraction."
                hdf_f.close()

        logging.info('total fea = %d, fns = %d', total_fea, total_fns)
        # Second pass: fill the preallocated matrix and build the index map.
        for root, dirs, fns in os.walk(fea_dir, followlinks=True):
            for fn in fns:
                full_fn = os.path.join(root, fn)
                hdf_f = h5py.File(full_fn, 'r')
                fea = hdf_f['fea'][:]
                h5_fns = hdf_f['fns'][:]
                hdf_f.close()

                if features is None:
                    shape = [total_fea]
                    self.cnn_dim = fea.size // fea.shape[0]
                    shape.extend(fea.shape[1:])
                    features = np.zeros(shape)
                features[num_imgs:num_imgs + fea.shape[0]] = fea
                for i in range(h5_fns.shape[0]):
                    bfn = os.path.basename(h5_fns[i])
                    key = os.path.splitext(bfn)[0]
                    key = key.split('_')[0]
                    fns_dict[key] = num_imgs
                    num_imgs += 1

        logging.info("Done loading features from %s", fea_dir)

        return dict_story, features, fns_dict, num_imgs
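
A minimal sketch of a feature lookup with the returned structures; `loader`
and `some_sid` are illustrative assumptions:

    dict_story, features, fns_dict, num_imgs = loader._process_input_sind(
        data_dir='data/sind', split='train')
    slid, anno = dict_story[some_sid][0]   # first sentence of one story
    fea = features[fns_dict[anno[0]]]      # fc7 feature row for its image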