Example #1
 def __init__(self, config):
     self.config = config
     self.is_train = (config.phase == 'train')
     self.train_cnn = self.is_train and config.train_cnn
     self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
     self.image_shape = [224, 224, 3]
     self.nn = NN(config)
     self.global_step = tf.Variable(0, name='global_step', trainable=False)
     self.build()
Example #2
 def __init__(self, config):
     self.config = config
     self.is_train = (config.phase == 'train')
     self.train_cnn = self.is_train and config.train_cnn
     self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
     self.image_shape = [224, 224, 3]
     self.nn = NN(config)
     self.global_step = tf.train.get_or_create_global_step()
     self.build()
Example #3
 def __init__(self, config, batch_size, end_token=0):
     self.config = config
     self.batch_size = batch_size
     self.end_token = end_token
     self.image_batch = None
     self.feature_batch = None
     self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
     net = VGG19(weights='imagenet')
     self.trained_model = Model(input=net.input,
                                output=net.get_layer('fc2').output)
Example #4
    def __init__(self, config, batch_size, seq_length):
        self.config = config
        self.batch_size = batch_size
        self.sentences = np.array([])
        self.labels = np.array([])
        self.seq_length = seq_length

        self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
        net = VGG19(weights='imagenet')
        self.trained_model = Model(input=net.input,
                                   output=net.get_layer('fc2').output)
Example #5
 def __init__(self, config):
     self.config = config
     self.is_train = (config.phase == 'train')
     self.train_cnn = self.is_train and config.train_cnn
     self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
     self.image_shape = [224, 224, 3]
     self.nn = NN(config)
     self.classes = cfg.CLASSES
     self.num_class = 80  #len(self.classes)
     self.num_anchor = cfg.NUM_ANCHOR
     self.anchors = cfg.ANCHORS
     self.image_size = cfg.IMAGE_SIZE
     self.cell_size = cfg.CELL_SIZE
     self.boxes_per_cell = cfg.BOXES_PER_CELL
     #self.output_size = (self.cell_size * self.cell_size) *\
     #(self.num_class + self.boxes_per_cell * 5)
     #7*7*(20+10)
     self.output_size = (self.num_class + 5) * self.num_anchor
     self.scale = 1.0 * self.image_size / self.cell_size
     self.boundary1 = self.cell_size * self.cell_size * self.num_class
     #7*7*20
     self.use_pretrain = True
     self.boundary2 = self.boundary1 +\
         self.cell_size * self.cell_size * self.boxes_per_cell
     #7*7*20 + 7*7*2
     self.object_scale = cfg.OBJECT_SCALE
     self.noobject_scale = cfg.NOOBJECT_SCALE
     self.class_scale = cfg.CLASS_SCALE
     self.coord_scale = cfg.COORD_SCALE
     self.ckpt_file = './yolo2_data/yolo2_coco.ckpt'  #cfg.CKPT_FILE
     self.learning_rate = cfg.LEARNING_RATE
     self.batch_size = cfg.BATCH_SIZE
     self.alpha = cfg.ALPHA
     self.lamda = cfg.LAMDA
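      # Grid of x-offsets with shape (cell_size, cell_size, boxes_per_cell),
      # used to map per-cell box predictions to absolute grid coordinates.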
     self.offset = np.transpose(
         np.reshape(
             np.array([np.arange(self.cell_size)] * self.cell_size *
                      self.boxes_per_cell),
             (self.boxes_per_cell, self.cell_size, self.cell_size)),
         (1, 2, 0))
     self.global_step = tf.Variable(0, name='global_step', trainable=False)
     self.build()
Example #6
 def __init__(self, graph_path, vocab, max_caption_length):
   self.image_loader = ImageLoader('utils/ilsvrc_2012_mean.npy')
   self._vocab = vocab
   self._max_caption_length = max_caption_length
   self._graph = self.load_graph(graph_path)
   with self.open_session(self._graph) as self._sess:
     # inputs
     self._images = self._graph.get_tensor_by_name('images:0')
     self._contexts = self._graph.get_tensor_by_name('contexts:0')
     self._last_word = self._graph.get_tensor_by_name('last_word:0')
     self._last_memory = self._graph.get_tensor_by_name('last_memory:0')
     self._last_output = self._graph.get_tensor_by_name('last_output:0')
     # output
     self._conv_feats = self._graph.get_tensor_by_name('conv_feats:0')
     self._initial_memory = self._graph.get_tensor_by_name('initial_memory:0')
     self._initial_output = self._graph.get_tensor_by_name('initial_output:0')
     self._memory = self._graph.get_tensor_by_name('memory:0')
     self._output = self._graph.get_tensor_by_name('output:0')
     self._probs = self._graph.get_tensor_by_name('probs:0')
     self._alpha = self._graph.get_tensor_by_name('alpha:0')
Example #7
 def __init__(self, config):
     self.config = config
     self.is_train = (config.phase == 'train')
     self.train_cnn = self.is_train and config.train_cnn
     self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
     self.image_shape = [512, 512, 3]
     self.nn = NN(config)
     self.global_step = tf.Variable(0, name='global_step', trainable=False)
     self.build()
     self.record = open('lossrecord_oneloss.txt', 'w')
     self.predrecord = open(
         'predrecord__oneloss' + self.config.cnn + '_' +
         str(self.config.num_lstm_units) + '_' + '_.csv', 'w')
Example #8
def get_feats():

    train_caption_file = 'D:/download/art_desc/train/ann.csv'
    eval_caption_file = 'D:/download/art_desc/val/ann.csv'
    train_features_dir = 'D:/download/art_desc/train/images_vgg/'
    eval_features_dir = 'D:/download/art_desc/val/images_vgg/'
    image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')

    net = VGG19(weights='imagenet')
    model = Model(input=net.input, output=net.get_layer('fc2').output)
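    # The fc2 layer of VGG19 yields a 4096-dimensional feature vector per image.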

    with open(eval_caption_file, 'r') as f:
        reader = csv.reader(f)
        for img_id, file_name, caption in reader:

            try:
                img = image_loader.load_image(file_name)
                fc2 = model.predict(img)
                reshaped = np.reshape(fc2, (4096,))
                np.save(eval_features_dir + 'art_desc' + img_id, reshaped)
            except Exception:
                print("cannot identify image file: " + file_name)
Example #9
class ATT_NIC(GraphLoader):
    def __init__(self, graph_path, vocab, max_caption_length):
        self.image_loader = ImageLoader('utils/ilsvrc_2012_mean.npy')
        self._vocab = vocab
        self._max_caption_length = max_caption_length
        self._graph = self.load_graph(graph_path)
        with self.open_session(self._graph) as self._sess:
            # inputs
            self._images = self._graph.get_tensor_by_name('images:0')
            self._contexts = self._graph.get_tensor_by_name('contexts:0')
            self._last_word = self._graph.get_tensor_by_name('last_word:0')
            self._last_memory = self._graph.get_tensor_by_name('last_memory:0')
            self._last_output = self._graph.get_tensor_by_name('last_output:0')
            # output
            self._conv_feats = self._graph.get_tensor_by_name('conv_feats:0')
            self._initial_memory = self._graph.get_tensor_by_name(
                'initial_memory:0')
            self._initial_output = self._graph.get_tensor_by_name(
                'initial_output:0')
            self._memory = self._graph.get_tensor_by_name('memory:0')
            self._output = self._graph.get_tensor_by_name('output:0')
            self._probs = self._graph.get_tensor_by_name('probs:0')
            self._alpha = self._graph.get_tensor_by_name('alpha:0')

    def get_sentence(self, result):
        word_idxs = result.sentence
        score = result.score
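        # Length-normalize the cumulative beam score.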
        score = score / (len(word_idxs) - 1)

        caption = self._vocab.get_sentence(word_idxs)
        results = {'caption': caption, 'score': score}
        return results

    def get_attention(self, result):

        return result.alphas

    def show_attention(self, caption, alphas, image_np, save_path):
        # alphas = result.alphas
        cap = caption['caption'].split()
        plt_w = 4
        plt_h = math.ceil((len(cap) + 1) / plt_w)
        im_height, im_width = image_np.shape[0:2]

        plt.figure(figsize=(10, 8))
        plt.subplot(plt_h, plt_w, 1)
        plt.imshow(image_np)
        plt.axis('off')
        # generate attention map for each word
        for idx in range(len(cap)):
            # Upsample the 14x14 attention map to the full image size
            # (cv2.resize expects (width, height)).
            alpha_image = cv2.resize(alphas[idx].reshape(14, 14),
                                     (im_width, im_height))
            plt.subplot(plt_h, plt_w, idx + 2)
            lab = cap[idx]
            plt.text(0,
                     1,
                     lab,
                     backgroundcolor='white',
                     color='black',
                     fontsize=8)
            plt.imshow(image_np)
            plt.imshow(alpha_image, alpha=0.8)
            plt.set_cmap(cm.Greys_r)
            plt.axis('off')
            plt.subplots_adjust(left=0.02,
                                bottom=0.02,
                                right=0.98,
                                top=0.98,
                                hspace=0.05,
                                wspace=0.05)

        plt.savefig(save_path)

    def decode(self, filename):
        """Use beam search to generate the captions for a batch of images."""
        # Feed in the images to get the contexts and the initial LSTM states
        images = self.image_loader.load_images([filename])
        contexts, initial_memory, initial_output = self._sess.run(
            [self._conv_feats, self._initial_memory, self._initial_output],
            feed_dict={self._images: images})

        def _inference_step_fn(last_word, last_memory, last_output):
            return self._sess.run(
                [self._memory, self._output, self._probs, self._alpha],
                feed_dict={
                    self._contexts: contexts,
                    self._last_word: last_word,
                    self._last_memory: last_memory,
                    self._last_output: last_output
                })

        # Generate the caption for this image
        bs = BeamSearch(3, self._max_caption_length, self._vocab.start_id,
                        self._vocab.end_id, 1)
        # Run beam search
        result = bs.search(_inference_step_fn, initial_memory, initial_output)
        caption = self.get_sentence(result[0])
        attention = self.get_attention(result[0])
        return caption, attention
Example #10
class BaseModel(object):
    def __init__(self, config):
        self.config = config
        self.is_train = False
        self.train_cnn = self.is_train and config.train_cnn
        self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
        self.image_shape = [224, 224, 3]
        self.nn = NN(config)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.build()

    def build(self):
        raise NotImplementedError()

    def beam_search(self, sess, image_files, vocabulary):
        """Use beam search to generate the captions for a batch of images."""
        # Feed in the images to get the contexts and the initial LSTM states
        config = self.config
        images = self.image_loader.load_images(image_files)
        contexts, initial_memory, initial_output = sess.run(
            [self.conv_feats, self.initial_memory, self.initial_output],
            feed_dict={self.images: images})

        partial_caption_data = []
        complete_caption_data = []
        for k in range(config.batch_size):
            initial_beam = CaptionData(sentence=[],
                                       memory=initial_memory[k],
                                       output=initial_output[k],
                                       score=1.0)
            partial_caption_data.append(TopN(config.beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(config.beam_size))

        # Run beam search
        for idx in range(config.max_caption_length):
            partial_caption_data_lists = []
            for k in range(config.batch_size):
                data = partial_caption_data[k].extract()
                partial_caption_data_lists.append(data)
                partial_caption_data[k].reset()

            num_steps = 1 if idx == 0 else config.beam_size
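            # At the first step only the single initial beam exists per image, so one
            # expansion suffices; afterwards each of the beam_size partial captions is expanded.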
            for b in range(num_steps):
                if idx == 0:
                    last_word = np.zeros((config.batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)

                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)

                memory, output, scores = sess.run(
                    [self.memory, self.output, self.probs],
                    feed_dict={
                        self.contexts: contexts,
                        self.last_word: last_word,
                        self.last_memory: last_memory,
                        self.last_output: last_output
                    })

                # Find the beam_size most probable next words
                for k in range(config.batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    words_and_scores = list(enumerate(scores[k]))
                    words_and_scores.sort(key=lambda x: -x[1])
                    words_and_scores = words_and_scores[0:config.beam_size + 1]

                    # Append each of these words to the current partial caption
                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = CaptionData(sentence, memory[k], output[k],
                                           score)
                        if vocabulary.words[w] == '.':
                            complete_caption_data[k].push(beam)
                        else:
                            partial_caption_data[k].push(beam)

        results = []
        for k in range(config.batch_size):
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))

        return results

    def load(self, sess, model_file=None):
        """ Load the model. """
        config = self.config
        if model_file is not None:
            save_path = model_file
        else:
            info_path = os.path.join(config.save_dir, "config.pickle")
            info_file = open(info_path, "rb")
            config = pickle.load(info_file)
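            # The pickled config records the global_step of the latest saved checkpoint.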
            global_step = config.global_step
            info_file.close()
            save_path = os.path.join(config.save_dir,
                                     str(global_step) + ".npy")

        print("Loading the model from %s..." % save_path)
        data_dict = np.load(save_path, allow_pickle=True,
                            encoding="bytes").item()
        count = 0
        for v in tqdm(tf.compat.v1.global_variables()):
            if v.name in data_dict.keys():
                sess.run(v.assign(data_dict[v.name]))
                count += 1
        print("%d tensors loaded." % count)
Example #11
class DataLoader():
    def __init__(self, config, batch_size, seq_length, end_token=0):
        self.config = config
        self.batch_size = batch_size
        self.token_stream = []
        self.seq_length = seq_length
        self.end_token = end_token
        self.image_batch = None
        self.feature_batch = None
        self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
        net = VGG19(weights='imagenet')
        self.trained_model = Model(input=net.input,
                                   output=net.get_layer('fc2').output)

    def get_imagefeatures_vgg19(self, image_files, feature_files):
        # Extract image features using VGG19.
        return self.image_loader.extract_features_vgg19(
            self.trained_model, image_files, feature_files, self.batch_size)


    def create_batches(self, config, with_image):
        self.token_stream = []
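        # Each line of the oracle file is a space-separated sequence of token ids;
        # sequences are truncated or padded with end_token to exactly seq_length.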

        with open(config.temp_oracle_file, 'r') as raw:
            for line in raw:
                line = line.strip().split()
                parse_line = [int(x) for x in line]
                if len(parse_line) > self.seq_length:
                    self.token_stream.append(parse_line[:self.seq_length])
                else:
                    while len(parse_line) < self.seq_length:
                        parse_line.append(self.end_token)
                    if len(parse_line) == self.seq_length:
                        self.token_stream.append(parse_line)

        self.num_batch = int(len(self.token_stream) / self.batch_size)
        self.token_stream = self.token_stream[:self.num_batch * self.batch_size]
        self.sequence_batch = np.split(np.array(self.token_stream), self.num_batch, 0)
        self.pointer = 0

        if with_image:
            with open(config.temp_image_file) as ifile:
                self.image_files = ifile.read().splitlines()
            with open(config.temp_feature_file) as ffile:
                self.feature_files = ffile.read().splitlines()

            self.image_files = self.image_files[:self.num_batch * self.batch_size]
            self.image_batch = np.split(np.array(self.image_files), self.num_batch, 0)
            self.feature_files = self.feature_files[:self.num_batch * self.batch_size]
            self.feature_batch = np.split(np.array(self.feature_files), self.num_batch, 0)

    def create_batches_v2(self, config, with_image):
        data = np.load(config.temp_data_file).item()
        word_idxs = data['word_idxs']
        sent_lens = data['sentence_len']

        self.num_batch = int(len(word_idxs) / self.batch_size)
        word_idxs = word_idxs[:self.num_batch * self.batch_size]
        self.sequence_batch = np.split(word_idxs, self.num_batch, 0)
        self.pointer = 0

        if with_image:
            with open(config.temp_image_file) as ifile:
                self.image_files = ifile.read().splitlines()
            with open(config.temp_feature_file) as ffile:
                self.feature_files = ffile.read().splitlines()

            self.image_files = self.image_files[:self.num_batch * self.batch_size]
            self.image_batch = np.split(np.array(self.image_files), self.num_batch, 0)
            self.feature_files = self.feature_files[:self.num_batch * self.batch_size]
            self.feature_batch = np.split(np.array(self.feature_files), self.num_batch, 0)

    def next_batch(self):
        ret = self.sequence_batch[self.pointer]
        imgs = None
        features = None
        if self.image_batch:
            imgs = self.image_batch[self.pointer]
            feature_files = self.feature_batch[self.pointer]
            features = self.get_imagefeatures_vgg19(imgs, feature_files)
        else:
            print("no image files")
        self.pointer = (self.pointer + 1) % self.num_batch

        return ret, features

    def reset_pointer(self):
        self.pointer = 0
Example #12
class BaseModel(object):
    def __init__(self, config):
        self.config = config
        self.is_train = (config.phase == 'train')
        self.train_cnn = self.is_train and config.train_cnn
        self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
        self.image_shape = [224, 224, 3]
        self.nn = NN(config)
        self.classes = cfg.CLASSES
        self.num_class = 80  #len(self.classes)
        self.num_anchor = cfg.NUM_ANCHOR
        self.anchors = cfg.ANCHORS
        self.image_size = cfg.IMAGE_SIZE
        self.cell_size = cfg.CELL_SIZE
        self.boxes_per_cell = cfg.BOXES_PER_CELL
        #self.output_size = (self.cell_size * self.cell_size) *\
        #(self.num_class + self.boxes_per_cell * 5)
        #7*7*(20+10)
        self.output_size = (self.num_class + 5) * self.num_anchor
        self.scale = 1.0 * self.image_size / self.cell_size
        self.boundary1 = self.cell_size * self.cell_size * self.num_class
        #7*7*20
        self.use_pretrain = True
        self.boundary2 = self.boundary1 +\
            self.cell_size * self.cell_size * self.boxes_per_cell
        #7*7*20 + 7*7*2
        self.object_scale = cfg.OBJECT_SCALE
        self.noobject_scale = cfg.NOOBJECT_SCALE
        self.class_scale = cfg.CLASS_SCALE
        self.coord_scale = cfg.COORD_SCALE
        self.ckpt_file = './yolo2_data/yolo2_coco.ckpt'  #cfg.CKPT_FILE
        self.learning_rate = cfg.LEARNING_RATE
        self.batch_size = cfg.BATCH_SIZE
        self.alpha = cfg.ALPHA
        self.lamda = cfg.LAMDA
        self.offset = np.transpose(
            np.reshape(
                np.array([np.arange(self.cell_size)] * self.cell_size *
                         self.boxes_per_cell),
                (self.boxes_per_cell, self.cell_size, self.cell_size)),
            (1, 2, 0))
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.build()

    def build(self):
        raise NotImplementedError()

    def train(self, sess, train_data):
        """ Train the model using the COCO train2014 data. """
        print("Training the model...")
        config = self.config

        if not os.path.exists(config.summary_dir):
            os.mkdir(config.summary_dir)
        train_writer = tf.summary.FileWriter(config.summary_dir, sess.graph)

        for _ in tqdm(list(range(config.num_epochs)), desc='epoch'):
            for _ in tqdm(list(range(train_data.num_batches)), desc='batch'):
                batch = train_data.next_batch()
                image_files, sentences, masks = batch
                images = self.image_loader.load_images(image_files)
                feed_dict = {
                    self.images: images,
                    self.sentences: sentences,
                    self.masks: masks
                }
                _, summary, global_step = sess.run(
                    [self.opt_op, self.summary, self.global_step],
                    feed_dict=feed_dict)
                if (global_step + 1) % config.save_period == 0:
                    self.save()
                train_writer.add_summary(summary, global_step)
            train_data.reset()

        self.save()
        train_writer.close()
        print("Training complete.")

    def test(self, sess, images, vocabulary):
        """ Test the model using any given images. """
        print("Testing the model ...")
        config = self.config

        if not os.path.exists(config.test_result_dir):
            os.mkdir(config.test_result_dir)

        captions = []
        scores = []
        caption_data = self.beam_search(sess, images)
        word_idxs = caption_data[0][0].sentence
        score = caption_data[0][0].score
        caption = vocabulary.get_sentence(word_idxs)
        print(caption)
        return caption

    def beam_search(self, sess, image):
        """Use beam search to generate the captions for a batch of images."""
        # Feed in the images to get the contexts and the initial LSTM states
        config = self.config
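        # Resize to 224x224 and center-crop; since the scale and crop shapes match, the crop is a no-op.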
        scale_shape = np.array([224, 224], np.int32)
        crop_shape = np.array([224, 224], np.int32)
        image = cv2.resize(image, (scale_shape[0], scale_shape[1]))
        offset = (scale_shape - crop_shape) / 2
        offset = offset.astype(np.int32)
        image = image[offset[0]:offset[0] + crop_shape[0],
                      offset[1]:offset[1] + crop_shape[1]]
        image = np.expand_dims(image, 0)
        contexts, initial_memory, initial_output = sess.run(
            [self.conv_feats, self.initial_memory, self.initial_output],
            feed_dict={self.images: image})

        partial_caption_data = []
        complete_caption_data = []
        for k in range(config.batch_size):
            initial_beam = CaptionData(sentence=[],
                                       memory=initial_memory[k],
                                       output=initial_output[k],
                                       score=1.0)
            partial_caption_data.append(TopN(config.beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(config.beam_size))

        # Run beam search
        for idx in range(config.max_caption_length):
            partial_caption_data_lists = []
            for k in range(config.batch_size):
                data = partial_caption_data[k].extract()
                partial_caption_data_lists.append(data)
                partial_caption_data[k].reset()

            num_steps = 1 if idx == 0 else config.beam_size
            for b in range(num_steps):
                if idx == 0:
                    last_word = np.zeros((config.batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)

                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)

                memory, output, scores = sess.run(
                    [self.memory, self.output, self.probs],
                    feed_dict={
                        self.contexts: contexts,
                        self.last_word: last_word,
                        self.last_memory: last_memory,
                        self.last_output: last_output
                    })

                # Find the beam_size most probable next words
                for k in range(config.batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    words_and_scores = list(enumerate(scores[k]))
                    words_and_scores.sort(key=lambda x: -x[1])
                    words_and_scores = words_and_scores[0:config.beam_size + 1]

                    # Append each of these words to the current partial caption
                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = CaptionData(sentence, memory[k], output[k],
                                           score)
                        partial_caption_data[k].push(beam)

        results = []
        for k in range(config.batch_size):
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))

        return results

    def save(self):
        """ Save the model. """
        config = self.config
        data = {v.name: v.eval() for v in tf.global_variables()}
        save_path = os.path.join(config.save_dir, str(self.global_step.eval()))

        print((" Saving the model to %s..." % (save_path + ".npy")))
        np.save(save_path, data)
        info_file = open(os.path.join(config.save_dir, "config.pickle"), "wb")
        config_ = copy.copy(config)
        config_.global_step = self.global_step.eval()
        pickle.dump(config_, info_file)
        info_file.close()
        print("Model saved.")

    def load(self, sess, model_file=None):
        """ Load the model. """
        config = self.config
        if model_file is not None:
            save_path = model_file
        else:
            info_path = os.path.join(config.save_dir, "config.pickle")
            info_file = open(info_path, "rb")
            config = pickle.load(info_file)
            global_step = config.global_step
            info_file.close()
            save_path = os.path.join(config.save_dir,
                                     str(global_step) + ".npy")

        print("Loading the model from %s..." % save_path)
        data_dict = np.load(save_path).item()
        count = 0
        for v in tqdm(tf.global_variables()):
            if v.name in data_dict.keys():
                sess.run(v.assign(data_dict[v.name]))
                count += 1
        print("%d tensors loaded." % count)

    def load_cnn(self, session, data_path, ignore_missing=True):
        """ Load a pretrained CNN model. """
        print("Loading the CNN from %s..." % data_path)
        data_dict = np.load(data_path).item()
        count = 0
        for op_name in tqdm(data_dict):
            with tf.variable_scope(op_name, reuse=True):
                for param_name, data in data_dict[op_name].items():
                    try:
                        var = tf.get_variable(param_name)
                        session.run(var.assign(data))
                        count += 1
                    except ValueError:
                        pass
        print("%d tensors loaded." % count)
Example #13
class BaseModel(object):
    def __init__(self, config):
        self.config = config
        self.is_train = (config.phase == 'train')
        self.train_cnn = self.is_train and config.train_cnn
        self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
        self.image_shape = [224, 224, 3]
        self.nn = NN(config)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.global_epoch = 0
        self.build()

    def build(self):
        raise NotImplementedError()

    def train(self, sess, train_data):
        """ Train the model using the COCO train2014 data. """
        print("Training the model...")
        config = self.config

        if not os.path.exists(config.summary_dir):
            os.mkdir(config.summary_dir)
        train_writer = tf.summary.FileWriter(config.summary_dir, sess.graph)

        Loss_table = []
        Loss_line = []
        for _ in tqdm(list(range(config.num_epochs)), desc='epoch'):
            for _ in tqdm(list(range(train_data.num_batches)), desc='batch'):
                batch = train_data.next_batch()
                image_files, sentences, masks = batch
                images = self.image_loader.load_images(image_files)
                feed_dict = {
                    self.images: images,
                    self.sentences: sentences,
                    self.masks: masks
                }
                Loss, summary, global_step = sess.run(
                    [self.opt_op, self.summary, self.global_step],
                    feed_dict=feed_dict)
                Loss_line.append(Loss)
                # if (global_step + 1) % config.save_period == 0:
                #     self.save()
                # train_writer.add_summary(summary, global_step)
                if (global_step + 1) % 10 == 0:
                    train_writer.add_summary(summary, global_step)
            Loss_table.append(Loss_line)
            Loss_line = []
            self.save()
            train_data.reset()

        file = open('ResultProcess/Loss/Loss.pickle', 'wb')
        pickle.dump(Loss_table, file)
        file.close()

        self.save()
        train_writer.close()
        print("Training complete.")

    def eval(self, sess, eval_gt_coco, eval_data, vocabulary):
        """ Evaluate the model using the COCO val2014 data. """
        print("Evaluating the model ...")
        config = self.config

        results = []
        if not os.path.exists(config.eval_result_dir):
            os.mkdir(config.eval_result_dir)

        # Generate the captions for the images
        idx = 0
        for k in tqdm(list(range(eval_data.num_batches)), desc='batch'):
            batch = eval_data.next_batch()
            caption_data = self.beam_search(sess, batch, vocabulary)

            fake_cnt = 0 if k<eval_data.num_batches-1 \
                         else eval_data.fake_count
            for l in range(eval_data.batch_size - fake_cnt):
                word_idxs = caption_data[l][0].sentence
                score = caption_data[l][0].score
                caption = vocabulary.get_sentence(word_idxs)
                results.append({
                    'image_id': int(eval_data.image_ids[idx]),
                    'caption': caption
                })
                idx += 1

                # Save the result in an image file, if requested
                if config.save_eval_result_as_image:
                    image_file = batch[l]
                    image_name = image_file.split(os.sep)[-1]
                    image_name = os.path.splitext(image_name)[0]
                    img = plt.imread(image_file)
                    plt.imshow(img)
                    plt.axis('off')
                    plt.title(caption)
                    plt.savefig(
                        os.path.join(
                            # config.eval_result_dir,
                            image_name + '_result.jpg'))
        fp = open(config.eval_result_file, 'w')
        json.dump(results, fp)
        fp.close()
        print('json done')
        # Evaluate these captions
        eval_result_coco = eval_gt_coco.loadRes(config.eval_result_file)
        scorer = COCOEvalCap(eval_gt_coco, eval_result_coco)
        scorer.evaluate()
        print("Evaluation complete.")

    def test(self, sess, test_data, vocabulary):
        """ Test the model using any given images. """
        print("Testing the model ...")
        config = self.config

        if not os.path.exists(config.test_result_dir):
            os.mkdir(config.test_result_dir)

        captions = []
        scores = []

        # Generate the captions for the images
        for k in tqdm(list(range(test_data.num_batches)), desc='path'):
            batch = test_data.next_batch()
            caption_data = self.beam_search(sess, batch, vocabulary)

            fake_cnt = 0 if k<test_data.num_batches-1 \
                         else test_data.fake_count
            for l in range(test_data.batch_size - fake_cnt):
                word_idxs = caption_data[l][0].sentence
                score = caption_data[l][0].score
                caption = vocabulary.get_sentence(word_idxs)
                captions.append(caption)
                scores.append(score)

                # Save the result in an image file
                image_file = batch[l]
                image_name = image_file.split(os.sep)[-1]
                image_name = os.path.splitext(image_name)[0]
                img = plt.imread(image_file)
                plt.imshow(img)
                plt.axis('off')
                plt.title(caption)
                plt.savefig(
                    os.path.join(config.test_result_dir,
                                 image_name + '_result.jpg'))

        # Save the captions to a file
        results = pd.DataFrame({
            'image_files': test_data.image_files,
            'caption': captions,
            'prob': scores
        })
        results.to_csv(config.test_result_file)
        print("Testing complete.")

    def beam_search(self, sess, image_files, vocabulary):
        """Use beam search to generate the captions for a batch of images."""
        # Feed in the images to get the contexts and the initial LSTM states
        config = self.config
        images = self.image_loader.load_images(image_files)
        contexts, initial_memory, initial_output = sess.run(
            [self.conv_feats, self.initial_memory, self.initial_output],
            feed_dict={self.images: images})

        partial_caption_data = []
        complete_caption_data = []

        alpha_all = []
        last_output_all = []

        for k in range(config.batch_size):
            initial_beam = CaptionData(sentence=[],
                                       memory=initial_memory[k],
                                       output=initial_output[k],
                                       score=1.0)
            partial_caption_data.append(TopN(config.beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(config.beam_size))

        # Run beam search
        for idx in range(config.max_caption_length):
            partial_caption_data_lists = []
            for k in range(config.batch_size):
                data = partial_caption_data[k].extract()
                partial_caption_data_lists.append(data)
                partial_caption_data[k].reset()

            num_steps = 1 if idx == 0 else config.beam_size
            for b in range(num_steps):
                if idx == 0:
                    last_word = np.zeros((config.batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)

                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)

                memory, output, scores = sess.run(
                    [self.memory, self.output, self.probs],
                    feed_dict={
                        self.contexts: contexts,
                        self.last_word: last_word,
                        self.last_memory: last_memory,
                        self.last_output: last_output
                    })

                # Find the beam_size most probable next words
                for k in range(config.batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    words_and_scores = list(enumerate(scores[k]))
                    words_and_scores.sort(key=lambda x: -x[1])
                    words_and_scores = words_and_scores[0:config.beam_size + 1]

                    # Append each of these words to the current partial caption
                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = CaptionData(sentence, memory[k], output[k],
                                           score)
                        if vocabulary.words[w] == '.':
                            complete_caption_data[k].push(beam)
                        else:
                            partial_caption_data[k].push(beam)

            alpha_status = sess.run(self.alpha,
                                    feed_dict={
                                        self.contexts: contexts,
                                        self.last_memory: last_memory,
                                        self.last_output: last_output
                                    })

            alpha_all.append(alpha_status)
            last_output_all.append(last_output)

        file = open('ResultProcess/alpha.pickle', 'wb')
        pickle.dump(alpha_all, file)
        file.close()
        # file = open('ResultProcess/last_output_all.pickle', 'wb')
        # pickle.dump(last_output_all, file)
        # file.close()

        results = []
        for k in range(config.batch_size):
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))

        return results

    def save(self):
        """ Save the model. """
        config = self.config
        data = {v.name: v.eval() for v in tf.global_variables()}
        save_path = os.path.join(config.save_dir, str(self.global_epoch))
        self.global_epoch += 1

        print((" Saving the model to %s..." % (save_path + ".npy")))
        np.save(save_path, data)
        info_file = open(os.path.join(config.save_dir, "config.pickle"), "wb")
        config_ = copy.copy(config)
        config_.global_step = self.global_step.eval()
        pickle.dump(config_, info_file)
        info_file.close()
        print("Model saved.")

    def load(self, sess, model_file=None):
        """ Load the model. """
        config = self.config
        if model_file is not None:
            save_path = model_file
        else:
            info_path = os.path.join(config.save_dir, "config.pickle")
            info_file = open(info_path, "rb")
            config = pickle.load(info_file)
            global_step = config.global_step
            info_file.close()
            save_path = os.path.join(config.save_dir,
                                     str(global_step) + ".npy")

        print("Loading the model from %s..." % save_path)
        data_dict = np.load(save_path).item()
        count = 0
        for v in tqdm(tf.global_variables()):
            if v.name in data_dict.keys():
                sess.run(v.assign(data_dict[v.name]))
                count += 1
        print("%d tensors loaded." % count)

    def load_cnn(self, session, data_path, ignore_missing=True):
        """ Load a pretrained CNN model. """
        print("Loading the CNN from %s..." % data_path)
        data_dict = np.load(data_path, encoding='latin1').item()
        del data_dict['fc6']
        del data_dict['fc7']
        del data_dict['fc8']

        count = 0
        for op_name in tqdm(data_dict):
            with tf.variable_scope(op_name, reuse=True):
                data = data_dict[op_name]
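                # Each entry holds a [kernel, bias] pair for the corresponding conv layer.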
                try:
                    var_k = tf.get_variable('kernel')
                    session.run(var_k.assign(data[0]))
                    var_b = tf.get_variable('bias')
                    session.run(var_b.assign(data[1]))
                    count += 1
                except ValueError as e:
                    assert e
                    pass
        print("%d tensors loaded." % count)

    def load_cnn_ckpt(self, session, data_path):
        print("Loading the CNN from %s..." % data_path)
        reader = pywrap_tensorflow.NewCheckpointReader(data_path)
        var_to_shape_map = reader.get_variable_to_shape_map()
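        # Drop bookkeeping and classifier variables that have no counterpart in this model.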
        del var_to_shape_map['global_step']
        del var_to_shape_map['resnet_v1_50/mean_rgb']
        del var_to_shape_map['resnet_v1_50/logits/weights']
        del var_to_shape_map['resnet_v1_50/logits/biases']
        count = 0
        for key in tqdm(var_to_shape_map):
            ckpt_tensor = reader.get_tensor(key)
            tensor_name = self.get_tensor_name(key.split('/')).split('/')
            with tf.variable_scope(tensor_name[0], reuse=True):
                var_ = tf.get_variable(tensor_name[1])
                session.run(var_.assign(ckpt_tensor))
                count += 1
        with tf.variable_scope('conv1', reuse=True):
            var_ = tf.get_variable('bias')
            session.run(var_.assign(np.zeros([64])))
            count += 1
        print("%d tensors loaded." % count)

    def get_tensor_name(self, keys):
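        """ Map a resnet_v1_50 checkpoint variable path (split on '/') to the
        corresponding variable name in this model's graph. """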
        name = ''
        flag = False
        n = 0
        for item in ['block1', 'block2', 'block3', 'block4']:
            if item in keys:
                flag = True
            if flag:
                if 'BatchNorm' in keys:
                    name += 'bn' + str(n + 2)
                else:
                    name += 'res' + str(n + 2)
                break
            elif n == 3:
                if 'BatchNorm' in keys:
                    name += 'bn_conv1/'
                else:
                    name += 'conv1/'
            n += 1

        flag = False
        n = 0
        for item in [
                'unit_1', 'unit_2', 'unit_3', 'unit_4', 'unit_5', 'unit_6'
        ]:
            if item in keys:
                flag = True
            if flag:
                name += chr(97 + n) + '_'
                break
            n += 1

        flag = False
        n = 0
        for item in ['shortcut', 'conv1', 'conv2', 'conv3']:
            if item in keys:
                flag = True
            if flag and (name[-2:-7:-1] != '1vnoc'):
                if n == 0:
                    name += 'branch1'
                else:
                    name += 'branch2' + chr(96 + n)
                name += '/'
                break
            n += 1

        flag = False
        n = 0
        for item in [
                'weights', 'gamma', 'beta', 'moving_mean', 'moving_variance'
        ]:
            if item in keys:
                flag = True
            if flag:
                if n == 0:
                    name += 'kernel'
                else:
                    name += item
                break
            n += 1

        return name
Example #14
class BaseModel(object):
    def __init__(self, config):
        self.config = config
        self.is_train = (config.phase == 'train')
        self.train_cnn = self.is_train and config.train_cnn
        self.image_loader = ImageLoader('./DeepRNN/utils/ilsvrc_2012_mean.npy')
        self.image_shape = [224, 224, 3]
        self.nn = NN(config)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.build()

    def build(self):
        raise NotImplementedError()

    def test(self, sess, test_data, vocabulary):
        """ Test the model using any given images. """
        config = self.config

        # Generate the captions for the images
        for k in tqdm(list(range(test_data.num_batches)), desc='path'):
            batch = test_data.next_batch()
            caption_data = self.beam_search(sess, batch, vocabulary)

            fake_cnt = 0 if k<test_data.num_batches-1 \
                         else test_data.fake_count
            for l in range(test_data.batch_size - fake_cnt):
                word_idxs = caption_data[l][0].sentence
                score = caption_data[l][0].score
                caption = vocabulary.get_sentence(word_idxs)
                print('**' + caption + '**')

    def beam_search(self, sess, image_files, vocabulary):
        """Use beam search to generate the captions for a batch of images."""
        # Feed in the images to get the contexts and the initial LSTM states
        config = self.config
        images = self.image_loader.load_images(image_files)
        contexts, initial_memory, initial_output = sess.run(
            [self.conv_feats, self.initial_memory, self.initial_output],
            feed_dict={self.images: images})

        partial_caption_data = []
        complete_caption_data = []
        for k in range(config.batch_size):
            initial_beam = CaptionData(sentence=[],
                                       memory=initial_memory[k],
                                       output=initial_output[k],
                                       score=1.0)
            partial_caption_data.append(TopN(config.beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(config.beam_size))

        # Run beam search
        for idx in range(config.max_caption_length):
            partial_caption_data_lists = []
            for k in range(config.batch_size):
                data = partial_caption_data[k].extract()
                partial_caption_data_lists.append(data)
                partial_caption_data[k].reset()

            num_steps = 1 if idx == 0 else config.beam_size
            for b in range(num_steps):
                if idx == 0:
                    last_word = np.zeros((config.batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)

                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)

                memory, output, scores = sess.run(
                    [self.memory, self.output, self.probs],
                    feed_dict={
                        self.contexts: contexts,
                        self.last_word: last_word,
                        self.last_memory: last_memory,
                        self.last_output: last_output
                    })

                # Find the beam_size most probable next words
                for k in range(config.batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    words_and_scores = list(enumerate(scores[k]))
                    words_and_scores.sort(key=lambda x: -x[1])
                    words_and_scores = words_and_scores[0:config.beam_size + 1]

                    # Append each of these words to the current partial caption
                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = CaptionData(sentence, memory[k], output[k],
                                           score)
                        if vocabulary.words[w] == '.':
                            complete_caption_data[k].push(beam)
                        else:
                            partial_caption_data[k].push(beam)

        results = []
        for k in range(config.batch_size):
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))

        return results

    def load(self, sess, model_file=None):
        """ Load the model. """
        config = self.config
        if model_file is not None:
            save_path = model_file
        else:
            info_path = os.path.join(config.save_dir, "config.pickle")
            info_file = open(info_path, "rb")
            config = pickle.load(info_file)
            global_step = config.global_step
            info_file.close()
            save_path = os.path.join(config.save_dir,
                                     str(global_step) + ".npy")

        data_dict = np.load(save_path).item()
        count = 0
        for v in tqdm(tf.global_variables()):
            if v.name in data_dict.keys():
                sess.run(v.assign(data_dict[v.name]))
                count += 1

    def load_cnn(self, session, data_path, ignore_missing=True):
        """ Load a pretrained CNN model. """
        data_dict = np.load(data_path).item()
        count = 0
        for op_name in tqdm(data_dict):
            with tf.variable_scope(op_name, reuse=True):
                for param_name, data in data_dict[op_name].items():
                    try:
                        var = tf.get_variable(param_name)
                        session.run(var.assign(data))
                        count += 1
                    except ValueError:
                        pass
Example #15
class DisDataloader():
    def __init__(self, config, batch_size, seq_length):
        self.config = config
        self.batch_size = batch_size
        self.sentences = np.array([])
        self.labels = np.array([])
        self.seq_length = seq_length

        self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
        net = VGG19(weights='imagenet')
        self.trained_model = Model(input=net.input,
                                   output=net.get_layer('fc2').output)

    def get_imagefeatures_vgg19(self, image_files, feature_files):
        #print("to extract features...")
        return self.image_loader.extract_features_vgg19(
            self.trained_model, image_files, feature_files, self.batch_size)

    def load_train_data(self, with_image):
        # Load data
        #pos: oracle, neg: generated samples
        data = np.load(self.config.temp_generate_file).item()
        #data = {'feature_files': feature_files, 'real_samples': real_samples, 'generated_samples': generated_samples}
        positive_examples = data['real_samples']
        negative_examples = data['generated_samples']
        feature_files = data['feature_files']
        image_files = data['image_files']

        if with_image:  # same order as positive and negative examples
            feature_files = np.concatenate([feature_files, feature_files], 0)
            image_files = np.concatenate([image_files, image_files], 0)

        # Generate labels
        positive_labels = [[0, 1] for _ in positive_examples]
        negative_labels = [[1, 0] for _ in negative_examples]
        self.labels = np.concatenate([positive_labels, negative_labels], 0)

        # Split batches

        #self.sentences = np.array(positive_examples + negative_examples)
        self.sentences = np.concatenate([positive_examples, negative_examples],
                                        0)
        self.num_batch = int(len(self.labels) / self.batch_size)
        self.sentences = self.sentences[:self.num_batch * self.batch_size]
        self.labels = self.labels[:self.num_batch * self.batch_size]
        #debug = self.labels
        if with_image:

            feature_files = feature_files[:self.num_batch * self.batch_size]
            image_files = image_files[:self.num_batch * self.batch_size]
            self.labels, self.sentences, feature_files, image_files = shuffle(
                self.labels, self.sentences, feature_files, image_files)

            self.feature_batch = np.split(np.array(feature_files),
                                          self.num_batch, 0)
            self.image_batch = np.split(np.array(image_files), self.num_batch,
                                        0)
        else:
            self.labels, self.sentences = shuffle(self.labels, self.sentences)

        self.sentences_batches = np.split(self.sentences, self.num_batch, 0)
        self.labels_batches = np.split(self.labels, self.num_batch, 0)

        self.pointer = 0

    def print_sample(self, array):
        # Debug helper; the early return below disables sample printing.
        return
        for i in range(10):
            print(str(array[i]))

    def next_batch(self):
        sent = self.sentences_batches[self.pointer]
        lab = self.labels_batches[self.pointer]
        imgs = None
        features = None
        if self.image_batch:
            imgs = self.image_batch[self.pointer]
            feature_files = self.feature_batch[self.pointer]
            features = self.get_imagefeatures_vgg19(imgs, feature_files)
        else:
            print("no image files")

        self.pointer = (self.pointer + 1) % self.num_batch
        return sent, lab, features

    def reset_pointer(self):
        self.pointer = 0
Example #16
class BaseModel(object):
    def __init__(self, config):
        self.config = config
        #self.is_train = True
        self.is_train = (config.phase == 'train')
        self.train_cnn = self.is_train and config.train_cnn
        self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
        self.image_shape = [224, 224, 3]
        self.nn = NN(config)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        self.build()

    def get_imagefeatures(self, image_files, batch_size):
        return self.image_loader.extract_features_vgg19(
            self.trained_model, image_files, batch_size)

    def build(self):
        # use pretrained vgg model to extract image features
        net = VGG19(weights='imagenet')
        self.trained_model = Model(input=net.input,
                                   output=net.get_layer('fc2').output)
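        # In addition to the VGG19 features, the MXNet modules below supply object,
        # sentiment and scene features; the weight files come from the img2poem project.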

        self.object_model = self.get_mod()
        self.object_model.load_params('./img2poem/model/object.params')

        self.sentiment_model = self.get_mod(sym = img2poem.symbol_sentiment.get_sym(), img_len = 227)
        self.sentiment_model.load_params('./img2poem/model/Sentiment.params')

        self.scene_model = self.get_mod()
        self.scene_model.load_params('./img2poem/model/scene.params')
        #self.sentiment_model = Model(input= net.input, output= net.get_layer('block3_conv1').output)

    def get_mod(self, output_name = 'relu7_output', sym = None, img_len = 224):
        if sym is None:
            vgg = VGG()
            sym = vgg.get_symbol(num_classes = 1000, 
                      blocks = [(2, 64),
                                (2, 128),
                                (3, 256), 
                                (3, 512),
                                (3, 512)])
            internals = sym.get_internals()
            sym = internals[output_name]
        ctx = mx.cpu()
        mod = mx.module.Module(
                context = ctx,
                symbol = sym,
                data_names = ("data", ),
                label_names = ()
        )

        mod.bind(data_shapes = [("data", (1, 3, img_len, img_len))], for_training = False)

        return mod

    def train(self, sess, train_data):
        raise NotImplementedError()

    def eval(self, sess, eval_gt_coco, eval_data, vocabulary):
        """ Evaluate the model using the COCO val2014 data. """
        print("Evaluating the model ...")
        config = self.config

        results = []
        if not os.path.exists(config.eval_result_dir):
            os.mkdir(config.eval_result_dir)

        # Generate the captions for the images
        idx = 0
        for k in tqdm(list(range(eval_data.num_batches)), desc='batch'):
        #for k in range(1):
            batch = eval_data.next_batch()
            #caption_data = self.beam_search(sess, batch, vocabulary)
            images = self.image_loader.load_images(batch)
            caption_data, scores = sess.run([self.predictions, self.probs], feed_dict={self.images: images})
            fake_cnt = 0 if k<eval_data.num_batches-1 \
                         else eval_data.fake_count
            for l in range(eval_data.batch_size-fake_cnt):
                ## self.predictions returns word indexes; look up the corresponding words in the vocabulary.
                word_idxs = caption_data[l]
                ## get_sentence returns the sentence up to the end delimiter, which is '.'.
                caption = str(vocabulary.get_sentence(word_idxs))
                results.append({'image_id': int(eval_data.image_ids[idx]),
                                'caption': caption})
                #print(results)
                idx += 1

                # Save the result in an image file, if requested
                if config.save_eval_result_as_image:
                    image_file = batch[l]
                    image_name = image_file.split(os.sep)[-1]
                    image_name = os.path.splitext(image_name)[0]
                    img = mpimg.imread(image_file)
                    plt.imshow(img)
                    plt.axis('off')
                    plt.title(caption)
                    plt.savefig(os.path.join(config.eval_result_dir,
                                             image_name+'_result.jpg'))

        fp = open(config.eval_result_file, 'w')
        json.dump(results, fp)
        fp.close()

        # Evaluate these captions
        eval_result_coco = eval_gt_coco.loadRes(config.eval_result_file)
        scorer = COCOEvalCap(eval_gt_coco, eval_result_coco)
        scorer.evaluate()
        print("Evaluation complete.")

    def test(self, sess, test_data, vocabulary):
        """ Test the model using any given images. """
        print("Testing the model ...")
        config = self.config

        if not os.path.exists(config.test_result_dir):
            os.mkdir(config.test_result_dir)

        captions = []
        scores = []

        # Generate the captions for the images
        for k in tqdm(list(range(test_data.num_batches)), desc='path'):
            batch = test_data.next_batch()
            images = self.image_loader.load_images(batch)
            caption_data,scores_data = sess.run([self.predictions,self.probs],feed_dict={self.images:images})

            fake_cnt = 0 if k<test_data.num_batches-1 \
                         else test_data.fake_count
            for l in range(test_data.batch_size-fake_cnt):
                ## self.predictions returns word indexes; look up the corresponding words in the vocabulary.
                word_idxs = caption_data[l]
                ## get_sentence returns the sentence up to the end delimiter, which is '.'.
                caption = vocabulary.get_sentence(word_idxs)
                print(caption)
                captions.append(caption)
                scores.append(scores_data[l])

                # Save the result in an image file
                image_file = batch[l]
                image_name = image_file.split(os.sep)[-1]
                image_name = os.path.splitext(image_name)[0]
                img = mpimg.imread(image_file)
                plt.imshow(img)
                plt.axis('off')
                plt.title(caption)
                plt.savefig(os.path.join(config.test_result_dir,
                                         image_name+'_result.jpg'))

        ## Save the captions to a file
        results = pd.DataFrame({'image_files':test_data.image_files,
                                'caption':captions,
                                'prob':scores})
        results.to_csv(config.test_result_file)
        print("Testing complete.")

    def save(self):
        """ Save the model. """
        config = self.config
        data = {v.name: v.eval() for v in tf.global_variables()}
        save_path = os.path.join(config.save_dir, str(self.global_step.eval()))

        print((" Saving the model to %s..." % (save_path+".npy")))
        np.save(save_path, data)
        info_file = open(os.path.join(config.save_dir, "config.pickle"), "wb")
        config_ = copy.copy(config)
        config_.global_step = self.global_step.eval()
        pickle.dump(config_, info_file)
        info_file.close()
        print("Model saved.")

    def load(self, sess, model_file=None):
        """ Load the model. """
        config = self.config
        if model_file is not None:
            save_path = model_file
        else:
            info_path = os.path.join(config.save_dir, "config.pickle")
            info_file = open(info_path, "rb")
            config = pickle.load(info_file)
            global_step = config.global_step
            info_file.close()
            save_path = os.path.join(config.save_dir,
                                     str(global_step)+".npy")

        print("Loading the model from %s..." %save_path)
        data_dict = np.load(save_path).item()
        count = 0
        for v in tqdm(tf.global_variables()):
            if v.name in data_dict.keys():
                sess.run(v.assign(data_dict[v.name]))
                count += 1
        print("%d tensors loaded." %count)

    def load_cnn(self, session, data_path, ignore_missing=True):
        """ Load a pretrained CNN model. """
        print("All variables present...")
        for var in tf.all_variables():
            print(var)
        with tf.variable_scope('conv1_1',reuse = True):
            kernel = tf.get_variable('conv1_1_W')

        print("Loading the CNN from %s..." %data_path)
        data_dict = np.load(data_path, encoding='latin1')
        count = 0
        for param_name in tqdm(data_dict.keys()):
            op_name = param_name[:-2]
            print(param_name)
            #print(op_name)
            with tf.variable_scope(op_name, reuse = True):
                try:
                    var = tf.get_variable(param_name)
                    session.run(var.assign(data_dict[param_name]))
                    count += 1
                except ValueError:
                    print("No such variable")
                    pass

        print("%d tensors loaded." %count)
Example #17
0
class DataTestLoader():
    def __init__(self, config, batch_size, end_token=0):
        self.config = config
        self.batch_size = batch_size
        self.end_token = end_token
        self.image_batch = None
        self.feature_batch = None
        self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
        net = VGG19(weights='imagenet')
        self.trained_model = Model(input=net.input,
                                   output=net.get_layer('fc2').output)

    def get_imagefeatures_vgg19(self, image_files, feature_files):
        #print("to extract features...")
        return self.image_loader.extract_features_vgg19(
            self.trained_model, image_files, feature_files,
            self.batch_size)  #extract image features using vgg19

    def next_batch(self):
        imgs = None
        feat_file = None
        conv = None

        if self.image_batch is not None:
            imgs = self.image_batch[self.pointer]
            feat_file = self.feature_batch[self.pointer]
            conv = np.array(self.get_imagefeatures_vgg19(imgs, feat_file))
        else:
            print("no image files")

        self.pointer = (self.pointer + 1) % self.num_batch

        return imgs, feat_file, conv

    def reset_pointer(self):
        self.pointer = 0

    def create_batches(self, with_image=True):

        self.pointer = 0
        config = self.config

        if with_image:
            data = pd.read_csv(config.test_temp_file)
            image_files = []
            feature_files = []
            for _, img, feat in data.values:
                image_files.append(img)
                feature_files.append(feat)
            #print("len image files: " + str(len(image_files)))
            #print("len feature files: " + str(len(feature_files)))
            self.num_batch = int(len(image_files) / self.batch_size)
            #print("num batch" + str(self.num_batch))

            image_files = image_files[:self.num_batch * self.batch_size]
            feature_files = feature_files[:self.num_batch * self.batch_size]

            #print("len image files: " + str(len(image_files)))
            #print("len feature files: " + str(len(feature_files)))

            self.image_batch = np.array(
                np.split(np.array(image_files), self.num_batch, 0))
            self.feature_batch = np.array(
                np.split(np.array(feature_files), self.num_batch, 0))

        else:
            image_files = None
            feature_files = None

    def get_sample_features(self):
        data = np.load(self.config.temp_sample_image_file).item()
        imgs = data['images']
        features = data['features']

        return imgs, features

    def reset_image_pointer(self):
        self.image_pointer = 0
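
A hedged usage sketch (the config object and batch size are assumptions) of how DataTestLoader above is typically driven: build the batches once, then iterate them for inference.

loader = DataTestLoader(config, batch_size=16)
loader.create_batches(with_image=True)
for _ in range(loader.num_batch):
    imgs, feat_files, conv_feats = loader.next_batch()
    # imgs/feat_files: batches of file names; conv_feats: VGG19 fc2 features
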
Example #18
0
class DataLoader():
    def __init__(self, config, batch_size, seq_length, end_token=0):
        self.config = config
        self.batch_size = batch_size
        self.token_stream = []
        self.seq_length = seq_length
        self.end_token = end_token
        self.image_batch = None
        self.feature_batch = None
        self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
        net = VGG19(weights='imagenet')
        self.trained_model = Model(input=net.input,
                                   output=net.get_layer('fc2').output)

    def get_imagefeatures_vgg19(self, image_files, feature_files):
        #print("to extract features...")
        return self.image_loader.extract_features_vgg19(
            self.trained_model, image_files, feature_files,
            self.batch_size)  #extract image features using vgg19

    def next_batch(self):
        seq = self.sequence_batch[self.pointer]
        imgs = None
        feat_file = None
        conv = None

        if self.image_batch is not None:
            imgs = self.image_batch[self.pointer]
            feat_file = self.feature_batch[self.pointer]
            conv = np.array(self.get_imagefeatures_vgg19(imgs, feat_file))
        else:
            print("no image files")

        self.pointer = (self.pointer + 1) % self.num_batch

        return seq, imgs, feat_file, conv

    def reset_pointer(self):
        self.pointer = 0

    def create_shuffled_batches(self, with_image=True):

        self.pointer = 0
        config = self.config

        data = np.load(config.temp_data_file).item()
        word_idxs = data['word_idxs']
        sent_lens = data['sentence_len']
        print("len word_idxs: " + str(len(word_idxs)))

        self.num_batch = int(len(word_idxs) / self.batch_size)
        print("num batch " + str(self.num_batch))
        print('batch_size: ' + str(self.batch_size))
        print(self.num_batch * self.batch_size)
        word_idxs = word_idxs[:self.num_batch * self.batch_size]

        #self.pointer = 0

        if with_image:
            with open(config.temp_image_file) as ifile:
                image_files = ifile.read().splitlines()
            with open(config.temp_feature_file) as ffile:
                feature_files = ffile.read().splitlines()

            image_files = image_files[:self.num_batch * self.batch_size]
            feature_files = feature_files[:self.num_batch * self.batch_size]

            print("len image files: " + str(len(image_files)))
            print("len feature files: " + str(len(feature_files)))
            print("len word_idxs: " + str(len(word_idxs)))

            word_idxs, feature_files, image_files = shuffle(
                word_idxs, feature_files, image_files)

            self.sequence_batch = np.array(
                np.split(word_idxs, self.num_batch, 0))
            self.image_batch = np.array(
                np.split(np.array(image_files), self.num_batch, 0))
            self.feature_batch = np.array(
                np.split(np.array(feature_files), self.num_batch, 0))

        else:
            image_files = None
            feature_files = None
            word_idxs = shuffle(word_idxs)
            self.sequence_batch = np.split(word_idxs, self.num_batch, 0)

        if with_image:
            print('shape of sequence_batch: ' + str(self.sequence_batch.shape))
            print('shape of image: ' + str(self.image_batch.shape))
            print('shape of features: ' + str(self.feature_batch.shape))

    def get_sample_features(self):
        data = np.load(self.config.temp_sample_image_file).item()
        imgs = data['images']
        features = data['features']

        return imgs, features

    def reset_image_pointer(self):
        self.image_pointer = 0
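
The shuffle call used throughout these loaders is presumably sklearn.utils.shuffle; a minimal sketch of its key property, namely that parallel sequences are permuted with the same random order, so captions, feature files, and image files stay aligned:

from sklearn.utils import shuffle

a, b = shuffle([1, 2, 3], ['x', 'y', 'z'], random_state=0)
print(a, b)  # both sequences are reordered by the same permutation
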
Example #19
0
class BaseModel(object):
    def __init__(self, config):
        self.config = config
        self.is_train = True if config.phase == 'train' else False
        self.train_cnn = self.is_train and config.train_cnn
        self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
        self.image_shape = [224, 224, 3]
        self.nn = NN(config)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.build()

    def build(self):
        raise NotImplementedError()

    def train(self, sess, train_data):
        """ Train the model using the COCO train2014 data. """
        print("Training the model...")
        config = self.config

        if not os.path.exists(config.summary_dir):
            os.mkdir(config.summary_dir)
        train_writer = tf.summary.FileWriter(config.summary_dir, sess.graph)

        for _ in tqdm(list(range(config.num_epochs)), desc='epoch'):
            for _ in tqdm(list(range(train_data.num_batches)), desc='batch'):
                batch = train_data.next_batch()
                image_files, sentences, masks = batch
                images = self.image_loader.load_images(image_files)
                feed_dict = {
                    self.images: images,
                    self.sentences: sentences,
                    self.masks: masks
                }
                _, summary, global_step = sess.run(
                    [self.opt_op, self.summary, self.global_step],
                    feed_dict=feed_dict)
                if (global_step + 1) % config.save_period == 0:
                    self.save()
                train_writer.add_summary(summary, global_step)
            train_data.reset()

        self.save()
        train_writer.close()
        print("Training complete.")

    def eval(self, sess, eval_gt_coco, eval_data, vocabulary):
        """ Evaluate the model using the COCO val2014 data. """
        print("Evaluating the model ...")
        config = self.config

        results = []
        if not os.path.exists(config.eval_result_dir):
            os.mkdir(config.eval_result_dir)

        # Generate the captions for the images
        idx = 0
        for k in tqdm(list(range(eval_data.num_batches)), desc='batch'):
            batch = eval_data.next_batch()
            caption_data = self.beam_search(sess, batch, vocabulary)

            fake_cnt = 0 if k<eval_data.num_batches-1 \
                         else eval_data.fake_count
            for l in range(eval_data.batch_size - fake_cnt):
                word_idxs = caption_data[l][0].sentence
                score = caption_data[l][0].score
                caption = vocabulary.get_sentence(word_idxs)
                results.append({
                    'image_id': int(eval_data.image_ids[idx]),
                    'caption': caption
                })
                idx += 1

                # Save the result in an image file, if requested
                if config.save_eval_result_as_image:
                    image_file = batch[l]
                    image_name = image_file.split(os.sep)[-1]
                    image_name = os.path.splitext(image_name)[0]
                    img = plt.imread(image_file)
                    plt.imshow(img)
                    plt.axis('off')
                    plt.title(caption)
                    plt.savefig(
                        os.path.join(config.eval_result_dir,
                                     image_name + '_result.jpg'))

        fp = open(config.eval_result_file, 'w')
        json.dump(results, fp)
        fp.close()

        # Evaluate these captions
        eval_result_coco = eval_gt_coco.loadRes(config.eval_result_file)
        scorer = COCOEvalCap(eval_gt_coco, eval_result_coco)
        scorer.evaluate()
        print("Evaluation complete.")

    def test(self, sess, test_data, vocabulary):
        """ Test the model using any given images. """
        print("Testing the model ...")
        config = self.config

        if not os.path.exists(config.test_result_dir):
            os.mkdir(config.test_result_dir)

        captions = []
        scores = []

        # Generate the captions for the images
        for k in tqdm(list(range(test_data.num_batches)), desc='path'):
            batch = test_data.next_batch()
            caption_data = self.beam_search(sess, batch, vocabulary)

            fake_cnt = 0 if k<test_data.num_batches-1 \
                         else test_data.fake_count
            for l in range(test_data.batch_size - fake_cnt):
                word_idxs = caption_data[l][0].sentence
                score = caption_data[l][0].score
                caption = vocabulary.get_sentence(word_idxs)
                captions.append(caption)
                scores.append(score)

                # Save the result in an image file
                image_file = batch[l]

                # image_name = image_file.split(os.sep)[-1]
                # image_name = os.path.splitext(image_name)[0]

                image_name = os.path.basename(image_file)
                image_name = os.path.splitext(image_name)[0]

                img = plt.imread(image_file)
                plt.imshow(img)
                plt.axis('off')
                plt.title(caption)
                plt.savefig(
                    os.path.join(config.test_result_dir,
                                 image_name + '_result.jpg'))

        # Save the captions to a file
        results = pd.DataFrame({
            'image_files': test_data.image_files,
            'caption': captions,
            'prob': scores
        })
        results.to_csv(config.test_result_file)
        print("Testing complete.")

    def beam_search(self, sess, image_files, vocabulary):
        """Use beam search to generate the captions for a batch of images."""
        # Feed in the images to get the contexts and the initial LSTM states
        config = self.config
        images = self.image_loader.load_images(image_files)
        contexts, initial_memory, initial_output = sess.run(
            [self.conv_feats, self.initial_memory, self.initial_output],
            feed_dict={self.images: images})

        partial_caption_data = []
        complete_caption_data = []
        for k in range(config.batch_size):
            initial_beam = CaptionData(sentence=[],
                                       memory=initial_memory[k],
                                       output=initial_output[k],
                                       score=1.0)
            partial_caption_data.append(TopN(config.beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(config.beam_size))

        # Run beam search
        for idx in range(config.max_caption_length):
            partial_caption_data_lists = []
            for k in range(config.batch_size):
                data = partial_caption_data[k].extract()
                partial_caption_data_lists.append(data)
                partial_caption_data[k].reset()

            num_steps = 1 if idx == 0 else config.beam_size
            for b in range(num_steps):
                if idx == 0:
                    last_word = np.zeros((config.batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)

                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)

                memory, output, scores = sess.run(
                    [self.memory, self.output, self.probs],
                    feed_dict={
                        self.contexts: contexts,
                        self.last_word: last_word,
                        self.last_memory: last_memory,
                        self.last_output: last_output
                    })

                # Find the beam_size most probable next words
                for k in range(config.batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    words_and_scores = list(enumerate(scores[k]))
                    words_and_scores.sort(key=lambda x: -x[1])
                    words_and_scores = words_and_scores[0:config.beam_size + 1]

                    # Append each of these words to the current partial caption
                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = CaptionData(sentence, memory[k], output[k],
                                           score)
                        if vocabulary.words[w] == '.':
                            complete_caption_data[k].push(beam)
                        else:
                            partial_caption_data[k].push(beam)

        results = []
        for k in range(config.batch_size):
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))

        return results

    def save(self):
        """ Save the model. """
        config = self.config
        data = {v.name: v.eval() for v in tf.global_variables()}
        save_path = os.path.join(config.save_dir, str(self.global_step.eval()))

        print((" Saving the model to %s..." % (save_path + ".npy")))
        np.save(save_path, data)
        info_file = open(os.path.join(config.save_dir, "config.pickle"), "wb")
        config_ = copy.copy(config)
        config_.global_step = self.global_step.eval()
        pickle.dump(config_, info_file)
        info_file.close()
        print("Model saved.")

    def load(self, sess, model_file=None):
        """ Load the model. """
        config = self.config
        if model_file is not None:
            save_path = model_file
        else:
            info_path = os.path.join(config.save_dir, "config.pickle")
            info_file = open(info_path, "rb")
            config = pickle.load(info_file)
            global_step = config.global_step
            info_file.close()
            save_path = os.path.join(config.save_dir,
                                     str(global_step) + ".npy")

        print("Loading the model from %s..." % save_path)
        data_dict = np.load(save_path).item()
        count = 0
        for v in tqdm(tf.global_variables()):
            if v.name in data_dict.keys():
                sess.run(v.assign(data_dict[v.name]))
                count += 1
        print("%d tensors loaded." % count)

    def load_cnn(self, session, data_path, ignore_missing=True):
        """ Load a pretrained CNN model. """
        print("Loading the CNN from %s..." % data_path)
        data_dict = np.load(data_path).item()
        count = 0
        for op_name in tqdm(data_dict):
            with tf.variable_scope(op_name, reuse=True):
                for param_name, data in data_dict[op_name].items():
                    try:
                        var = tf.get_variable(param_name)
                        session.run(var.assign(data))
                        count += 1
                    except ValueError:
                        pass
        print("%d tensors loaded." % count)
Example #20
0
class BaseModel(object):
    def __init__(self, config):
        self.config = config
        self.is_train = True if config.phase == 'train' else False
        self.train_cnn = self.is_train and config.train_cnn
        self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
        self.image_shape = [224, 224, 3]
        self.nn = NN(config)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.build()

    def build(self):
        raise NotImplementedError()

    def train(self, sess, train_data):
        """ Train the model using the COCO train2014 data. """
        print("Training the model...")
        config = self.config

        if not os.path.exists(config.summary_dir):
            os.mkdir(config.summary_dir)
        train_writer = tf.summary.FileWriter(config.summary_dir, sess.graph)

        for _ in tqdm(list(range(config.num_epochs)), desc='epoch'):
            for _ in tqdm(list(range(train_data.num_batches)), desc='batch'):
                batch = train_data.next_batch()
                image_files, sentences, masks = batch
                images = self.image_loader.load_images(image_files)
                feed_dict = {
                    self.images: images,
                    self.sentences: sentences,
                    self.masks: masks
                }
                # _, summary, global_step = sess.run([self.opt_op,
                #                                     self.summary,
                #                                     self.global_step],
                #                                     feed_dict=feed_dict)
                _, global_step = sess.run([self.opt_op, self.global_step],
                                          feed_dict=feed_dict)
                if (global_step + 1) % config.save_period == 0:
                    self.save()
                #train_writer.add_summary(summary, global_step)
            train_data.reset()

        self.save()
        train_writer.close()
        print("Training complete.")

    def eval(self, sess, eval_gt_coco, eval_data, vocabulary):
        """ Evaluate the model using the COCO val2014 data. """
        print("Evaluating the model ...")
        config = self.config

        results = []
        if not os.path.exists(config.eval_result_dir):
            os.mkdir(config.eval_result_dir)

        # Generate the captions for the images
        idx = 0
        for k in tqdm(list(range(eval_data.num_batches)), desc='batch'):
            #for k in range(1):
            batch = eval_data.next_batch()
            #caption_data = self.beam_search(sess, batch, vocabulary)
            images = self.image_loader.load_images(batch)
            caption_data, scores = sess.run([self.predictions, self.probs],
                                            feed_dict={self.images: images})
            fake_cnt = 0 if k<eval_data.num_batches-1 \
                         else eval_data.fake_count
            for l in range(eval_data.batch_size - fake_cnt):
                ## self.predictions returns word indexes; look up the corresponding words in the vocabulary.
                word_idxs = caption_data[l]
                ## get_sentence returns the sentence up to the end delimiter, which is '.'.
                caption = str(vocabulary.get_sentence(word_idxs))
                results.append({
                    'image_id': int(eval_data.image_ids[idx]),
                    'caption': caption
                })
                #print(results)
                idx += 1

                # Save the result in an image file, if requested
                if config.save_eval_result_as_image:
                    image_file = batch[l]
                    image_name = image_file.split(os.sep)[-1]
                    image_name = os.path.splitext(image_name)[0]
                    img = mpimg.imread(image_file)
                    plt.imshow(img)
                    plt.axis('off')
                    plt.title(caption)
                    plt.savefig(
                        os.path.join(config.eval_result_dir,
                                     image_name + '_result.jpg'))

        fp = open(config.eval_result_file, 'w')
        json.dump(results, fp)
        fp.close()

        # Evaluate these captions
        eval_result_coco = eval_gt_coco.loadRes(config.eval_result_file)
        scorer = COCOEvalCap(eval_gt_coco, eval_result_coco)
        scorer.evaluate()
        print("Evaluation complete.")

    def test(self, sess, test_data, vocabulary):
        """ Test the model using any given images. """
        print("Testing the model ...")
        config = self.config

        if not os.path.exists(config.test_result_dir):
            os.makedirs(config.test_result_dir)

        captions = []
        scores = []

        # Generate the captions for the images
        for k in tqdm(list(range(test_data.num_batches)), desc='path'):
            batch = test_data.next_batch()
            images = self.image_loader.load_images(batch)
            caption_data, scores_data = sess.run(
                [self.predictions, self.probs],
                feed_dict={self.images: images})

            fake_cnt = 0 if k<test_data.num_batches-1 \
                         else test_data.fake_count
            for l in range(test_data.batch_size - fake_cnt):
                ## self.predictions returns word indexes; look up the corresponding words in the vocabulary.
                word_idxs = caption_data[l]
                ## get_sentence returns the sentence up to the end delimiter, which is '.'.
                caption = vocabulary.get_sentence(word_idxs)
                print(caption)
                captions.append(caption)
                scores.append(scores_data[l])

                # Save the result in an image file
                image_file = batch[l]
                image_name = image_file.split("/")[-1]
                image_name = os.path.splitext(image_name)[0]
                img = mpimg.imread(image_file)
                plt.imshow(img)
                plt.axis('off')
                plt.title(caption)
                plt.savefig(
                    os.path.join(config.test_result_dir,
                                 image_name + '_result.jpg'))

        ## Save the captions to a file
        results = pd.DataFrame({
            'image_files': test_data.image_files,
            'caption': captions,
            'prob': scores
        })
        results.to_csv(config.test_result_file)
        print("Testing complete.")

    def save(self):
        """ Save the model. """
        config = self.config
        data = {v.name: v.eval() for v in tf.global_variables()}
        save_path = os.path.join(config.save_dir, str(self.global_step.eval()))

        print((" Saving the model to %s..." % (save_path + ".npy")))
        if not os.path.exists(os.path.dirname(save_path)):
            try:
                os.makedirs(os.path.dirname(save_path))
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        np.save(save_path, data)
        info_file = open(os.path.join(config.save_dir, "config.pickle"), "wb")
        config_ = copy.copy(config)
        config_.global_step = self.global_step.eval()
        pickle.dump(config_, info_file)
        info_file.close()
        print("Model saved.")

    def load(self, sess, model_file=None):
        """ Load the model. """
        config = self.config
        if model_file is not None:
            save_path = model_file
        else:
            info_path = os.path.join(config.save_dir, "config.pickle")
            info_file = open(info_path, "rb")
            config = pickle.load(info_file)
            global_step = config.global_step
            info_file.close()
            save_path = os.path.join(config.save_dir,
                                     str(global_step) + ".npy")

        print("Loading the model from %s..." % save_path)
        data_dict = np.load(save_path, allow_pickle=True).item()
        count = 0
        for v in tqdm(tf.global_variables()):
            if v.name in data_dict.keys():
                sess.run(v.assign(data_dict[v.name]))
                count += 1
        print("%d tensors loaded." % count)

    def load_cnn(self, session, data_path, ignore_missing=True):
        """ Load a pretrained CNN model. """
        print("All variables present...")
        for var in tf.all_variables():
            print(var)
        with tf.variable_scope('conv1_1', reuse=True):
            kernel = tf.get_variable('conv1_1_W')

        print("Loading the CNN from %s..." % data_path)
        data_dict = np.load(data_path, encoding='latin1')
        count = 0
        for param_name in tqdm(data_dict.keys()):
            op_name = param_name[:-2]
            print(param_name)
            #print(op_name)
            with tf.variable_scope(op_name, reuse=True):
                try:
                    var = tf.get_variable(param_name)
                    session.run(var.assign(data_dict[param_name]))
                    count += 1
                except ValueError:
                    print("No such variable")
                    pass

        print("%d tensors loaded." % count)
Example #21
0
def get_feats(dir=False):

    train_caption_file = 'D:/download/art_desc/train/ann.csv'
    train_features_dir = 'D:/download/art_desc/train/images_vgg_redo/'

    eval_caption_file = 'D:/download/art_desc/val/ann.csv'
    #eval_image_dir = 'D:/download/art_desc/val/test_images/'
    eval_image_dir = 'D:/download/art_desc/val/images_redo/'

    eval_features_dir = 'D:/download/art_desc/val/images_vgg_redo/'
    #eval_features_dir = 'D:/download/art_desc/val/test_vgg/'
    image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')

    ignore_file = 'D:/download/art_desc/val/ignore.csv'

    net = VGG19(weights='imagenet')
    model = Model(input=net.input, output=net.get_layer('fc2').output)

    bad_ids = []
    prev_id = 0
    prev_bad = False

    if dir:
        with open(eval_caption_file, 'r') as f:  #caption file
            reader = csv.reader(f)
            for id, file_name, caption in reader:
                try:

                    img = image_loader.load_image(file_name)
                    '''
					fc2 = model.predict(img)
					reshaped = np.reshape(fc2, (4096))
					np.save(eval_features_dir + 'art_desc'+ str(id), reshaped)
					'''
                    prev_bad = False
                except Exception:
                    if id != prev_id or prev_bad is False:
                        print("cannot identify image file:" + file_name)
                        bad_ids.append(id)
                    prev_bad = True
                prev_id = id

    else:
        with open(train_caption_file, 'r') as f:  #caption file
            reader = csv.reader(f)
            for id, file_name, caption in reader:
                try:
                    img = image_loader.load_image(file_name)
                    '''
					fc2 = model.predict(img)
					reshaped = np.reshape(fc2, (4096))
					np.save(train_features_dir + 'art_desc'+ id, reshaped) #feature dir
					'''
                    prev_bad = False
                except Exception:
                    if id != prev_id or prev_bad is False:
                        print("cannot identify image file:" + file_name)
                        bad_ids.append(id)
                    prev_bad = True

                prev_id = id

    print("Total bad image files:%d" % len(bad_ids))
    data = pd.DataFrame({'index': bad_ids})
    data.to_csv(ignore_file)
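
A hedged, self-contained sketch of what the commented-out blocks inside get_feats perform: preprocess one image, run it through VGG19 up to the fc2 layer, and save the 4096-d feature vector. The file paths are placeholders, and newer Keras versions spell the Model arguments inputs/outputs.

import numpy as np
from keras.applications.vgg19 import VGG19, preprocess_input
from keras.preprocessing import image
from keras.models import Model

net = VGG19(weights='imagenet')
fc2_model = Model(inputs=net.input, outputs=net.get_layer('fc2').output)

img = image.load_img('example.jpg', target_size=(224, 224))
x = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))
np.save('example_fc2.npy', np.reshape(fc2_model.predict(x), (4096,)))
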
Example #22
0
class BaseModel(object):
    def __init__(self, config):
        self.config = config
        self.is_train = True if config.phase == 'train' else False
        self.train_cnn = self.is_train and config.train_cnn
        self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
        self.image_shape = [224, 224, 3]
        self.nn = NN(config)
        self.global_step = tf.train.get_or_create_global_step()
        self.build()

    def build(self):
        raise NotImplementedError()

    def train(self, sess, train_data):
        """ Train the model using the COCO train2014 data. """
        print("Training the model...")
        config = self.config
        run_metadata = None
        run_options = None
        if config.save_timeline:
            run_metadata = tf.RunMetadata()
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

        for _ in tqdm(list(range(config.num_epochs)), desc='epoch'):
            for _ in tqdm(list(range(train_data.num_batches)), desc='batch'):
                batch = train_data.next_batch()
                image_files, sentences, masks = batch
                images = self.image_loader.load_images(image_files)
                feed_dict = {
                    self.images: images,
                    self.sentences: sentences,
                    self.masks: masks
                }
                _, summary, global_step = sess.run(
                    [self.opt_op, self.summary, self.global_step],
                    feed_dict=feed_dict,
                    options=run_options,
                    run_metadata=run_metadata)
                if (global_step + 1) % config.save_period == 0:
                    self.save(sess._tf_sess())

                    if config.save_timeline:
                        tl = timeline.Timeline(run_metadata.step_stats)
                        ctf = tl.generate_chrome_trace_format()
                        with open(
                                os.path.join(config.summary_dir,
                                             'timeline-%d.json' % global_step),
                                'w') as wd:
                            wd.write(ctf)
            train_data.reset()
        self.save(sess._tf_sess())
        print("Training complete.")

    def eval(self, sess, eval_gt_coco, eval_data, vocabulary):
        """ Evaluate the model using the COCO val2014 data. """
        print("Evaluating the model ...")
        config = self.config

        results = []
        if not os.path.exists(config.eval_result_dir):
            os.mkdir(config.eval_result_dir)

        # Generate the captions for the images
        idx = 0
        for k in tqdm(list(range(eval_data.num_batches)), desc='batch'):
            batch = eval_data.next_batch()
            caption_data = self.beam_search(sess, batch, vocabulary)

            fake_cnt = 0 if k<eval_data.num_batches-1 \
                         else eval_data.fake_count
            for l in range(eval_data.batch_size - fake_cnt):
                word_idxs = caption_data[l][0].sentence
                score = caption_data[l][0].score
                caption = vocabulary.get_sentence(word_idxs)
                results.append({
                    'image_id': eval_data.image_ids[idx],
                    'caption': caption
                })
                idx += 1

                # Save the result in an image file, if requested
                if config.save_eval_result_as_image:
                    image_file = batch[l]
                    image_name = image_file.split(os.sep)[-1]
                    image_name = os.path.splitext(image_name)[0]
                    img = plt.imread(image_file)
                    plt.imshow(img)
                    plt.axis('off')
                    plt.title(caption)
                    plt.savefig(
                        os.path.join(config.eval_result_dir,
                                     image_name + '_result.jpg'))

        fp = open(config.eval_result_file, 'w')
        json.dump(results, fp)
        fp.close()

        # Evaluate these captions
        eval_result_coco = eval_gt_coco.loadRes(config.eval_result_file)
        scorer = COCOEvalCap(eval_gt_coco, eval_result_coco)
        scorer.evaluate()
        print("Evaluation complete.")

    def test(self, sess, test_data, vocabulary):
        """ Test the model using any given images. """
        print("Testing the model ...")
        config = self.config

        if not os.path.exists(config.test_result_dir):
            os.mkdir(config.test_result_dir)

        captions = []
        scores = []

        # Generate the captions for the images
        for k in tqdm(list(range(test_data.num_batches)), desc='path'):
            batch = test_data.next_batch()
            caption_data = self.beam_search(sess, batch, vocabulary)

            fake_cnt = 0 if k<test_data.num_batches-1 \
                         else test_data.fake_count
            for l in range(test_data.batch_size - fake_cnt):
                word_idxs = caption_data[l][0].sentence
                score = caption_data[l][0].score
                caption = vocabulary.get_sentence(word_idxs)
                captions.append(caption)
                scores.append(score)

                # Save the result in an image file
                image_file = batch[l]
                image_name = image_file.split(os.sep)[-1]
                image_name = os.path.splitext(image_name)[0]
                img = plt.imread(image_file)
                plt.imshow(img)
                plt.axis('off')
                plt.title(caption)
                plt.savefig(
                    os.path.join(config.test_result_dir,
                                 image_name + '_result.jpg'))

        # Save the captions to a file
        results = pd.DataFrame({
            'image_files': test_data.image_files,
            'caption': captions,
            'prob': scores
        })
        results.to_csv(config.test_result_file)
        print("Testing complete.")

    def beam_search(self, sess, image_files, vocabulary):
        """Use beam search to generate the captions for a batch of images."""
        # Feed in the images to get the contexts and the initial LSTM states
        config = self.config
        images = self.image_loader.load_images(image_files)
        contexts, initial_memory, initial_output = sess.run(
            [self.conv_feats, self.initial_memory, self.initial_output],
            feed_dict={self.images: images})

        partial_caption_data = []
        complete_caption_data = []
        for k in range(config.batch_size):
            initial_beam = CaptionData(sentence=[],
                                       memory=initial_memory[k],
                                       output=initial_output[k],
                                       score=1.0)
            partial_caption_data.append(TopN(config.beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(config.beam_size))

        # Run beam search
        for idx in range(config.max_caption_length):
            partial_caption_data_lists = []
            for k in range(config.batch_size):
                data = partial_caption_data[k].extract()
                partial_caption_data_lists.append(data)
                partial_caption_data[k].reset()

            num_steps = 1 if idx == 0 else config.beam_size
            for b in range(num_steps):
                if idx == 0:
                    last_word = np.zeros((config.batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)

                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)

                memory, output, scores = sess.run(
                    [self.memory, self.output, self.probs],
                    feed_dict={
                        self.contexts: contexts,
                        self.last_word: last_word,
                        self.last_memory: last_memory,
                        self.last_output: last_output
                    })

                # Find the beam_size most probable next words
                for k in range(config.batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    words_and_scores = list(enumerate(scores[k]))
                    words_and_scores.sort(key=lambda x: -x[1])
                    words_and_scores = words_and_scores[0:config.beam_size + 1]

                    # Append each of these words to the current partial caption
                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = CaptionData(sentence, memory[k], output[k],
                                           score)
                        if vocabulary.words[w] == '.':
                            complete_caption_data[k].push(beam)
                        else:
                            partial_caption_data[k].push(beam)

        results = []
        for k in range(config.batch_size):
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))

        return results

    def save(self, sess):
        """ Save the model. """
        config = self.config
        save_dir = config.save_dir
        if tf.gfile.Exists(save_dir):
            tf.gfile.DeleteRecursively(save_dir)
        print('target exporting save_dir: %s' % save_dir)

        saved_model = tf.saved_model.builder.SavedModelBuilder(save_dir)
        sess.graph._unsafe_unfinalize()
        saved_model.add_meta_graph_and_variables(sess,
                                                 [tf.saved_model.SERVING],
                                                 clear_devices=True)
        saved_model.save()
        print("Model saved.")

    def load(self, sess, model_file=None):
        """ Load the model. """
        config = self.config
        saver = tf.train.Saver()
        if model_file is not None:
            save_path = model_file
        else:
            save_path = config.checkpoint_dir

        ckpt = tf.train.latest_checkpoint(save_path)
        saver.restore(sess, ckpt)

        print("The model from %s loaded" % save_path)

    def load_cnn(self, session, data_path, ignore_missing=True):
        """ Load a pretrained CNN model. """
        print("Loading the CNN from %s..." % data_path)
        data_dict = np.load(data_path).item()
        count = 0
        for op_name in tqdm(data_dict):
            with tf.variable_scope(op_name, reuse=True):
                for param_name, data in data_dict[op_name].items():
                    try:
                        var = tf.get_variable(param_name)
                        session.run(var.assign(data))
                        count += 1
                    except ValueError:
                        pass
        print("%d tensors loaded." % count)