def load_latest_imgcap(checkpoint_path, ckpt_index=-1):
    embedding_dim = 256
    units = 512
    vocab_size = TOP_K + 1

    encoder = CNN_Encoder(embedding_dim)
    decoder = RNN_Decoder(embedding_dim, units, vocab_size)
    optimizer = tf.keras.optimizers.Adam()

    ckpt = tf.train.Checkpoint(encoder=encoder,
                               decoder=decoder,
                               optimizer=optimizer)
    ckpt_man = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=None)
    ckpt.restore(ckpt_man.checkpoints[ckpt_index])

    return encoder, decoder
def load_decoder(fname, embedding_dim, units, batch_size=BATCH_SIZE, vocab_size=vocab_size):
    decoder = RNN_Decoder(embedding_dim, units, vocab_size)
    input_shape = [(batch_size, 1),
                   (batch_size, attention_features_shape, embedding_dim),
                   (batch_size, units)]
    decoder.build(input_shape)
    decoder.load_weights(fname)
    return decoder
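A minimal usage sketch for the two loaders above, assuming TOP_K, BATCH_SIZE and attention_features_shape are defined at module level; the checkpoint directory and the 'decoder_weights.h5' filename are placeholders, not paths taken from the original code.

# Hedged sketch only; paths and filenames below are assumptions.
encoder, decoder = load_latest_imgcap('./checkpoints/train', ckpt_index=-1)

# Alternatively, rebuild just the decoder from saved Keras weights:
decoder = load_decoder('decoder_weights.h5',  # hypothetical filename
                       embedding_dim=256, units=512)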
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = CNN_Encoder(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    encoder = encoder.to(device)
    decoder = RNN_Decoder(args.embed_size, args.hidden_dims, len(vocab), args.num_layers)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Convert word ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the image and the generated caption
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
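A hedged sketch of how this PyTorch entry point might be invoked; the argument names mirror the attributes read from args above, but the default paths are assumptions rather than the project's actual locations.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--image', type=str, required=True, help='input image for captioning')
    parser.add_argument('--encoder_path', type=str, default='models/encoder.ckpt')  # assumed path
    parser.add_argument('--decoder_path', type=str, default='models/decoder.ckpt')  # assumed path
    parser.add_argument('--vocab_path', type=str, default='data/vocab.pkl')         # assumed path
    parser.add_argument('--embed_size', type=int, default=256)
    parser.add_argument('--hidden_dims', type=int, default=512)
    parser.add_argument('--num_layers', type=int, default=1)
    main(parser.parse_args())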
img_name_train, img_name_val, cap_train, cap_val = train_test_split(
    img_name_vector, cap_vector, test_size=0.1, random_state=0)

vocab_size = len(tokenizer.word_index) + 1
print('vocab_size: ' + str(vocab_size))

num_steps = len(img_name_train) // BATCH_SIZE
val_num_steps = len(img_name_val) // BATCH_SIZE

# shape of the vector extracted from InceptionV3 is (64, 2048);
# these two variables represent that
features_shape = 2048
attention_features_shape = 64

dataset = load_batch(img_name_train, cap_train, BATCH_SIZE, BUFFER_SIZE)
val_dataset = load_batch(img_name_val, cap_val, BATCH_SIZE, BUFFER_SIZE)

encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

optimizer = tf.train.AdamOptimizer()

checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=50)

start_epoch = 0
if ckpt_manager.latest_checkpoint:
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])

# keep the loss_plot list in a separate cell: if you run the training cell
# many times, the loss_plot array would otherwise be reset
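The start_epoch logic above relies on the checkpoint name ending in '-<number>'; a quick illustration with a made-up checkpoint name.

# The checkpoint name below is purely illustrative.
latest = './checkpoints/train/ckpt-7'
start_epoch = int(latest.split('-')[-1])
print(start_epoch)  # 7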
class InstgramCaptioner:

    def __init__(self, checkpoint_path, tokenizer_path, CONFIG):
        """Load weights of the encoder-decoder model from a checkpoint and load the saved tokenizer.

        Args:
            checkpoint_path (str): path to the directory containing checkpoints
            tokenizer_path (str): path to the pickle file storing the tokenizer
            CONFIG (CONFIG object): an object storing the configuration for the package
        """
        self.cnn_backbone = model_config_dict[CONFIG.CNN_BACKBONE]['model']
        self.cnn_feature_model = self._reconfigure_cnn()

        self.encoder = CNN_Encoder(CONFIG.EMBEDDING_SIZE)
        self.decoder = RNN_Decoder(CONFIG.EMBEDDING_SIZE, CONFIG.UNITS, CONFIG.VOCAB_SIZE)

        ckpt = tf.train.Checkpoint(encoder=self.encoder, decoder=self.decoder)
        ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

        # chosen_checkpoint = ckpt_manager.checkpoints[2]
        chosen_checkpoint = ckpt_manager.latest_checkpoint
        ckpt.restore(chosen_checkpoint)

        if ckpt_manager.latest_checkpoint:
            print("******** Restored from {}".format(chosen_checkpoint))
        else:
            print("******** Initializing from scratch.")

        self.tokens_manager = pickle.load(open(tokenizer_path, 'rb'))

    @timer
    def generate_caption(self, image_path):
        """Use a CNN-GRU model to predict the caption for an image.

        Args:
            image_path (str): the path to the serialized image - png/jpg/jpeg.

        Returns:
            result: a list of strings in sequence representing the predicted caption.
        """
        # max_length = 47 on this dataset
        max_length = self.tokens_manager.max_length
        print('MAX LENGTH: ', max_length)

        attention_plot = np.zeros(
            (max_length,
             model_config_dict[CONFIG.CNN_BACKBONE]['attention_features_shape']))

        # hidden.shape = [1, 512]
        # features.shape = [1, 49, 256]
        # decoder_input.shape = [1, 1]
        hidden = self.decoder.reset_state(batch_size=1)
        img = self._load_image(image_path)
        features = self._create_img_encoding(img)
        decoder_input = tf.expand_dims(
            [self.tokens_manager.tokenizer.word_index['<start>']], 0)

        result = []
        for i in range(max_length):
            # We could instead sample to add randomness to sentence creation - useful in
            # production but not for the testing here:
            # tf.random.categorical(predictions, 1, seed=42)[0][0].numpy()
            predictions, hidden, attention_weights = self.decoder(
                decoder_input, features, hidden)

            attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()

            predicted_id = np.argmax(predictions)
            result.append(
                self.tokens_manager.tokenizer.index_word[predicted_id])

            if self.tokens_manager.tokenizer.index_word[predicted_id] == '<end>':
                return result, attention_plot

            decoder_input = tf.expand_dims([predicted_id], 0)

        attention_plot = attention_plot[:len(result), :]
        return result, attention_plot

    def _create_img_encoding(self, img):
        """Encode the image using the CNN (e.g. MobileNetV2) and pass it through a fully
        connected layer to embed the image's features.

        Args:
            img: the loaded image tensor (see _load_image).

        Returns:
            features: a tensorflow Tensor of dim [batch_size, cnn_feature_shape, embedding_dim]
                (e.g. [1, 49, 256])
        """
        temp_input = tf.expand_dims(img, 0)  # this is like saying batch_size = 1
        cnn_output = self.cnn_feature_model(temp_input)
        cnn_output = tf.reshape(cnn_output,
                                (cnn_output.shape[0], -1, cnn_output.shape[3]))
        features = self.encoder(cnn_output)

        return features

    def _reconfigure_cnn(self):
        """Reconfigure the CNN architecture, removing the final (ImageNet classification) layer.

        Returns:
            tf.keras.Model: the reconfigured architecture (e.g. MobileNetV2).
        """
        model = self.cnn_backbone(include_top=False, weights='imagenet')
        new_input = model.input
        remaining_desired_architecture = model.layers[-1].output
        reconfigured_cnn = tf.keras.Model(new_input, remaining_desired_architecture)

        return reconfigured_cnn

    def _load_image(self, image_path):
        """load_image function following the convention of keras preprocessing operations,
        for consistency with the training code.

        Args:
            image_path (str): path to the serialized img - png/jpg/jpeg

        Returns:
            img: Tensor of the image resized to e.g. (224, 224)
        """
        if isinstance(image_path, str):
            img = tf.io.read_file(image_path)
            img = tf.image.decode_jpeg(img, channels=3)
        elif isinstance(image_path, np.ndarray):
            img = image_path

        img = tf.image.resize(
            img, model_config_dict[CONFIG.CNN_BACKBONE]['input_shape'])
        img = tf.keras.applications.imagenet_utils.preprocess_input(img)

        return img

    def _plot_attention(self, img, attention_plot, result):
        fig, ax = plt.subplots(figsize=(10, 10))

        len_result = len(result)
        for l in range(len_result):
            temp_att = np.resize(attention_plot[l], (8, 8))
            ax = fig.add_subplot(len_result // 2, len_result // 2, l + 1)
            ax.set_title(result[l])
            matplotlib_img = ax.imshow(img)
            ax.imshow(temp_att,
                      cmap='gray',
                      alpha=0.6,
                      extent=matplotlib_img.get_extent())

        plt.tight_layout()
        plt.savefig('attention_plot.png')  # save before show, otherwise the saved figure is blank
        plt.show()

    @timer
    def test_img_from_mscoco(self, idx, caption_filename_tuple_path, output_file='current_img.png'):
        """Test the model on an image from the downloaded dataset.

        This requires the caption_filename_tuple to have been generated and pickled using
        utils.organise_data().

        Example:
            train_captions, img_name_vector = utils.organise_data()
            caption_filename_tuple = list(zip(train_captions, img_name_vector))
            with open(os.path.join(captions_dir, 'caption_filename_tuple.pkl'), 'wb') as pickle_file:
                pickle.dump(caption_filename_tuple, pickle_file)

            tokenizer_path = os.path.join(CONFIG.CACHE_DIR_ROOT, f'{CONFIG.CNN_BACKBONE}_captions', 'coco_tokenizer.pkl')
            checkpoint_path = '/mnt/pythonfiles/models/mobilenet_v2_bahdanau/checkpoints/train/02012021-183517'
            # model 31122020-180918 shows the best results so far

            caption_bot = InstgramCaptioner(checkpoint_path, tokenizer_path, CONFIG)
            caption_filename_tuple_path = os.path.join(CONFIG.CACHE_DIR_ROOT, f'{CONFIG.CNN_BACKBONE}_captions', 'caption_filename_tuple.pkl')

            idx = int(sys.argv[1])
            caption_bot.test_img_from_mscoco(idx, caption_filename_tuple_path)

        Args:
            idx (int): index into the caption_filename_tuple to select an image for inference.
            caption_filename_tuple_path (str): path to the caption_filename_tuple.
            output_file (str, optional): path to the output_file location. Defaults to 'current_img.png'.
        """
        caption_filename_tuple = pickle.load(
            open(caption_filename_tuple_path, 'rb'))
        current_img_path = caption_filename_tuple[idx][1]

        # remove <start> and <end> tokens and convert to string
        ground_truth_caption = ' '.join(
            caption_filename_tuple[idx][0].split(' ')[1:-1])

        # forward pass on the model
        result, attention_plot = self.generate_caption(current_img_path)
        gen_caption = ' '.join(result[:-1])

        logger.info(f' The caption PREDICTED by caption_bot '.center(80, '*'))
        logger.info(gen_caption)

        logger.info(f' The LABELLED ground truth caption '.center(80, '*'))
        logger.info(ground_truth_caption)

        # cv2 operations to annotate the image with predicted and ground-truth captions
        current_img = cv2.imread(current_img_path)
        cv2.rectangle(current_img, (15, 25), (current_img.shape[1] - 15, 85),
                      (95, 95, 95), cv2.FILLED)
        cv2.putText(current_img, gen_caption, (50, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (60, 30, 255), 1, cv2.LINE_AA)
        cv2.putText(current_img, ground_truth_caption, (50, 80),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (20, 240, 10), 1, cv2.LINE_AA)
        cv2.imwrite(output_file, current_img)

        self._plot_attention(current_img, attention_plot, result)
train_dataset = train_dataset.shuffle(CONFIG.BUFFER_SIZE).batch(CONFIG.BATCH_SIZE)
train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

val_dataset = val_dataset.shuffle(CONFIG.BUFFER_SIZE).batch(CONFIG.EVAL_BATCH_SIZE)
val_dataset = val_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# ************ Model ************
# mirrored_strategy = tf.distribute.MirroredStrategy()
# with mirrored_strategy.scope():
encoder = CNN_Encoder(CONFIG.EMBEDDING_SIZE,
                      include_cnn_backbone=CONFIG.INCLUDE_CNN_IN_TRAINING)
decoder = RNN_Decoder(CONFIG.EMBEDDING_SIZE, CONFIG.UNITS, CONFIG.VOCAB_SIZE)

# ************ Optimizer ************
initial_learning_rate = CONFIG.LEARNING_RATE
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=200, decay_rate=0.96, staircase=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule,
                                     beta_1=0.9, beta_2=0.999, epsilon=1e-7)
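With staircase=True, the schedule above multiplies the learning rate by 0.96 once every 200 optimizer steps. A small sketch of the equivalent calculation, using an assumed initial learning rate of 1e-3 in place of CONFIG.LEARNING_RATE:

# lr(step) = initial_learning_rate * decay_rate ** floor(step / decay_steps)
def decayed_lr(step, initial_lr=1e-3, decay_steps=200, decay_rate=0.96):
    # initial_lr is an assumed value; the real script uses CONFIG.LEARNING_RATE
    return initial_lr * decay_rate ** (step // decay_steps)

print(decayed_lr(0))    # 0.001
print(decayed_lr(400))  # 0.001 * 0.96**2 ~= 0.0009216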
class EvaluationHandler:

    def __init__(self, loss_object, tokenizer, checkpoint_path=None):
        print('Setting up Evaluation Handler')
        self.tokenizer = tokenizer
        self.loss_object = loss_object
        self.special_tokens = ['<unk>', '<pad>', '<end>', '<start>']
        self.checkpoint_path = checkpoint_path

        if self.checkpoint_path is not None:
            self.encoder = CNN_Encoder(CONFIG.EMBEDDING_SIZE)
            self.decoder = RNN_Decoder(
                CONFIG.EMBEDDING_SIZE, CONFIG.UNITS, CONFIG.VOCAB_SIZE)

            ckpt = tf.train.Checkpoint(encoder=self.encoder, decoder=self.decoder)
            ckpt_manager = tf.train.CheckpointManager(
                ckpt, self.checkpoint_path, max_to_keep=5)

            # chosen_checkpoint = ckpt_manager.checkpoints[2]
            chosen_checkpoint = ckpt_manager.latest_checkpoint
            ckpt.restore(chosen_checkpoint)

    @timer
    def evaluate_data(self, validation_dataset, val_steps, encoder=None, decoder=None):
        if self.checkpoint_path is None:
            assert encoder is not None
            assert decoder is not None
            self.encoder = encoder
            self.decoder = decoder

        print('Begin evaluation')
        avg_bleu = np.array([0, 0, 0, 0], dtype=float)
        avg_rouge = 0.0

        for batch_idx, (img_tensor, target) in enumerate(validation_dataset):
            score = self._evaluate_batch(img_tensor, target)
            avg_bleu += np.array(score['BLEU'], dtype=float) / float(val_steps)
            avg_rouge += score['ROUGE'] / float(val_steps)

        avg_bleu = avg_bleu.round(2)
        avg_rouge = avg_rouge.round(2)
        avg_scores = {'BLEU': avg_bleu, 'ROUGE': avg_rouge}
        print('The average BLEU:', avg_scores)

        return avg_scores

    def _evaluate_batch(self, img_tensor, target):
        self.loss, self.total_loss, predicted_ids = self._forward_pass(img_tensor, target)
        self.loss = self.loss / (int(target.shape[1]))

        predicted_ids = np.array(predicted_ids).reshape(-1)  # TODO: remove 46 hardcoding

        cleaned_target = self._tokens_to_captions(target, self.special_tokens)
        cleaned_predicted_tokens = self._tokens_to_captions(predicted_ids, self.special_tokens)

        ground_truth_captions = {f'{k}': v for (k, v) in enumerate(cleaned_target)}
        predicted_captions = {f'{k}': v for (k, v) in enumerate(cleaned_predicted_tokens)}

        score, scores = compute_scores(ground_truth_captions, predicted_captions)
        score = self._clean_coco_scores_output(score)

        # for gt, pred in zip(ground_truth_captions.values(), predicted_captions.values()):
        #     score = self.bleu_score(gt, pred, verbose=False)

        return score

    @tf.function
    def _forward_pass(self, img_tensor, target):
        """Forward pass wrapped as a tf.function so it runs as a TensorFlow graph.

        Args:
            img_tensor: the output of the CNN feature extractor.
            target: caption vectors of dim (units, max_length), where units is the number
                of GRUs and max_length is the size of the caption with the most tokens.
        """
        loss = 0

        hidden = self.decoder.reset_state(batch_size=target.shape[0])
        dec_input = tf.expand_dims([self.tokenizer.word_index['<start>']] * target.shape[0], 1)

        features = self.encoder(img_tensor)

        result_ids = []
        for i in range(1, target.shape[1]):
            predictions, hidden, _ = self.decoder(dec_input, features, hidden)
            predicted_ids = tf.math.argmax(predictions, axis=1)
            result_ids.append(predicted_ids)

            loss += loss_function(target[:, i], predictions, self.loss_object)

            # at evaluation time the predicted ids are fed back in
            # (not the ith target word, i.e. no teacher forcing here)
            dec_input = tf.expand_dims(predicted_ids, 1)

        total_loss = (loss / int(target.shape[1]))

        return loss, total_loss, result_ids

    def _tokens_to_captions(self, tokens_batch, tokens_to_remove):
        predicted_captions = self.tokenizer.sequences_to_texts(np.array(tokens_batch))

        cleaned_captions_batch = []
        for caption in predicted_captions:
            # 47 is the max sequence length, remember ... (6000 dataset)
            clean_caption = caption.split(' ')[:47]
            if '<end>' in clean_caption:
                clean_caption = [item for i, item in enumerate(clean_caption)
                                 if '<end>' in clean_caption[i:]]
            clean_caption = [item for item in clean_caption if item not in tokens_to_remove]
            if clean_caption == []:
                clean_caption = [' ']
            clean_caption_str = ' '.join(clean_caption)
            cleaned_captions_batch.append([clean_caption_str])

        return cleaned_captions_batch

    def _clean_coco_scores_output(self, scores_dict):
        score_names = ['BLEU', 'ROUGE']
        cleaned_scores_dict = {}
        for i, (key, val) in enumerate(scores_dict.items()):
            cleaned_scores_dict[score_names[i]] = val

        return cleaned_scores_dict

    def bleu_score(self, predicted, actual, verbose=False):
        b1 = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
        b2 = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))
        b3 = corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0))
        b4 = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))

        if verbose:
            print('BLEU-1: %f' % b1)
            print('BLEU-2: %f' % b2)
            print('BLEU-3: %f' % b3)
            print('BLEU-4: %f' % b4)

        return np.array([round(b1, 5), round(b2, 5), round(b3, 5), round(b4, 5)])
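A hedged sketch of how EvaluationHandler might be driven, assuming a prepared val_dataset of (img_tensor, caption) batches; the checkpoint path and num_val_examples are placeholders, and the loss object simply mirrors the one used elsewhere in the training code.

import tensorflow as tf

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

evaluator = EvaluationHandler(loss_object, tokenizer,
                              checkpoint_path='./checkpoints/train')  # hypothetical path
avg_scores = evaluator.evaluate_data(
    val_dataset, val_steps=num_val_examples // CONFIG.EVAL_BATCH_SIZE)  # assumed count
print(avg_scores['BLEU'], avg_scores['ROUGE'])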
def main(_):
    # Load an Inception v3 model pre-trained on the ImageNet dataset and, using tf.keras.Model,
    # declare image_features_extract_model, whose output is the hidden layer one step before
    # the softmax layer, i.e. the 8x8x2048 feature map.
    image_model = tf.keras.applications.InceptionV3(include_top=False, weights='imagenet')
    new_input = image_model.input
    hidden_layer = image_model.layers[-1].output
    image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

    # If the do_caching flag is True, cache the bottlenecks for the images.
    if FLAGS.do_caching == True:
        cache_bottlenecks(img_name_vector, image_features_extract_model)
    else:
        print('Already bottleneck cached !')

    # Build the vocabulary from the 5,000 most frequent words and map every
    # out-of-vocabulary word to <unk>.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=top_k,
        oov_token="<unk>",
        filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
    tokenizer.fit_on_texts(train_captions)

    # Sentences shorter than the longest one get the remaining positions filled with <pad>.
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'

    # Split the caption sentences on whitespace and tokenize them.
    train_seqs = tokenizer.texts_to_sequences(train_captions)

    # Pad the shorter sentences.
    cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

    # Store the length of the longest sentence for the attention weights.
    max_length = calc_max_length(train_seqs)

    # Split the data: 80% for training, 20% for validation.
    img_name_train, img_name_val, cap_train, cap_val = train_test_split(
        img_name_vector, cap_vector, test_size=0.2, random_state=0)
    print('train image size:', len(img_name_train), 'train caption size:', len(cap_train))
    print('validation image size:', len(img_name_val), 'validation caption size:', len(cap_val))

    num_steps = len(img_name_train) // BATCH_SIZE

    # Read the numpy files cached on disk.
    def map_func(img_name, cap):
        img_tensor = np.load(img_name.decode('utf-8') + '.npy')
        return img_tensor, cap

    dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))

    # Load the numpy files in parallel.
    dataset = dataset.map(lambda item1, item2: tf.numpy_function(
        map_func, [item1, item2], [tf.float32, tf.int32]),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Use the tf.data API to shuffle the data and group it into batches (batch size = 64).
    dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    # Declare the encoder and decoder.
    encoder = CNN_Encoder(embedding_dim)
    decoder = RNN_Decoder(embedding_dim, units, vocab_size)

    # Path where the checkpoint data will be saved.
    checkpoint_path = "./checkpoints/train"
    ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder, optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

    start_epoch = 0
    if ckpt_manager.latest_checkpoint:
        start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
        # Restore the most recent checkpoint in checkpoint_path.
        ckpt.restore(ckpt_manager.latest_checkpoint)

    loss_plot = []

    # Run the optimization for the specified number of epochs.
    for epoch in range(start_epoch + 1, EPOCHS + 1):
        start = time.time()
        total_loss = 0

        for (batch, (img_tensor, target)) in enumerate(dataset):
            batch_loss, t_loss = train_step(img_tensor, target, tokenizer, encoder, decoder)
            total_loss += t_loss

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch, batch, batch_loss.numpy() / int(target.shape[1])))

        # Store the loss for each epoch so it can be plotted later.
        loss_plot.append(total_loss / num_steps)

        # Save the parameters every 5 epochs.
        if epoch % 5 == 0:
            ckpt_manager.save(checkpoint_number=epoch)

        print('Epoch {} Loss {:.6f}'.format(epoch, total_loss / num_steps))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

    print('Training Finished !')

    plt.plot(loss_plot)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss Plot')
    plt.savefig('Loss plot.png')
    plt.show()

    # Pick one random image from the validation set and run captioning on it.
    rid = np.random.randint(0, len(img_name_val))
    image = img_name_val[rid]
    real_caption = ' '.join(
        [tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])
    result, attention_plot = evaluate(image, max_length, attention_features_shape,
                                      encoder, decoder, image_features_extract_model,
                                      tokenizer)

    print('Real Caption:', real_caption)
    print('Prediction Caption:', ' '.join(result))
    plot_attention(image, result, attention_plot)

    # For testing, download a single surfer image and run captioning on it.
    image_url = 'https://tensorflow.org/images/surf.jpg'
    image_extension = image_url[-4:]
    image_path = tf.keras.utils.get_file('image' + image_extension, origin=image_url)

    result, attention_plot = evaluate(image_path, max_length, attention_features_shape,
                                      encoder, decoder, image_features_extract_model,
                                      tokenizer)
    print('Prediction Caption:', ' '.join(result))
    plot_attention(image_path, result, attention_plot)
def train(hparams, models_path='./'):
    """
    Returns:
        results: dict
            dictionary containing the model identifier, elapsed_time per epoch and the
            learning curve with loss and metrics
        models: tuple of keras Models
            the trained encoder and decoder networks
    """
    model_id = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    encoder = CNN_Encoder(**hparams['encoder'])
    decoder = RNN_Decoder(**hparams['decoder'], vocab_size=vocab_size)
    optimizer = make_optimizer(**hparams['optimizer'])
    lambda_reg = hparams['train']['lambda_reg']

    # ckpt = tf.train.Checkpoint(encoder=encoder,
    #                            decoder=decoder,
    #                            optimizer=optimizer)
    # ckpt_manager = tf.train.CheckpointManager(ckpt, CHECKPOINT_PATH, max_to_keep=5)

    start_epoch = 0
    # if ckpt_manager.latest_checkpoint:
    #     start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
    #     # restoring the latest checkpoint in checkpoint_path
    #     ckpt.restore(ckpt_manager.latest_checkpoint)

    @tf.function
    def train_step(img_tensor, target):
        loss = 0
        losses = {}
        batch_size, caption_length = target.shape

        # initializing the hidden state for each batch
        # because the captions are not related from image to image
        hidden = decoder.reset_state(batch_size=batch_size)

        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * batch_size, 1)

        # attention_plot = tf.Variable(tf.zeros((batch_size,
        #                                        caption_length,
        #                                        attention_features_shape)))

        with tf.GradientTape() as tape:
            features = encoder(img_tensor, training=True)
            attention_sum = tf.zeros((batch_size, attention_features_shape, 1))

            for i in range(1, caption_length):
                # passing the features through the decoder
                predictions, hidden, attention_weights = decoder(
                    (dec_input, features, hidden), training=True)
                attention_sum += attention_weights

                loss += loss_function(target[:, i], predictions)

                # using teacher forcing
                dec_input = tf.expand_dims(target[:, i], 1)

            losses['cross_entropy'] = loss / caption_length

            # attention regularization loss
            loss_attn_reg = lambda_reg * tf.reduce_sum((1 - attention_sum) ** 2)
            losses['attention_reg'] = loss_attn_reg / caption_length
            loss += loss_attn_reg

            # weight decay losses
            loss_weight_decay = tf.add_n(encoder.losses) + tf.add_n(decoder.losses)
            losses['weight_decay'] = loss_weight_decay / caption_length
            loss += loss_weight_decay

        losses['total'] = loss / caption_length

        trainable_variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(gradients, trainable_variables))

        return loss, losses

    num_steps = num_examples // BATCH_SIZE

    loss_plots = {'cross_entropy': [], 'attention_reg': [], 'weight_decay': [], 'total': []}
    metrics = {'cross-entropy': [], 'bleu-1': [], 'bleu-2': [], 'bleu-3': [],
               'bleu-4': [], 'meteor': []}
    epoch_times = []
    val_epoch_times = []
    start = time.time()
    logging.info('Training start for model ' + model_id)
    logging.info('hparams: ' + str(hparams))

    for epoch in range(start_epoch, EPOCHS):
        epoch_start = time.time()
        total_loss = {'cross_entropy': 0, 'attention_reg': 0, 'weight_decay': 0, 'total': 0}

        for (batch, (img_tensor, target)) in enumerate(dataset_train):
            batch_loss, t_loss = train_step(img_tensor, target)
            for key in total_loss.keys():
                total_loss[key] += float(t_loss[key])

            if batch % 100 == 0:
                logging.info('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))

        # storing the epoch end loss value to plot later
        for key in loss_plots.keys():
            loss_plots[key].append(total_loss[key] / num_steps)

        # Evaluate on validation
        val_epoch_start = time.time()
        epoch_scores = validation_scores(dataset_val, (encoder, decoder), tokenizer)
        val_epoch_stop = time.time() - val_epoch_start
        val_epoch_times.append(val_epoch_stop)
        for name, score in epoch_scores.items():
            metrics[name].append(score)

        epoch_stop = time.time() - epoch_start
        epoch_times.append(epoch_stop)

        # if epoch % 1 == 0:
        #     ckpt_manager.save()

        logging.info('Epoch {} Loss {:.6f}'.format(epoch + 1, total_loss['total'] / num_steps))
        logging.info('Time taken for 1 epoch {} sec\n'.format(epoch_stop))

    total_time = time.time() - start
    logging.info('Total training time: {}'.format(total_time))

    results = {
        'id': model_id,
        'losses': loss_plots,
        'epoch_times': epoch_times,
        'total_time': total_time,
        'encoder_params': encoder.count_params(),
        'decoder_params': decoder.count_params(),
        'instances_train': num_examples,
        'instances_valid': num_examples_val,
        'batch_size': BATCH_SIZE,
        'epochs': EPOCHS,
        'vocabulary': vocab_size,
        'valid_batch_size': VALID_BATCH_SIZE,
        'valid_epoch_times': val_epoch_times,
        'metrics_val': metrics}

    encoder.save_weights(str(models_path) + ('encoder_' + model_id + '.h5'))
    decoder.save_weights(str(models_path) + ('decoder_' + model_id + '.h5'))
    models = (encoder, decoder)

    return results, models
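A hedged sketch of an hparams dictionary and a call to train(). Only the four top-level keys and 'train'/'lambda_reg' are taken from the code above; the inner constructor arguments of CNN_Encoder, RNN_Decoder and make_optimizer are not shown in this snippet, so the values below are placeholders.

# Assumed hyperparameter layout; inner keys are illustrative only.
hparams = {
    'encoder': {'embedding_dim': 256},
    'decoder': {'embedding_dim': 256, 'units': 512},
    'optimizer': {'learning_rate': 1e-3},
    'train': {'lambda_reg': 1.0},
}

results, (encoder, decoder) = train(hparams, models_path='./models/')
print(results['id'], results['losses']['total'][-1])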
def main():
    # The sample size, taken from the total of 414,803 captions, is defined here
    total_size = 400000
    img_name_vector, train_captions = preprocess(total_size)

    # The CNN encoder model is initialised here
    image_features_extract_model = image_features_model()

    # This function takes all the images at a batch size of 16, feeds them to the CNN model
    # and stores the activation output from the convolution layer directly.
    # Run it only once to create the cache; afterwards, keep this line commented out.
    # enc_len = batch_feature_processing(img_name_vector)
    # print("Encoded length", enc_len, "len", len(img_name_vector))

    # This function takes in the captions, preprocesses them and returns a sequence of
    # numbers representing the sequence of words belonging to a vocabulary
    cap_vector, tokenizer, max_length = proc_caption(train_captions)

    # Create training and validation sets using an 80-20 split
    img_name_train, img_name_val, cap_train, cap_val = train_test_split(
        img_name_vector, cap_vector, test_size=0.2, random_state=0)
    print(len(img_name_train), len(cap_train), len(img_name_val), len(cap_val), "\n")

    # Training parameters according to the system's configuration
    top_k = 10000
    BATCH_SIZE = 64
    BUFFER_SIZE = 1000
    embedding_dim = 256
    units = 512
    # vocabulary of words
    vocab_size = top_k + 1
    num_steps = len(img_name_train) // BATCH_SIZE
    # Shape of the vector extracted from InceptionV3 is (64, 2048)
    # These two variables represent that vector shape
    features_shape = 2048
    attention_features_shape = 64
    loss_plot = []

    # Tensorflow data pipeline, similar to the documented example:
    # taking tensor slices from both the image name list and the corresponding captions
    dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))

    # Use map to load the numpy files in parallel
    dataset = dataset.map(lambda item1, item2: tf.numpy_function(
        map_func, [item1, item2], [tf.float32, tf.int32]),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Shuffle and batch
    dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    # MODEL STARTS HERE
    encoder = CNN_Encoder(embedding_dim)
    decoder = RNN_Decoder(embedding_dim, units, vocab_size)

    # OPTIMIZER AND LOSS
    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    # Our own loss function, following the paper
    def loss_function(real, pred):
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = loss_object(real, pred)
        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask
        return tf.reduce_mean(loss_)

    # CHECKPOINTS
    checkpoint_path = "./checkpoints/train400000"
    ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder, optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=10)

    start_epoch = 0
    if ckpt_manager.latest_checkpoint:
        start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
        # restoring the latest checkpoint in checkpoint_path
        print("LATEST CHECKPOINT:", ckpt_manager.latest_checkpoint)
        ckpt.restore(ckpt_manager.latest_checkpoint)

    @tf.function  # this declaration is important; without it you will see errors
    def train_step(img_tensor, target):
        loss = 0

        # initializing the hidden state for each batch
        # because the captions are not related from image to image
        hidden = decoder.reset_state(batch_size=target.shape[0])
        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * target.shape[0], 1)

        # The newer tensorflow uses GradientTape to perform back propagation
        with tf.GradientTape() as tape:
            # final image features are generated
            features = encoder(img_tensor)

            # target is the sequence of caption word numbers; iterate through its length
            for i in range(1, target.shape[1]):
                # passing the features through the decoder;
                # no need for the attention weights here
                predictions, hidden, _ = decoder(dec_input, features, hidden)

                # accumulating the loss from every time step
                loss += loss_function(target[:, i], predictions)

                # using teacher forcing
                dec_input = tf.expand_dims(target[:, i], 1)

        total_loss = (loss / int(target.shape[1]))

        # taking all the trainable parameters
        trainable_variables = encoder.trainable_variables + decoder.trainable_variables

        # The gradients are calculated in this step and then applied
        gradients = tape.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(gradients, trainable_variables))

        return loss, total_loss

    # The total number of epochs
    EPOCHS = 30

    for epoch in range(start_epoch, EPOCHS):
        # Each epoch is timed
        start = time.time()
        total_loss = 0

        # using the prebuilt tensorflow data pipeline and iterating over each batch
        for (batch, (img_tensor, target)) in enumerate(dataset):
            # calculating each batch loss
            batch_loss, t_loss = train_step(img_tensor, target)
            total_loss += t_loss

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))

        # storing the epoch end loss value to plot later
        loss_plot.append(total_loss / num_steps)

        # saving the model checkpoints
        if epoch % 5 == 0:
            ckpt_manager.save()

        print('Epoch {} Loss {:.6f}'.format(epoch + 1, total_loss / num_steps))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

    plt.plot(loss_plot)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss Plot')
    plt.savefig('training100000.jpg')
    plt.show()
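A tiny numeric sketch of the padding mask inside loss_function above: positions where the target id is 0 (the <pad> token) contribute nothing to the loss. The values below are made up purely for illustration.

import tensorflow as tf

real = tf.constant([5, 0], dtype=tf.int64)       # second position is <pad>
logits = tf.random.uniform((2, 10001))           # (batch, vocab_size), arbitrary logits
per_token = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')(real, logits)
mask = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)), per_token.dtype)
print(per_token * mask)                          # the padded entry is exactly 0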
def main():
    # Training parameters
    total_size = 100000
    top_k = 5000
    BATCH_SIZE = 64
    BUFFER_SIZE = 1000
    embedding_dim = 256
    units = 512
    vocab_size = top_k + 1
    features_shape = 512
    attention_features_shape = 49

    # loading the training caption sequences and image name vectors
    train_captions, img_name_vector = np.load('traincaption_imgname.npy')

    # taking a subset of these and converting to lists
    img_name_vector = img_name_vector[:total_size].tolist()
    train_captions = train_captions[:total_size].tolist()

    # Enc_len = batch_feature_processing(img_name_vector)

    # creating an instance of the CNN model
    image_features_extract_model = image_features_model()

    max_length = 51
    # processing the captions
    cap_vector, tokenizer, max_length = proc_caption(train_captions)

    # Create training and validation sets using an 80-20 split
    img_name_train, img_name_val, cap_train, cap_val = train_test_split(
        img_name_vector, cap_vector, test_size=0.2, random_state=0)

    # MODEL STARTS HERE
    encoder = CNN_Encoder(embedding_dim)
    decoder = RNN_Decoder(embedding_dim, units, vocab_size)

    # OPTIMIZER AND LOSS
    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    # CHECKPOINTS
    checkpoint_path = r"\checkpoints\train100000"  # raw string so the backslashes are not treated as escapes
    ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder, optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

    start_epoch = 0
    if ckpt_manager.latest_checkpoint:
        start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
        # restoring the latest checkpoint in checkpoint_path
        ckpt.restore(ckpt_manager.latest_checkpoint)

    # EVALUATION
    def evaluate(image):
        attention_plot = np.zeros((max_length, attention_features_shape))

        # resetting the hidden state of the decoder
        hidden = decoder.reset_state(batch_size=1)

        temp_input = tf.expand_dims(load_image(image)[0], 0)

        # extract the image features
        img_tensor_val = image_features_extract_model(temp_input)
        img_tensor_val = tf.reshape(
            img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))

        # passing the image features through an FC and ReLU layer
        features = encoder(img_tensor_val)

        dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
        result = []

        # running the loop for max_length steps
        for i in range(max_length):
            # The decoder takes the image features, hidden state and initial input
            predictions, hidden, attention_weights = decoder(
                dec_input, features, hidden)

            # The attention weights are stored at every time step to show how attention changes
            attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()

            # The predicted ids are important as they are the keys whose values are words;
            # basically, generate a number which corresponds to a word
            predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
            print("predicted id:", predicted_id)

            # adding all the words together to make the caption
            result.append(tokenizer.index_word[predicted_id])

            # The loop ends when the model generates the end token
            if tokenizer.index_word[predicted_id] == '<end>':
                return result, attention_plot

            # reinitialising the decoder input
            dec_input = tf.expand_dims([predicted_id], 0)

        attention_plot = attention_plot[:len(result), :]
        return result, attention_plot

    # captions on the validation set
    # ACTUAL EVALUATION
    print("len of image name val", len(img_name_val))

    # Taking a random index into the test dataset
    rid = np.random.randint(0, len(img_name_val))

    # taking the corresponding image from the validation set
    image = img_name_val[rid]
    imageid = image[-10:-4]
    imageid = int(imageid)

    ref = ref_create(imageid)
    references = []
    for i in ref:
        l = i.split()
        references.append(l)

    real_caption = ' '.join(
        [tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])

    # testing from some random image
    # image = '/home/cis/Documents/Vijay/guy.jpeg'

    # BLEU score evaluation using the NLTK library
    print("Reference Sentences:", references, "\n\n")
    print("Current real caption", real_caption, "\n\n")

    result, attention_plot = evaluate(image)
    print('Prediction Caption:', ' '.join(result), "\n\n")

    print('Cumulative 1-gram BLEU-1: %f' %
          sentence_bleu(references, result, weights=(1, 0, 0, 0)))
    print('Cumulative 2-gram BLEU-2: %f' %
          sentence_bleu(references, result, weights=(0.5, 0.5, 0, 0)))
    print('Cumulative 3-gram BLEU-3: %f' %
          sentence_bleu(references, result, weights=(0.33, 0.33, 0.33, 0)))
    print('Cumulative 4-gram BLEU-4: %f' %
          sentence_bleu(references, result, weights=(0.25, 0.25, 0.25, 0.25)),
          "\n\n")

    # Plotting the attention weights along with the words
    plot_attention(image, result, attention_plot)