class TrainInceptionV3GRU(Train):
    """Training pipeline for the InceptionV3-encoder / GRU-decoder caption model.

    Wires a Flickr8k data generator to the captioning model, optionally resumes
    from existing weights, trains, then saves timestamped model/weight/tokenizer
    artifacts and (optionally) computes a BLEU score on the result.

    NOTE(review): a second class with this same name appears later in the file
    and, being defined afterwards, shadows this one — confirm which definition
    is intended to be live.
    """

    def __init__(self, config=CONFS, existing_model_path='', train_generator=None, validation_generator=None, language="english", score_model=True):
        """Build generators, tokenizer, and model; optionally load prior weights.

        :param config: training configuration passed through to ``Train``.
        :param existing_model_path: if this path exists, weights are loaded
            from it so training resumes instead of starting fresh.
        :param train_generator: optional override for the training batch
            generator (defaults to the Flickr8k generator's).
        :param validation_generator: optional override for the validation
            batch generator.
        :param language: caption language; also selects the dataset variant
            and is embedded in all saved artifact names.
        :param score_model: when True, run BLEU scoring after training.
        """
        super().__init__(config=config)
        self.language = language
        self._generator = Flickr8kGenerator(language=language)
        # Caller-supplied generators take precedence; fall back to the dataset's own.
        self._train_generator = train_generator or self._generator.train_generator(
        )
        self._validation_generator = validation_generator or self._generator.validation_generator(
        )
        self._train_data_amount = len(self._generator.train_captions)
        self._validation_data_amount = len(self._generator.validation_captions)
        self._tokenizer = self._generator.cp.tokenizer
        self.model = ImageCaptionModeler().get_model(
            self._generator.cp.vocab_size, tokenizer=self._generator.cp.tokenizer)
        self._score_model = score_model
        # load previous model if exist
        if os.path.exists(existing_model_path):
            self.model.load_weights(existing_model_path)

    def run(self):
        """Train the model, save all artifacts, export for serving, and score.

        Side effects, in order: fits the model with per-epoch callbacks, saves
        a timestamped full model and weight file under ``models/``, saves the
        tokenizer pickle, exports the model for serving, and (if enabled)
        computes a BLEU score against the freshly saved weights.
        """
        # The callback name embeds the decoder layer's name so artifacts from
        # different architectures are distinguishable.
        callback = Callback('InceptionV3GRU_' + self.model.layers[-2].name,
                            language=self.language)
        self.model.fit_generator(generator=self._train_generator,
                                 steps_per_epoch=self._train_data_amount //
                                 self._batch_size,
                                 epochs=5,
                                 validation_data=self._validation_generator,
                                 validation_steps=64,
                                 callbacks=callback.callbacks)
        # Timestamp keeps successive runs from overwriting each other.
        # NOTE(review): str(datetime) contains spaces and colons, which are
        # not valid in Windows filenames — confirm this only runs on POSIX.
        cur_time = datetime.datetime.now()
        model_path = 'models/InceptionV3GRU_model_' + str(
            cur_time
        ) + '_' + self.language + "_" + self.model.layers[-2].name + '.h5'
        model_weight_path = 'models/InceptionV3GRU_weight_' + str(cur_time) + '_' + self.language + "_" + \
            self.model.layers[-2].name + '.h5'
        self.model.save(model_path)
        self.model.save_weights(model_weight_path)
        self.save_tokenizer('models/tokenizer_' + self.language + '.pickle')
        # Save model for serving
        self._serving.save_model(self.model)
        if self._score_model:
            bs = BleuScore()
            bs.get_model_score(weight_path=model_weight_path,
                               language=self.language)
class TrainInceptionV3GRU(Train):
    """English-only training pipeline for the InceptionV3/GRU caption model.

    FIXME(review): this class re-uses the name of the language-aware class
    defined earlier in this file; being defined later, it silently shadows
    that one. Rename one of the two (or delete the obsolete variant).
    """

    def __init__(self, config=CONFS, existing_model_path='', train_generator=None, validation_generator=None):
        """Build generators, tokenizer, and model; optionally resume weights.

        :param config: training configuration forwarded to ``Train``.
        :param existing_model_path: if this path exists, weights are loaded
            from it so training resumes instead of starting fresh.
        :param train_generator: optional override for the training batch
            generator (defaults to the Flickr8k generator's).
        :param validation_generator: optional override for the validation
            batch generator.
        """
        super().__init__(config=config)
        self._generator = Flickr8kGenerator()
        # Caller-supplied generators take precedence over the dataset's own.
        self._train_generator = train_generator or self._generator.train_generator(
        )
        self._validation_generator = validation_generator or self._generator.validation_generator(
        )
        self._train_data_amount = len(self._generator.train_captions)
        self._validation_data_amount = len(self._generator.validation_captions)
        self._tokenizer = self._generator.cp.tokenizer
        self.model = ImageCaptionModeler().get_model(
            self._generator.cp.vocab_size)
        # load previous model if exist
        if os.path.exists(existing_model_path):
            self.model.load_weights(existing_model_path)

    def run(self):
        """Train for 10 epochs, then save timestamped artifacts and export.

        Side effects, in order: fits the model, saves the full model and the
        weights under ``models/``, saves the tokenizer pickle, and exports
        the model for serving.
        """
        callback = Callback('InceptionV3GRU')
        self.model.fit_generator(generator=self._train_generator,
                                 steps_per_epoch=self._train_data_amount //
                                 self._batch_size,
                                 epochs=10,
                                 validation_data=self._validation_generator,
                                 validation_steps=16,
                                 callbacks=callback.callbacks)
        # BUG FIX: the timestamp used to be appended AFTER the '.h5'/'.pickle'
        # extension (e.g. 'InceptionV3GRU_model.h52023-...'), breaking the
        # file suffixes. Put it before the extension instead, matching the
        # naming convention of the language-aware trainer.
        timestamp = str(datetime.datetime.now())
        self.model.save('models/InceptionV3GRU_model_' + timestamp + '.h5')
        self.model.save_weights('models/InceptionV3GRU_weight_' + timestamp +
                                '.h5')
        self.save_tokenizer('models/tokenizer_' + timestamp + '.pickle')
        # Save model for serving
        self._serving.save_model(self.model)
class InceptionV3GRUPredict(Predict):
    """Caption inference for a trained InceptionV3-encoder / GRU-decoder model.

    Loads a pickled tokenizer and trained weights, then generates captions
    either greedily (``predict`` / ``predict_batch``) or via beam search.
    """

    def __init__(self, weight_path=None, tokenizer_path=None, language='english'):
        """Load tokenizer and model weights, and cache start/end token ids.

        :param weight_path: path to the trained weight file to load.
        :param tokenizer_path: optional explicit tokenizer pickle path;
            falls back to a language-specific default.
        :param language: selects the default tokenizer file when
            ``tokenizer_path`` is not given.
        """
        super().__init__()
        self.tokenizer = self._load_tokenizer(language=language,
                                              path=tokenizer_path)
        self.weight_path = weight_path
        self.model = None
        self.load_model()
        self.start_token = self.tokenizer.word_index['<start>']
        self.end_token = self.tokenizer.word_index['<end>']

    def load_model(self):
        """Build the caption model for this vocabulary and load its weights."""
        self.model = ImageCaptionModeler().get_model(len(
            self.tokenizer.word_index),
                                                     tokenizer=self.tokenizer)
        self.model.load_weights(self.weight_path)

    def predict_batch(self, images_path, max_words=30, save_prediction_to_file=True, save_mode='caption'):
        """Caption every image in ``images_path``; optionally save to CSV.

        :param images_path: iterable of image paths (or arrays) to caption.
        :param max_words: maximum caption length per image.
        :param save_prediction_to_file: write 'prediction.csv' when True.
        :param save_mode: 'caption' writes text captions, 'token' writes
            token-id lists; any other value skips the file entirely.
        :return: ``(captions, tokens)`` — parallel lists over the inputs.
        """
        captions, tokens = [], []
        for image_path in images_path:
            caption, token = self.predict(image_path, max_words=max_words)
            captions.append(caption)
            tokens.append(token)
        if save_prediction_to_file:
            # BUG FIX: the 'caption' branch previously wrote the token lists
            # too, making both modes identical; it now writes the captions.
            if save_mode == 'caption':
                rows = zip(images_path, captions)
            elif save_mode == 'token':
                rows = zip(images_path, tokens)
            else:
                rows = None
            if rows is not None:
                with open('prediction.csv', 'w') as f:
                    writer = csv.writer(f, delimiter='\t')
                    writer.writerows(rows)
        return captions, tokens

    def predict(self, image_path, max_words=40, show_image=False, save_result=False):
        """Greedily decode one caption for a single image.

        Feeds the decoder its own previous prediction one step at a time
        until '<end>' is produced or ``max_words`` is reached.

        :param image_path: image file path, or an already-converted numpy
            array (the AttributeError fallback below).
        :param max_words: hard cap on decoding steps.
        :param show_image: display the image with its caption via pyplot.
        :param save_result: save the captioned figure under ``test/``.
        :return: ``(caption_string, token_id_list)``.
        """
        try:
            image_arr = convert_image_to_numpy_array(image_path)
        except AttributeError:
            # Caller already passed a numpy array rather than a path.
            image_arr = image_path
        image_batch = np.expand_dims(image_arr, axis=0)
        shape = (1, max_words)
        # FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int is the documented replacement.
        decoder_input = np.zeros(shape=shape, dtype=int)
        token_int = self.start_token
        predicted_caption = ""
        # Reverse map: token id -> word string.
        word_dict = dict(map(reversed, self.tokenizer.word_index.items()))
        count_tokens = 0
        pred_word = ''
        pred_token = []
        while pred_word != '<end>' and count_tokens < max_words:
            decoder_input[0, count_tokens] = token_int
            x_data = {'input_1': image_batch, 'decoder_input': decoder_input}
            decoder_output = self.model.predict(x_data)
            # Only the distribution at the current timestep matters.
            token_one_hot = decoder_output[0, count_tokens, :]
            token_int = np.argmax(token_one_hot)
            pred_token.append(token_int)
            pred_word = word_dict[token_int]
            predicted_caption += pred_word + " "
            count_tokens += 1
        predicted_caption = predicted_caption.replace(" <end>", "")
        if show_image:
            plt.imshow(Image.open(image_path))
            plt.title(predicted_caption)
            plt.show()
        if save_result:
            plt.imshow(Image.open(image_path))
            plt.title(predicted_caption)
            image_name = image_path.split('//')[-1]
            plt.savefig('test/' + image_name)
        return predicted_caption, pred_token

    def _load_tokenizer(self, language, path=None):
        """Unpickle the tokenizer; an explicit ``path`` overrides defaults.

        :param language: 'indonesia' selects the Indonesian default file;
            anything else uses the English default.
        :param path: explicit tokenizer pickle path (takes precedence).
        :return: the unpickled tokenizer object.
        """
        tokenizer_path = path or 'models/tokenizer.pickle'
        if language == 'indonesia':
            tokenizer_path = path or 'models/tokenizer_indonesia.pickle'
        with open(tokenizer_path, 'rb') as handle:
            return pickle.load(handle)

    def beam_search_predictions(self, image_path, max_words=30, beam_index=3):
        """Decode a caption keeping the ``beam_index`` best partial sequences.

        :param image_path: image file path to caption.
        :param max_words: maximum sequence length.
        :param beam_index: beam width.
        :return: the best caption string, minus the start token and '<end>'.
        """
        image_arr = convert_image_to_numpy_array(image_path)
        start = [self.tokenizer.word_index["<start>"]]
        # Each entry is [token_id_sequence, cumulative_score].
        start_word = [[start, 0.0]]
        while len(start_word[0][0]) < max_words:
            temp = []
            for s in start_word:
                par_caps = sequence.pad_sequences([s[0]],
                                                  maxlen=max_words,
                                                  padding='post')
                preds = self.model.predict([np.array([image_arr]), par_caps])
                word_preds = np.argsort(preds[0])[-beam_index:]
                # Expand each surviving sequence with the top-n candidates.
                # NOTE(review): preds[0] is indexed as 2-D here but as
                # preds[0, 0, w] below — confirm the model's output rank.
                for w in word_preds:
                    next_cap, prob = s[0][:], s[1]
                    next_cap.append(w)
                    prob += preds[0, 0, w]
                    temp.append([next_cap, prob])
            # FIX: np.sort on a ragged [sequence, score] list is invalid on
            # modern NumPy; sort explicitly by cumulative score instead.
            start_word = sorted(temp, key=lambda item: item[1])
            # Keep only the best beam_index candidates.
            start_word = start_word[-beam_index:]
        start_word = start_word[-1][0]
        # FIX: word_index maps word -> id; decoding ids requires the
        # reversed map (the old code raised KeyError on integer ids).
        index_to_word = dict(map(reversed, self.tokenizer.word_index.items()))
        intermediate_caption = [index_to_word[i] for i in start_word]
        final_caption = []
        for i in intermediate_caption:
            if i != '<end>':
                final_caption.append(i)
            else:
                break
        final_caption = ' '.join(final_caption[1:])
        return final_caption

    def beam_search_decoder(self, image_path, k):
        """Experimental beam-search decoder (appears incomplete).

        FIXME(review): this method looks broken and should be verified or
        removed: ``image_arr`` is computed but never used, ``row`` is a
        [sequence, score] pair yet is indexed as a probability row, and the
        whole ``sequences`` list (not a token) is appended to ``tokens``
        each iteration. Preserved byte-for-byte pending clarification.
        """
        image_arr = convert_image_to_numpy_array(image_path)
        tokens = [self.tokenizer.word_index['<start>']]
        sequences = [[list(), 1.0]]
        # walk over each step in sequence
        while len(tokens) < 30:
            for row in sequences:
                all_candidates = list()
                # expand each current candidate
                for i in range(len(sequences)):
                    seq, score = sequences[i]
                    for j in range(len(row)):
                        candidate = [seq + [j], score * -log(row[j])]
                        all_candidates.append(candidate)
                # order all candidates by score
                ordered = sorted(all_candidates, key=lambda tup: tup[1])
                # select k best
                sequences = ordered[:k]
            tokens.append(sequences)
        return tokens