class ImageCaptionModel(object):
    """Bundle a trained captioning network with its vocabulary maps.

    Loads the encoder/decoder vocabulary pickles, builds the caption
    model with a GloVe-based embedding matrix, restores the trained
    weights, and exposes a single `predict(img_path)` entry point.
    """

    def __init__(self, w_path,
                 dec_path='dataset/text/dec_map.pkl',
                 enc_path='dataset/text/enc_map.pkl',
                 embedding_path='pre_trained/glove.6B.100d.txt'):
        """Load vocab maps, build the model, and restore weights.

        w_path: path to the trained weight file (.h5).
        dec_path: pickled index -> word map.
        enc_path: pickled word -> index map.
        embedding_path: GloVe embedding text file (100-d vectors).
        """
        # Use context managers so the pickle file handles are closed
        # deterministically (the original `cPickle.load(open(...))`
        # leaked them until garbage collection).
        with open(dec_path, 'rb') as f:
            dec_map = cPickle.load(f)
        with open(enc_path, 'rb') as f:
            enc_map = cPickle.load(f)

        embedding_matrix = generate_embedding_matrix(embedding_path, dec_map)
        self.model = image_caption_model(embedding_matrix=embedding_matrix)
        self.extractor = ImageFeatureExtractor('weights/tensorflow_inception_graph.pb')
        self.model.load_weights(w_path)
        self.dec_map = dec_map
        self.enc_map = enc_map

    def predict(self, img_path):
        """Return a caption for the image file at `img_path`.

        Extracts Inception features from the image and decodes with a
        k=6 beam search, capped at 15 tokens.
        """
        img_feature = self.extractor.extract_features(img_path, flag_from_file=True)
        sentence = generate_k_best(self.model, self.enc_map, self.dec_map,
                                   img_feature, k=6, max_len=15)
        return sentence
if __name__ == '__main__':
    # Greedy (argmax) decoding demo: feed the image feature plus the
    # previously emitted token back into the model one step at a time.
    max_sent_len = 28
    model_path = './weights/v1.0.0_11_0_1494239663.5093253_602.h5'

    # Fail with a usage message instead of a bare IndexError.
    if len(sys.argv) < 2:
        sys.exit('usage: %s <image_path>' % sys.argv[0])
    image_path = sys.argv[1]

    ife = ImageFeatureExtractor('model/inception_v3_2016_08_28_frozen.pb')

    with open('./train/word2idx.pkl', 'rb') as f:
        word2idx = pickle.load(f)
    with open('./train/idx2word.pkl', 'rb') as f:
        idx2word = pickle.load(f)

    # +1 reserves index 0 (padding / out-of-vocabulary).
    vocab_size = len(word2idx) + 1

    model = image_caption_model(vocab_size=vocab_size)
    model.load_weights(model_path)

    start_sign = word2idx['+']  # '+' is the sentence-start token
    img = np.array([ife.extract_features(image_path)])

    # cur: last emitted token id (batch of 1).
    # vhist: multi-hot history of every token emitted so far.
    # answer: collected words (the original declared this but never filled it).
    cur = np.array([[start_sign]])
    vhist = np.zeros((1, vocab_size), dtype=int)
    answer = []

    for idx in range(max_sent_len):
        # One-hot indicator of the current decoding position.
        seq = np.array([[1 if i == idx else 0 for i in range(max_sent_len)]])
        out = model.predict([img, cur, seq, vhist])[0]
        nxt = int(np.argmax(out))
        word = idx2word.get(nxt, '<?>')
        answer.append(word)
        print(word, 'score:', out[nxt])
        cur = np.array([[nxt]])
        # Mark the emitted token directly; equivalent to the original's
        # build-one-hot-then-logical_or round trip, without the two
        # throwaway arrays per step.
        vhist[0, nxt] = 1