class SubtitleIndex(BasicIndex):
    '''
    For sentence index.
    '''

    def __init__(self, dimension, dir_path):
        super(SubtitleIndex, self).__init__(dimension, dir_path)
        self.sentence_feature = None
        self.sentence_model = TF_Sentence(
            osp.join(
                THIS_DIR,
                '../../weights/sentence/96e8f1d3d4d90ce86b2db128249eb8143a91db73'
            ))

    def index(self, tv_name=None):
        self.database = []
        if tv_name is not None:
            # Wrap the single path in a list so the loop below works for one video as well.
            tv_file = ['{}/{}_index.pkl'.format(self.dir_path, tv_name)]
        else:
            tv_file = glob.glob('{}/*index.pkl'.format(self.dir_path))
        for i in tv_file:
            with open(i, 'rb') as f:
                data = pickle.load(f)
            self.database.extend(data)
        print('Finish loading {} samples'.format(len(self.database)))

        self.sentence_feature = list()
        if osp.isfile('{}/sentence.pkl'.format(self.dir_path)):
            print('The sentences have already been encoded')
            with open(self.dir_path + '/sentence.pkl', 'rb') as f:
                self.sentence_feature = pickle.load(f)
        else:
            print('Start to encode sentences')
            for i in self.database:
                sentence_feature = self.sentence_model.encode(i['SUBTITLE'])
                self.sentence_feature.append(sentence_feature)
            self.sentence_feature = np.array(self.sentence_feature)
            with open(self.dir_path + '/sentence.pkl', 'wb') as f:
                pickle.dump(self.sentence_feature, f)

        # faiss raises "TypeError: ndarrays must be of numpy.float32, and not float64."
        # for float64 input, so cast before adding.
        self.gpu_index_flat.add(self.sentence_feature.astype(np.float32))
        print('Finish loading {} sentence features'.format(
            self.gpu_index_flat.ntotal))
# Query by image feature.
temp = cv2.imread('test_sofa2.jpg')
q_tensor = Image.fromarray(cv2.cvtColor(temp, cv2.COLOR_BGR2RGB))
q_vec = scene_model.extract_vec(q_tensor, True)
print(q_vec.shape)

gpu_machine = BasicIndex.init_size(VIDEO_DATA_PATH)
gpu_machine.index()

tic = time.time()
results = gpu_machine.search(q_vec, 10)
toc = time.time()
print("It takes " + str((toc - tic) * 1000) + " ms")

# Query by subtitle (sentence) feature.
sentence_model = TF_Sentence(
    '../weights/sentence/96e8f1d3d4d90ce86b2db128249eb8143a91db73')
product_description_vector = sentence_model.encode(
    'Nike sports shirts is very good.')

sentence_machine = SubtitleIndex(512, VIDEO_DATA_PATH)
sentence_machine.index()

tic = time.time()
results = sentence_machine.search(
    np.array(product_description_vector).astype(np.float32), 30)
toc = time.time()
print("It takes " + str((toc - tic) * 1000) + " ms")
print(results)

# Show results.
for i in results:
    frame = cv2.imread(i['IMAGE'])
    cv2.imshow('show_img', frame)
    cv2.waitKey(0)
class BasicSearch(object):
    '''
    This is the product-content match module.
    '''

    def __init__(self, database):
        '''
        :param database: the directory that contains the .pkl index files
        '''
        try:
            self.database = database
            self.image_machine = BasicIndex.init_size(self.database)
            self.sentence_machine = SubtitleIndex(512, self.database)
            self.image_machine.index()
            print('Finish loading image GPU indexer')
            self.sentence_machine.index()
            print('Finish loading sentence GPU indexer')
        except ValueError:
            print("Please specify the pkl directory!")

        # TODO load scene/imagenet/faster-rcnn model to extract query image feature.
        self.scene_model = scene_visual(
            'resnet50',
            os.path.join(THIS_DIR, '../../weights/places365/{}.pth'),
            os.path.join(THIS_DIR, '../../weights/places365/categories.txt'),
            'cuda:0')
        self.sentence_model = TF_Sentence(
            os.path.join(
                THIS_DIR,
                '../../weights/sentence/96e8f1d3d4d90ce86b2db128249eb8143a91db73'
            ))

    def search(self,
               image_query,
               subtitle_query=None,
               audio_query=None,
               face_query=None,
               topK=30):
        '''
        :param image_query: path of the query image
        :param subtitle_query: query sentence describing the product
        :param audio_query: reserved, not used yet
        :param face_query: reserved, not used yet
        :param topK: number of results to return
        :return: a list of matched samples
        '''
        # Search the subtitle index first, then re-rank by image feature.
        if (image_query is not None) and (subtitle_query is not None):
            product_description_vector = self.sentence_model.encode(
                subtitle_query)
            tic = time.time()
            _, idx, _ = self.sentence_machine.search(
                np.array(product_description_vector).astype(np.float32), topK)
            toc = time.time()
            print("Searching subtitle takes " + str((toc - tic) * 1000) + " ms")

            temp = cv2.imread(image_query)
            q_tensor = Image.fromarray(cv2.cvtColor(temp, cv2.COLOR_BGR2RGB))
            q_vec = self.scene_model.extract_vec(q_tensor, True)
            print(q_vec.shape)

            tic = time.time()
            self.image_machine.re_index(idx)
            results, _, _ = self.image_machine.search(q_vec, topK, 2)
            toc = time.time()
            print("Searching image takes " + str((toc - tic) * 1000) + " ms")
        elif subtitle_query is not None:
            product_description_vector = self.sentence_model.encode(
                subtitle_query)
            tic = time.time()
            results, _, _ = self.sentence_machine.search(
                np.array(product_description_vector).astype(np.float32), topK)
            toc = time.time()
            print("Searching subtitle takes " + str((toc - tic) * 1000) + " ms")
        elif image_query is not None:
            temp = cv2.imread(image_query)
            q_tensor = Image.fromarray(cv2.cvtColor(temp, cv2.COLOR_BGR2RGB))
            q_vec = self.scene_model.extract_vec(q_tensor, True)
            print(q_vec.shape)

            tic = time.time()
            results, _, _ = self.image_machine.search(q_vec, topK)
            toc = time.time()
            print("Searching image takes " + str((toc - tic) * 1000) + " ms")
        else:
            results = []

        return results
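
# A minimal usage sketch (not part of the original module). PKL_DIR and
# 'query.jpg' are placeholder names; the result keys follow the sample dicts
# packed by VideoPostDB below.
PKL_DIR = '/path/to/multi_features'
searcher = BasicSearch(PKL_DIR)
# The subtitle query narrows the candidates first; the image feature then re-ranks them.
hits = searcher.search(image_query='query.jpg',
                       subtitle_query='Nike produces a new sports shirt.',
                       topK=30)
for h in hits:
    print(h['TV_NAME'], h['START_TIME'], h['END_TIME'])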
class DatabasePklSearch(BasicSearch):
    '''
    This is for pkl search and then connecting to the database to do more operations.
    '''

    def __init__(self, database):
        '''
        :param database: the path that contains many .pkl files
        '''
        try:
            self.database = database
            self.image_machine = BasicIndex(2048, self.database)
            print('Finish loading scene-image GPU indexer')
            self.sentence_machine = BasicIndex(512, self.database)
            print('Finish loading scene-sentence GPU indexer')
        except ValueError:
            print("Please specify the pkl directory!")

        # TODO load scene/imagenet/faster-rcnn model to extract query image feature.
        self.scene_model = scene_visual(
            'resnet50',
            os.path.join(THIS_DIR, '../../weights/places365/{}.pth'),
            os.path.join(THIS_DIR, '../../weights/places365/categories.txt'),
            'cuda:0')
        self.sentence_model = TF_Sentence(
            os.path.join(
                THIS_DIR,
                '../../weights/sentence/96e8f1d3d4d90ce86b2db128249eb8143a91db73'
            ))

    def search(self,
               image_query,
               subtitle_query=None,
               audio_query=None,
               face_query=None,
               topK=30,
               tv_name=None):
        '''
        :param image_query: path of the query image
        :param subtitle_query: query sentence describing the product
        :param audio_query: reserved, not used yet
        :param face_query: reserved, not used yet
        :param topK: number of results to return
        :param tv_name: restrict the search to one video's index if given
        :return: a list of matched samples
        '''
        # Build the indexes for this query (restricted to one video if tv_name is given).
        if tv_name is not None:
            self.image_machine.index(tv_name=tv_name)
            self.sentence_machine.index(feature='SUBTITLE_FEATURE',
                                        tv_name=tv_name)
        else:
            self.image_machine.index()
            self.sentence_machine.index(feature='SUBTITLE_FEATURE')

        # Search the subtitle index first, then re-rank by image feature.
        if (image_query is not None) and (subtitle_query is not None):
            product_description_vector = self.sentence_model.encode(
                subtitle_query)
            tic = time.time()
            _, idx, _ = self.sentence_machine.search(
                np.array(product_description_vector).astype(np.float32), topK)
            toc = time.time()
            print("Searching subtitle takes " + str((toc - tic) * 1000) + " ms")

            temp = cv2.imread(image_query)
            q_tensor = Image.fromarray(cv2.cvtColor(temp, cv2.COLOR_BGR2RGB))
            q_vec = self.scene_model.extract_vec(q_tensor, True)
            # print(q_vec.shape)

            tic = time.time()
            self.image_machine.re_index(idx)
            results, _, _ = self.image_machine.search(q_vec, topK, 2)
            toc = time.time()
            print("Searching image takes " + str((toc - tic) * 1000) + " ms")
        elif subtitle_query is not None:
            product_description_vector = self.sentence_model.encode(
                subtitle_query)
            tic = time.time()
            results, _, _ = self.sentence_machine.search(
                np.array(product_description_vector).astype(np.float32), topK)
            toc = time.time()
            print("Searching subtitle takes " + str((toc - tic) * 1000) + " ms")
        elif image_query is not None:
            temp = cv2.imread(image_query)
            q_tensor = Image.fromarray(cv2.cvtColor(temp, cv2.COLOR_BGR2RGB))
            q_vec = self.scene_model.extract_vec(q_tensor, True)
            # print(q_vec.shape)

            tic = time.time()
            results, _, _ = self.image_machine.search(q_vec, topK)
            toc = time.time()
            print("Searching image takes " + str((toc - tic) * 1000) + " ms")
        else:
            results = []

        # TODO better reset design.
        self.sentence_machine.reset_GPU()
        print("Reset sentence GPU indexer")
        self.image_machine.reset_GPU()
        print("Reset product image GPU indexer")

        return results

    def add_pkl(self):
        '''
        Every time we finish processing one video, we should add a pkl file.
        :return:
        '''
        pass
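
# A minimal usage sketch (not part of the original module). It assumes a
# friends_s01e01_index.pkl file exists under PKL_DIR; unlike BasicSearch, the
# GPU indexes are built per call and released again via reset_GPU().
PKL_DIR = '/path/to/multi_features'
searcher = DatabasePklSearch(PKL_DIR)
hits = searcher.search(image_query='query.jpg',
                       subtitle_query='I bought a new phone yesterday.',
                       topK=10,
                       tv_name='friends_s01e01')
print('{} matched shots'.format(len(hits)))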
if __name__ == '__main__':
    word = "Elephant"
    sentence = "I am a sentence for which I would like to get its embedding."
    paragraph = (
        "Universal Sentence Encoder embeddings also support short paragraphs. "
        "There is no hard limit on how long the paragraph is. Roughly, the longer "
        "the more 'diluted' the embedding will be.")
    messages = [word, sentence, paragraph]

    sentences = [
        # Smartphones
        "smartphone",
        "I bought a new phone yesterday.",
        # Clothes
        "Nike produces a new sports shirt.",
        "Your new sports shirt looks great.",
        # Food and health
        "Strawberry ice-cream",
        "Eating strawberries is healthy"
    ]

    m = TF_Sentence(
        '../weights/sentence/96e8f1d3d4d90ce86b2db128249eb8143a91db73')
    for i in messages:
        m.encode(i)
    m.run_and_plot(sentences)
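
    # A hedged extension of the demo above (not in the original source); it
    # assumes numpy is imported as np. Universal Sentence Encoder vectors are
    # roughly unit-length, so an inner product serves as a similarity score.
    shirt_a = np.array(m.encode(sentences[2]))  # "Nike produces a new sports shirt."
    shirt_b = np.array(m.encode(sentences[3]))  # "Your new sports shirt looks great."
    food = np.array(m.encode(sentences[5]))     # "Eating strawberries is healthy"
    print('shirt vs shirt:', float(np.inner(shirt_a, shirt_b)))
    print('shirt vs food :', float(np.inner(shirt_a, food)))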
class VideoPostDB(object):
    '''
    This class is for video post-processing. We insert key information about
    processed videos into a small database and index it for product insertion.
    '''

    def __init__(self, db_path, image_model=None, sentence_model=None, audio_model=None):
        '''
        :param db_path: the path of the sqlite database
        :param image_model: gets one image feature to represent the scene
        :param sentence_model: gets the subtitle feature of this scene shot
        :param audio_model: gets one audio feature of this scene shot
        '''
        self.database = db_path

        # For storing the processed features into a .pkl file.
        self.sample_list = list()

        if image_model is None:
            self.image_model = scene_visual('resnet50',
                                            '../weights/places365/{}.pth',
                                            '../weights/places365/categories.txt',
                                            'cuda:0')
        if sentence_model is None:
            self.sentence_model = TF_Sentence(
                '../weights/sentence/96e8f1d3d4d90ce86b2db128249eb8143a91db73')
        if audio_model is None:
            vgg_graph = tf.Graph()
            with vgg_graph.as_default():
                od_graph_def = tf.GraphDef()
                with tf.gfile.GFile('../weights/audioset/vggish_fr.pb', 'rb') as fid:
                    serialized_graph = fid.read()
                    od_graph_def.ParseFromString(serialized_graph)
                    tf.import_graph_def(od_graph_def, name='')
            self.audio_model = AudiosetFeatureExtractor(
                vgg_graph, '../weights/audioset/vggish_pca_params.npz')

        # Detect the shot boundaries and get the split frame and millisecond.
        self.shot_detector = Shot_Detector()

        # Create a simple hysia database or connect to the existing database.
        if not osp.isfile(self.database):
            statement = (
                'CREATE TABLE %s (tv_name TEXT, start_time TEXT, end_time TEXT, '
                'scene_name TEXT, image_path TEXT, image_feature BLOB, subtitle TEXT, '
                'subtitle_feature BLOB, audio_path TEXT, audio_feature BLOB, '
                'object_json_path TEXT, face_json_path TEXT, insert_product_path TEXT);')
            tables = ['video']
            statements = [statement % table for table in tables]
            self.db = SqliteDatabase(self.database, statements)
        else:
            self.db = SqliteDatabase(self.database)

    def process(self, video_path):
        '''
        :param video_path: a video path, which also helps to locate the subtitle and audio
        :return:
        '''
        # Read the video and process video, subtitle and audio information,
        # then save the features into the database.
        # TODO This part can be optimized during the video content analysis stage in version 0.2.
        video_name = video_path.split('/')[-1]
        splits_frame, splits_ms = self.shot_detector.detect(video_path)
        # print(splits_ms)

        # Get the middle frame as the key frame of each shot and package the split times.
        middle_frame = list()
        middle_time = {}
        for i in range(len(splits_ms) - 1):
            temp = math.floor((splits_frame[i] + splits_frame[i + 1]) / 2.0)
            middle_frame.append(temp)
            middle_time[temp] = [splits_ms[i], splits_ms[i + 1]]
        # print(middle_frame)
        # print(middle_time)

        vid = imageio.get_reader(video_path, 'ffmpeg')
        frame_cnt = vid.get_meta_data()["nframes"]

        # Get key-frame image features, subtitle features and audio features.
        try:
            with tqdm.tqdm(total=frame_cnt, unit="frames") as pbar:
                for id, img in enumerate(vid):
                    if id in middle_frame:
                        # TODO Try to compare image_io, cv.read and PIL.
                        # Get the image feature.
                        try:
                            scene_name, scene_feature = self.__get_scene_feature(img)
                        except:
                            # print('Can not extract image feature and assume it is none')
                            scene_feature = 'unknown_feature'
                            scene_name = 'unknown_scene'
                        try:
                            subtitle, subtitle_feature = self.__get_subtitle_feature(
                                middle_time[id][0], middle_time[id][1], video_path)
                        except:
                            # print('Can not extract subtitle feature and assume it is none')
                            subtitle_feature = 'unknown_feature'
                            subtitle = 'unknown_subtitle'
                        try:
                            audio_path, audio_feature = self.__get_audio_feature(
                                middle_time[id][0], middle_time[id][1], video_path)
                        except:
                            # print('Can not extract audio feature and assume it is none')
                            audio_feature = 'unknown_feature'
                            audio_path = 'unknown_audio_path'

                        # TODO get object json file and face json file.
                        sql = ("INSERT INTO video (tv_name, start_time, end_time, scene_name, "
                               "image_feature, subtitle, subtitle_feature, audio_path, audio_feature) "
                               "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)")
                        sample = (video_name, middle_time[id][0], middle_time[id][1],
                                  scene_name, scene_feature, subtitle, subtitle_feature,
                                  audio_path, audio_feature)
                        self.db.add(sql, sample)

                        # Pack the sample for the .pkl index.
                        self.__pack_samples(sample)
                    pbar.update(1)
        except:
            print('Drop the last frames')

        self.__insert_index(video_path)

    def __get_scene_feature(self, img):
        img_pil = Image.fromarray(img)
        scene_feature = self.image_model.extract_vec(img_pil, True)
        scene_name = self.image_model.detect(img_pil, True)
        # Only return the top-1 scene name.
        return scene_name['scene'][0], scene_feature

    def __get_subtitle_feature(self, start_time, end_time, video_path):
        srt_name = video_path.split("/")[-1].split(".")[0] + ".srt"
        srt_path = osp.join(osp.dirname(osp.abspath(video_path)), 'subtitles', srt_name)
        sentences = extract_srt(start_time, end_time, srt_path)
        if len(sentences) == 0:
            sentences_feature = 'unknown_feature'
            sentences = 'unknown_subtitle'
        else:
            # TODO Which data types does TEXT support (BLOB only supports numpy)?
            sentences = " ".join(sentences)
            sentences_feature = self.sentence_model.encode(sentences)
        return sentences, np.array(sentences_feature)

    def __get_audio_feature(self, start_time, end_time, video_path):
        audio_name = video_path.split("/")[-1].split(".")[0] + ".wav"
        audio_path = osp.join(osp.dirname(osp.abspath(video_path)), 'audios', audio_name)
        # command = "ffmpeg -i %s -ab 160k -ac 2 -ar 44100 -vn %s" % (video_path, audio_path)
        # subprocess.call(command, shell=True)
        audio_feature = self.audio_model.extract(audio_path, start_time, end_time)[0]
        return audio_path, audio_feature

    def __get_object_json(self):
        pass

    def __get_frame_path(self):
        pass

    def __get_face_json(self):
        pass

    def __pack_samples(self, sample):
        sample_dict = {'TV_NAME': sample[0],
                       'START_TIME': sample[1],
                       'END_TIME': sample[2],
                       'SCENE': sample[3],
                       'FEATURE': sample[4],
                       'SUBTITLE': sample[5],
                       'SUBTITLE_FEATURE': sample[6],
                       'AUDIO': sample[7],
                       'AUDIO_FEATURE': sample[8]}
        self.sample_list.append(sample_dict)

    def __insert_index(self, video_path):
        pkl_name = video_path.split("/")[-1].split(".")[0] + "_index.pkl"
        pkl_path = osp.join(osp.dirname(osp.abspath(video_path)), 'multi_features', pkl_name)
        with open(pkl_path, 'wb') as f:
            pickle.dump(self.sample_list, f)
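
# A minimal usage sketch (paths are placeholders, not from the original
# source): processing a video writes one row per detected shot into the sqlite
# database and dumps the packed samples to <video_name>_index.pkl under the
# video's multi_features/ directory.
post_db = VideoPostDB('/path/to/hysia.db')
post_db.process('/path/to/videos/friends_s01e01.mp4')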