Example #1
class SubtitleIndex(BasicIndex):
    '''
    For the sentence (subtitle) index.
    '''
    def __init__(self, dimension, dir_path):
        super(SubtitleIndex, self).__init__(dimension, dir_path)
        self.sentence_feature = None
        self.sentence_model = TF_Sentence(
            osp.join(
                THIS_DIR,
                '../../weights/sentence/96e8f1d3d4d90ce86b2db128249eb8143a91db73'
            ))

    def index(self, tv_name=None):
        self.database = []

        # Collect the per-video index pickles: either the single file for the
        # given tv_name, or every *_index.pkl in the directory. Keep both
        # cases as lists so the loop below iterates over file paths rather
        # than over the characters of a single string.
        if tv_name is not None:
            tv_files = ['{}/{}_index.pkl'.format(self.dir_path, tv_name)]
        else:
            tv_files = glob.glob('{}/*index.pkl'.format(self.dir_path))

        for i in tv_files:
            with open(i, 'rb') as f:
                data = pickle.load(f)
            self.database.extend(data)
        print('Finish loading {} samples'.format(len(self.database)))

        self.sentence_feature = list()
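        # Encode every subtitle once and cache the vectors in sentence.pkl so
        # that later runs can load them instead of re-running the model.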
        if osp.isfile('{}/sentence.pkl'.format(self.dir_path)):
            print('The sentence has been encoded')
            with open(self.dir_path + '/sentence.pkl', 'rb') as f:
                self.sentence_feature = pickle.load(f)

        else:
            print('Start to encode sentence')
            for i in self.database:
                sentence_feature = self.sentence_model.encode(i['SUBTITLE'])
                self.sentence_feature.append(sentence_feature)
            self.sentence_feature = np.array(self.sentence_feature)
            with open(self.dir_path + '/sentence.pkl', 'wb') as f:
                pickle.dump(self.sentence_feature, f)

        # TypeError: ndarrays must be of numpy.float32, and not float64.
        self.gpu_index_flat.add(self.sentence_feature.astype(np.float32))
        print('Finish loading {} sentence features'.format(
            self.gpu_index_flat.ntotal))
Example #2
if __name__ == '__main__':
    # image query: extract a scene feature vector for a local test image
    temp = cv2.imread('test_sofa2.jpg')
    q_tensor = Image.fromarray(cv2.cvtColor(temp, cv2.COLOR_BGR2RGB))
    q_vec = scene_model.extract_vec(q_tensor, True)
    print(q_vec.shape)

    gpu_machine = BasicIndex.init_size(VIDEO_DATA_PATH)
    gpu_machine.index()
    tic = time.time()
    results = gpu_machine.search(q_vec, 10)
    toc = time.time()
    print("It takes " + str((toc - tic)) + " ms")

    sentence_model = TF_Sentence(
        '../weights/sentence/96e8f1d3d4d90ce86b2db128249eb8143a91db73')
    product_description_vector = sentence_model.encode(
        'Nike sports shirts is very good.')

    sentence_machine = SubtitleIndex(512, VIDEO_DATA_PATH)
    sentence_machine.index()
    tic = time.time()
    results = sentence_machine.search(
        np.array(product_description_vector).astype(np.float32), 30)
    toc = time.time()
    print("It takes " + str((toc - tic)) + " ms")

    print(results)
    # show results
    for i in results:
        frame = cv2.imread(i['IMAGE'])
        cv2.imshow('show_img', frame)
        # wait for a key press so each result window is actually rendered
        cv2.waitKey(0)
Example #3
class BasicSearch(object):
    '''
    This is the product-content match module.
    '''
    def __init__(self, database):
        '''
        :param database: the directory that contains the .pkl index files
        '''
        try:
            self.database = database
            self.image_machine = BasicIndex.init_size(self.database)
            self.sentence_machine = SubtitleIndex(512, self.database)

            self.image_machine.index()
            print('Finish loading image GPU indexer')

            self.sentence_machine.index()
            print('Finish loading sentence GPU indexer')

        except ValueError:
            print("Please specify the pkl directory!")

        # TODO load scene/imagenet/faster-rcnn model to extract query image feature.
        self.scene_model = scene_visual(
            'resnet50', os.path.join(THIS_DIR,
                                     '../../weights/places365/{}.pth'),
            os.path.join(THIS_DIR, '../../weights/places365/categories.txt'),
            'cuda:0')

        self.sentence_model = TF_Sentence(
            os.path.join(
                THIS_DIR,
                '../../weights/sentence/96e8f1d3d4d90ce86b2db128249eb8143a91db73'
            ))

    def search(self,
               image_query,
               subtitle_query=None,
               audio_query=None,
               face_query=None,
               topK=30):
        '''
        :param image_query: path to the query image (read with cv2.imread)
        :param subtitle_query: query text describing the product
        :param audio_query: reserved for future use
        :param face_query: reserved for future use
        :param topK: number of results to return
        :return: a list of matched samples (empty if no query is given)
        '''

        # search subtitle first.
        if (image_query is not None) and (subtitle_query is not None):
            product_description_vector = self.sentence_model.encode(
                subtitle_query)
            tic = time.time()
            _, idx, _ = self.sentence_machine.search(
                np.array(product_description_vector).astype(np.float32), topK)
            toc = time.time()
            print("Searching subtitle takes " + str((toc - tic)) + " ms")

            temp = cv2.imread(image_query)
            q_tensor = Image.fromarray(cv2.cvtColor(temp, cv2.COLOR_BGR2RGB))
            q_vec = self.scene_model.extract_vec(q_tensor, True)
            print(q_vec.shape)

            tic = time.time()
            self.image_machine.re_index(idx)
            results, _, _ = self.image_machine.search(q_vec, topK, 2)
            toc = time.time()
            print("Searching image takes " + str((toc - tic)) + " ms")

        elif subtitle_query is not None:
            product_description_vector = self.sentence_model.encode(
                subtitle_query)
            tic = time.time()
            results, _, _ = self.sentence_machine.search(
                np.array(product_description_vector).astype(np.float32), topK)
            toc = time.time()
            print("Searching subtitle takes " + str((toc - tic)) + " ms")

        elif image_query is not None:
            temp = cv2.imread(image_query)
            q_tensor = Image.fromarray(cv2.cvtColor(temp, cv2.COLOR_BGR2RGB))
            q_vec = self.scene_model.extract_vec(q_tensor, True)
            print(q_vec.shape)

            tic = time.time()
            results, _, _ = self.image_machine.search(q_vec, topK)
            toc = time.time()
            print("Searching image takes " + str((toc - tic)) + " ms")
        else:
            results = []

        return results
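A minimal usage sketch for BasicSearch (the pkl directory and query image paths below are hypothetical placeholders, not part of the original module):

# Build the searcher once; both GPU indexers are loaded in __init__.
searcher = BasicSearch('/path/to/pkl_dir')

# Combined query: subtitle hits narrow the image search via re_index().
results = searcher.search(image_query='query.jpg',
                          subtitle_query='Nike sports shirts is very good.',
                          topK=30)

# Subtitle-only query.
results = searcher.search(image_query=None,
                          subtitle_query='strawberry ice-cream',
                          topK=30)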
Example #4
class DatabasePklSearch(BasicSearch):
    '''
    Searches the .pkl indexes first and then connects to the database for further operations.
    '''
    def __init__(self, database):
        '''
        :param database: the directory that contains the .pkl index files
        '''

        try:
            self.database = database

            self.image_machine = BasicIndex(2048, self.database)
            print('Finish loading scene-image GPU indexer')

            self.sentence_machine = BasicIndex(512, self.database)
            print('Finish loading scene-sentence GPU indexer')

        except ValueError:
            print("Please specify the pkl directory!")

        # TODO load scene/imagenet/faster-rcnn model to extract query image feature.
        self.scene_model = scene_visual(
            'resnet50', os.path.join(THIS_DIR,
                                     '../../weights/places365/{}.pth'),
            os.path.join(THIS_DIR, '../../weights/places365/categories.txt'),
            'cuda:0')

        self.sentence_model = TF_Sentence(
            os.path.join(
                THIS_DIR,
                '../../weights/sentence/96e8f1d3d4d90ce86b2db128249eb8143a91db73'
            ))

    def search(self,
               image_query,
               subtitle_query=None,
               audio_query=None,
               face_query=None,
               topK=30,
               tv_name=None):
        '''
        :param image_query: path to the query image (read with cv2.imread)
        :param subtitle_query: query text describing the product
        :param audio_query: reserved for future use
        :param face_query: reserved for future use
        :param topK: number of results to return
        :param tv_name: restrict the search to a single video's index
        :return: a list of matched samples (empty if no query is given)
        '''

        if tv_name is not None:
            self.image_machine.index(tv_name=tv_name)
            self.sentence_machine.index(feature='SUBTITLE_FEATURE',
                                        tv_name=tv_name)
        else:
            self.image_machine.index()
            self.sentence_machine.index(feature='SUBTITLE_FEATURE')

        # search subtitle first.
        if (image_query is not None) and (subtitle_query is not None):
            product_description_vector = self.sentence_model.encode(
                subtitle_query)
            tic = time.time()
            _, idx, _ = self.sentence_machine.search(
                np.array(product_description_vector).astype(np.float32), topK)
            toc = time.time()
            print("Searching subtitle takes " + str((toc - tic)) + " ms")

            temp = cv2.imread(image_query)
            q_tensor = Image.fromarray(cv2.cvtColor(temp, cv2.COLOR_BGR2RGB))
            q_vec = self.scene_model.extract_vec(q_tensor, True)
            # print(q_vec.shape)

            tic = time.time()
            self.image_machine.re_index(idx)
            results, _, _ = self.image_machine.search(q_vec, topK, 2)
            toc = time.time()
            print("Searching image takes " + str((toc - tic)) + " ms")

        elif subtitle_query is not None:
            product_description_vector = self.sentence_model.encode(
                subtitle_query)
            tic = time.time()
            results, _, _ = self.sentence_machine.search(
                np.array(product_description_vector).astype(np.float32), topK)
            toc = time.time()
            print("Searching subtitle takes " + str((toc - tic)) + " ms")

        elif image_query is not None:
            temp = cv2.imread(image_query)
            q_tensor = Image.fromarray(cv2.cvtColor(temp, cv2.COLOR_BGR2RGB))
            q_vec = self.scene_model.extract_vec(q_tensor, True)
            # print(q_vec.shape)

            tic = time.time()
            results, _, _ = self.image_machine.search(q_vec, topK)
            toc = time.time()
            print("Searching image takes " + str((toc - tic)) + " ms")
        else:
            results = []

        # TODO better reset design.
        self.sentence_machine.reset_GPU()
        print("Reset sentence GPU indexer")
        self.image_machine.reset_GPU()
        print("Reset product image GPU indexer")

        return results

    def add_pkl(self):
        '''
        Every time we finish processing one video, we should add a pkl file
        :return:
        '''
        pass
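A similar usage sketch for DatabasePklSearch (the paths and the tv_name value are hypothetical). Unlike BasicSearch, the indexes are rebuilt on every call and reset afterwards, so each call can target a different video:

searcher = DatabasePklSearch('/path/to/pkl_dir')
results = searcher.search(image_query='query.jpg',
                          subtitle_query='a new sports shirt',
                          topK=30,
                          tv_name='friends_s01e01')
# Result keys follow the sample dict packed by VideoPostDB in the last example.
for r in results:
    print(r['TV_NAME'], r['START_TIME'], r['END_TIME'], r['SUBTITLE'])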
Example #5
if __name__ == '__main__':
    word = "Elephant"
    sentence = "I am a sentence for which I would like to get its embedding."
    paragraph = (
        "Universal Sentence Encoder embeddings also support short paragraphs. "
        "There is no hard limit on how long the paragraph is. Roughly, the longer "
        "the more 'diluted' the embedding will be.")
    messages = [word, sentence, paragraph]

    sentences = [
        # Smartphones
        "smartphone",
        "I bought a new phone yesterday.",

        # Clothes
        "Nike produces a new sports shirt.",
        "Your new sports shirt looks great.",

        # Food and health
        "Strawberry ice-cream",
        "Eating strawberries is healthy"
    ]

    m = TF_Sentence(
        '../weights/sentence/96e8f1d3d4d90ce86b2db128249eb8143a91db73')

    for i in messages:
        m.encode(i)

    m.run_and_plot(sentences)
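run_and_plot presumably visualizes pairwise similarity between the sentences above. The short sketch below, appended to the same __main__ block, shows the underlying computation, assuming encode() returns a plain embedding vector (for example a 512-dimensional list or array):

    import numpy as np

    def cosine_similarity(a, b):
        # similarity of two embeddings; closer to 1.0 means more related
        a = np.asarray(a, dtype=np.float32)
        b = np.asarray(b, dtype=np.float32)
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    v1 = m.encode("Nike produces a new sports shirt.")
    v2 = m.encode("Your new sports shirt looks great.")
    v3 = m.encode("Strawberry ice-cream")
    print(cosine_similarity(v1, v2))  # related sentences -> higher score
    print(cosine_similarity(v1, v3))  # unrelated sentences -> lower score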
Example #6
class VideoPostDB(object):
    '''
    This class handles video post-processing.
    It inserts key information from processed videos into a small database and
    indexes it so that products can be inserted later.
    '''

    def __init__(self, db_path, image_model=None, sentence_model=None, audio_model=None):
        '''

        :param db_path: the path of sqlite database
        :param image_model: get only one image feature to represent the scene
        :param sentence_model: get the subtitle feature in this scene shot
        :param audio_model: get one audio feature in this scene shot
        '''

        self.database = db_path

        # for store the processed feature into .pkl file
        self.sample_list = list()

        if image_model is None:
            self.image_model = scene_visual('resnet50', '../weights/places365/{}.pth',
                                            '../weights/places365/categories.txt',
                                            'cuda:0')
        else:
            self.image_model = image_model

        if sentence_model is None:
            self.sentence_model = TF_Sentence('../weights/sentence/96e8f1d3d4d90ce86b2db128249eb8143a91db73')
        else:
            self.sentence_model = sentence_model

        if audio_model is None:
            # build the VGGish audio model from a frozen TensorFlow graph
            vgg_graph = tf.Graph()
            with vgg_graph.as_default():
                od_graph_def = tf.GraphDef()
                with tf.gfile.GFile('../weights/audioset/vggish_fr.pb', 'rb') as fid:
                    serialized_graph = fid.read()
                    od_graph_def.ParseFromString(serialized_graph)
                    tf.import_graph_def(od_graph_def, name='')
            self.audio_model = AudiosetFeatureExtractor(vgg_graph, '../weights/audioset/vggish_pca_params.npz')
        else:
            self.audio_model = audio_model

        # detect the shot bound and get the split frame and millisecond
        self.shot_detector = Shot_Detector()

        # create a simple hysia database or connect to the exist database
        if not osp.isfile(self.database):
            statement = (
                'CREATE TABLE %s (tv_name TEXT, start_time TEXT, end_time TEXT, scene_name TEXT, image_path TEXT, image_feature BLOB, subtitle TEXT, subtitle_feature BLOB, audio_path TEXT, audio_feature BLOB, object_json_path TEXT, face_json_path TEXT, insert_product_path TEXT);')
            tables = ['video']
            statements = [statement % table for table in tables]
            self.db = SqliteDatabase(self.database, statements)
        else:
            self.db = SqliteDatabase(self.database)

    def process(self, video_path):
        '''
        :param video_path: path to the video; the matching subtitle and audio
            files are located relative to it, and the packed features are
            dumped to a pickle file next to it
        :return:
        '''
        # read video and process video, subtitle and audio information
        # save features into the database
        # TODO This part can be optimized during video content analysis stage in the 0.2 version

        video_name = video_path.split('/')[-1]

        splits_frame, splits_ms = self.shot_detector.detect(video_path)
        # print(splits_ms)
        # get middle frame as the key frame in this shot
        # package the split time
        middle_frame = list()
        middle_time = {}

        for i in range(len(splits_ms) - 1):
            temp = math.floor((splits_frame[i] + splits_frame[i + 1]) / 2.0)
            middle_frame.append(temp)
            middle_time[temp] = [splits_ms[i], splits_ms[i + 1]]
        # print(middle_frame)
        # print(middle_time)

        vid = imageio.get_reader(video_path, 'ffmpeg')
        frame_cnt = vid.get_meta_data()["nframes"]

        # get key frame image features, subtitle features and audio features
        try:
            with tqdm.tqdm(total=frame_cnt, unit="frames") as pbar:
                for id, img in enumerate(vid):
                    if id in middle_frame:
                        # TODO Try to classify image_io, cv.read, PIL
                        # get image feature
                        try:
                            scene_name, scene_feature = self.__get_scene_feature(img)
                        except Exception:
                            # print('Can not extract image feature and assume it is none')
                            scene_feature = 'unknown_feature'
                            scene_name = 'unknown_scene'

                        try:
                            subtitle, subtitle_feature = self.__get_subtitle_feature(middle_time[id][0], middle_time[id][1],
                                                                                 video_path)
                        except Exception:
                            # print('Can not extract subtitle feature and assume it is none')
                            subtitle_feature = 'unknown_feature'
                            subtitle = 'unknown_subtitle'

                        try:
                            audio_path, audio_feature = self.__get_audio_feature(middle_time[id][0], middle_time[id][1],
                                                                                 video_path)

                        except Exception:
                            # print('Can not extract audio feature and assume it is none')
                            audio_feature = 'unknown_feature'
                            audio_path = 'unknown_audio_path'

                        # TODO get object json file and face json file

                        sql = "INSERT INTO video (tv_name, start_time, end_time, scene_name, image_feature, subtitle, subtitle_feature, audio_path, audio_feature) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"
                        sample = (
                        video_name, middle_time[id][0], middle_time[id][1], scene_name, scene_feature, subtitle,
                        subtitle_feature, audio_path, audio_feature)

                        self.db.add(sql, sample)

                        # pack
                        self.__pack_samples(sample)

                    pbar.update(1)
        except Exception:
            print('Dropping the last frames that could not be read')

        self.__insert_index(video_path)

    def __get_scene_feature(self, img):
        img_pil = Image.fromarray(img)
        scene_feature = self.image_model.extract_vec(img_pil, True)
        scene_name = self.image_model.detect(img_pil, True)
        # only return top 1 scene name
        return scene_name['scene'][0], scene_feature

    def __get_subtitle_feature(self, start_time, end_time, video_path):
        srt_name = video_path.split("/")[-1].split(".")[0] + ".srt"
        srt_path = osp.join(osp.dirname(osp.abspath(video_path)), 'subtitles', srt_name)
        sentences = extract_srt(start_time, end_time, srt_path)
        if len(sentences) == 0:
            sentences_feature = 'unknown_feature'
            sentences = 'unknown_subtitle'
        else:
            # TODO check which data types the TEXT column supports (BLOB only supports numpy arrays)
            sentences = " ".join(sentences)
            sentences_feature = self.sentence_model.encode(sentences)

        return sentences, np.array(sentences_feature)

    def __get_audio_feature(self, start_time, end_time, video_path):
        audio_name = video_path.split("/")[-1].split(".")[0] + ".wav"
        audio_path = osp.join(osp.dirname(osp.abspath(video_path)), 'audios', audio_name)
        # command = "ffmpeg -i %s -ab 160k -ac 2 -ar 44100 -vn %s" % (video_path, audio_path)
        # subprocess.call(command, shell=True)

        audio_feature = self.audio_model.extract(audio_path, start_time, end_time)[0]
        return audio_path, audio_feature

    def __get_object_json(self):
        pass

    def __get_frame_path(self):
        pass

    def __get_face_json(self):
        pass

    def __pack_samples(self, sample):
        sample_dict = {'TV_NAME': sample[0],
                       'START_TIME': sample[1],
                       'END_TIME': sample[2],
                       'SCENE': sample[3],
                       'FEATURE': sample[4],
                       'SUBTITLE': sample[5],
                       'SUBTITLE_FEATURE': sample[6],
                       'AUDIO': sample[7],
                       'AUDIO_FEATURE': sample[8]
                       }

        self.sample_list.append(sample_dict)

    def __insert_index(self, video_path):
        pkl_name = video_path.split("/")[-1].split(".")[0] + "_index.pkl"
        pkl_path = osp.join(osp.dirname(osp.abspath(video_path)), 'multi_features', pkl_name)
        with open(pkl_path, 'wb') as f:
            pickle.dump(self.sample_list, f)
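A minimal usage sketch for VideoPostDB (the database and video paths are hypothetical). The private helpers expect, next to the video file, a subtitles/<name>.srt, an audios/<name>.wav and a multi_features/ directory for the output pickle:

post_db = VideoPostDB('/path/to/hysia.db')
post_db.process('/path/to/videos/friends_s01e01.mp4')
# Rows are inserted into the 'video' table and the packed samples are dumped
# to /path/to/videos/multi_features/friends_s01e01_index.pkl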