예제 #1
0
    def test_concat_router(self):
        """Push three-part messages through the router and check the merged reply.

        The same message is sent three times (its envelope announces parts
        ``[1, 3]``); the single reply is expected to carry ``num_part == [1]``
        and embeddings whose second dimension is three times the input's.
        """
        router_args = set_router_parser().parse_args([
            '--yaml_path', self.concat_router_yaml,
            '--socket_out', str(SocketType.PUSH_BIND),
        ])
        # The client's ports are the router's ports swapped.
        client_args = _set_client_parser().parse_args([
            '--port_in', str(router_args.port_out),
            '--port_out', str(router_args.port_in),
            '--socket_in', str(SocketType.PULL_CONNECT),
        ])
        with RouterService(router_args), ZmqClient(client_args) as client:
            msg = gnes_pb2.Message()
            query_blob = array2blob(np.random.random([5, 2]))
            msg.request.search.query.chunk_embeddings.CopyFrom(query_blob)
            msg.envelope.num_part.extend([1, 3])

            # First round: search request with a [5, 2] query embedding.
            for _ in range(3):
                client.send_message(msg)
            reply = client.recv_message()
            self.assertSequenceEqual(reply.envelope.num_part, [1])
            print(reply.envelope.routes)
            self.assertEqual(
                reply.request.search.query.chunk_embeddings.shape, [5, 6])

            # Second round: add three index docs of widths 2, 4 and 6.
            for width in (2, 4, 6):
                new_doc = msg.request.index.docs.add()
                new_doc.chunk_embeddings.CopyFrom(
                    array2blob(np.random.random([5, width])))

            for _ in range(3):
                client.send_message(msg)
            reply = client.recv_message()
            self.assertSequenceEqual(reply.envelope.num_part, [1])
            for pos in range(3):
                merged_shape = reply.request.index.docs[pos].chunk_embeddings.shape
                self.assertEqual(merged_shape, [5, 6 * (pos + 1)])
예제 #2
0
    def apply(self, doc: 'gnes_pb2.Document') -> None:
        """Subsample the frames of every chunk and resize the kept frames.

        For each chunk the blob is decoded into a frame array, then:

        * no frames: a warning is logged and the chunk is left untouched;
        * ``self.sframes == 1``: only the middle frame is kept (resized, with
          a leading axis of length one);
        * ``self.sframes > 0`` and more frames than ``self.sframes``: frames
          are picked with a fixed stride when there are at least twice as many
          frames as requested, otherwise ``self.sframes`` random indices are
          drawn without replacement and sorted; the picked frames are resized;
        * any other case: the chunk blob is left as is (and not resized).

        :param doc: document whose chunk blobs are rewritten in place
        """
        super().apply(doc)

        if len(doc.chunks) == 0:
            self.logger.error(
                'bad document: "doc.chunks" is empty!')
            return

        def _resized(img):
            # PIL expects the target size as (width, height).
            size = (self.target_width, self.target_height)
            return np.array(Image.fromarray(img.astype('uint8')).resize(size))

        for chunk in doc.chunks:
            images = blob2array(chunk.blob)
            n_images = len(images)

            if n_images == 0:
                self.logger.warning("this chunk has no frame!")
            elif self.sframes == 1:
                middle = _resized(images[n_images // 2])
                chunk.blob.CopyFrom(array2blob(np.expand_dims(middle, axis=0)))
            elif self.sframes > 0 and n_images > self.sframes:
                if n_images >= 2 * self.sframes:
                    stride = math.ceil(n_images / self.sframes)
                    selected = images[::stride]
                else:
                    picks = np.sort(
                        np.random.choice(n_images, self.sframes, replace=False))
                    selected = images[picks]

                resized = np.array([_resized(img) for img in selected])
                chunk.blob.CopyFrom(array2blob(resized))
            # Drop the decoded array before moving on to the next chunk.
            del images
예제 #3
0
    def setUp(self) -> None:
        """Create ``self.doc`` with three chunks whose blobs hold 2, 3 and 4 rows."""
        self.doc = gnes_pb2.Document()

        # Each chunk takes a growing prefix of the same alternating rows.
        rows = [[1, 2, 3], [2, 3, 4], [1, 2, 3], [2, 3, 4]]
        for n_rows in (2, 3, 4):
            chunk = self.doc.chunks.add()
            chunk.blob.CopyFrom(array2blob(np.array(rows[:n_rows])))
예제 #4
0
    def apply(self, doc: 'gnes_pb2.Document') -> None:
        """Decode each chunk's animated WebP payload into a frame array.

        For every chunk, ``chunk.raw`` is decoded with ``webp.WebPAnimDecoder``,
        the last channel of each frame is dropped, and the stacked frames are
        written to ``chunk.blob``; ``chunk.offset`` is set to the chunk's
        position in the document. Finally each chunk's existing ``weight`` is
        divided by the total number of frames decoded for the document.

        If no frame was decoded at all, the weights are left untouched instead
        of dividing by zero.

        :param doc: document whose chunks are updated in place
        """
        super().apply(doc)

        self.logger.info("doc id is: %s" % str(doc.doc_id))

        total_frames = 0

        for offset, c in enumerate(doc.chunks):
            webp_data = webp.WebPData.from_buffer(c.raw)
            dec = webp.WebPAnimDecoder.new(webp_data)

            # Keep every channel but the last one (presumably alpha -- confirm
            # against the decoder's colour mode); the timestamp is not needed.
            image_list = [
                np.array(arr)[:, :, :-1].copy() for arr, _ in dec.frames()
            ]

            c.offset = offset
            c.blob.CopyFrom(array2blob(np.array(image_list)))
            total_frames += len(image_list)
            # Release the decoder explicitly before handling the next chunk.
            del dec

        self.logger.info("this doc has frames: %d! for doc %s" %
                         (total_frames, str(doc.doc_id)))

        if total_frames == 0:
            # Normalising would raise ZeroDivisionError; nothing to scale by.
            if len(doc.chunks) > 0:
                self.logger.warning(
                    "no frame decoded for doc %s, chunk weights are left unchanged"
                    % str(doc.doc_id))
            return

        for c in doc.chunks:
            c.weight /= total_frames
예제 #5
0
 def build_msgs2(self, seed=0):
     """Build 20 reproducible index messages filled with random chunk data.

     Both ``random`` and ``np.random`` are seeded with ``seed``. Each message
     holds 10-20 docs, each doc 10-20 chunks, and each chunk gets a random
     ``[10, 20, 30]`` array as embedding and another one as blob.

     :param seed: seed applied to both random number generators
     :return: list of the generated messages
     """
     random.seed(seed)
     np.random.seed(seed)

     messages = []
     for _ in range(20):
         message = gnes_pb2.Message()
         message.envelope.client_id = 'abc'
         doc_count = random.randint(10, 20)
         for _doc in range(doc_count):
             doc = message.request.index.docs.add()
             # original author's estimate: each doc is about 1MB to 10MB
             chunk_count = random.randint(10, 20)
             for _chunk in range(chunk_count):
                 chunk = doc.chunks.add()
                 chunk.embedding.CopyFrom(
                     array2blob(np.random.random([10, 20, 30])))
                 chunk.blob.CopyFrom(
                     array2blob(np.random.random([10, 20, 30])))
         messages.append(message)
     return messages
예제 #6
0
    def test_videoshot_indexer(self):
        """Add a one-chunk video document to the indexer; passes if nothing raises."""
        video_doc = gnes_pb2.Document()
        video_doc.doc_type = gnes_pb2.Document.VIDEO

        # The single chunk carries all fixture frames as its blob.
        frames_blob = array2blob(self.video_frames)
        video_doc.chunks.add().blob.CopyFrom(frames_blob)

        self.indexer.add([0], [video_doc])
예제 #7
0
 def apply(self, doc: 'gnes_pb2.Document') -> None:
     """Subsample the frames stored in every chunk of ``doc``.

     Per chunk: with ``self.sframes == 1`` only the middle frame is kept;
     with ``self.sframes > 0`` and more frames than that, frames are taken
     with a fixed stride when there are at least ``2 * self.sframes`` of
     them, otherwise ``self.sframes`` sorted random indices (drawn without
     replacement) are kept. Chunks with no frames trigger a warning; all
     remaining cases leave the blob unchanged.

     :param doc: document whose chunk blobs are rewritten in place
     """
     super().apply(doc)

     if len(doc.chunks) == 0:
         self.logger.error('bad document: "doc.chunks" is empty!')
         return

     for chunk in doc.chunks:
         frames = blob2array(chunk.blob)
         total = len(frames)

         if total == 0:
             self.logger.warning("this chunk has no frame!")
             continue

         if self.sframes == 1:
             # List index keeps the leading axis (result has length one).
             picked = frames[[total // 2]]
         elif self.sframes > 0 and total > self.sframes:
             if total >= 2 * self.sframes:
                 picked = frames[::math.ceil(total / self.sframes)]
             else:
                 picks = np.random.choice(total, self.sframes, replace=False)
                 picked = frames[np.sort(picks)]
         else:
             # Nothing to subsample: leave the chunk blob as it is.
             continue

         chunk.blob.CopyFrom(array2blob(picked))
예제 #8
0
    def test_dump_load(self):
        """Dump the indexer, load it back, and add a document to the loaded copy."""
        source_doc = gnes_pb2.Document()
        source_doc.doc_type = gnes_pb2.Document.VIDEO
        source_doc.chunks.add().blob.CopyFrom(array2blob(self.video_frames))

        doc_copy = copy.deepcopy(source_doc)

        # Round-trip the indexer through its dump file.
        self.indexer.dump(self.dump_path)
        restored = BaseDocIndexer.load(self.dump_path)

        restored.add([0], [doc_copy])
예제 #9
0
    def apply(self, doc: 'gnes_pb2.Document') -> None:
        """Split the document's raw video into shots, one chunk per shot.

        Frames are read from ``doc.raw_bytes`` (via ``video.capture_frames``)
        or, when the ``raw_data`` oneof holds an ``NdArray``, from
        ``doc.raw_video`` (truncated to the first ``self.vframes`` frames when
        that is positive). Every shot returned by ``self.detect_shots`` becomes
        a chunk weighted by its share of all frames; shots longer than
        ``self.sframes`` (when positive) are subsampled before being stored.
        The raw data is cleared afterwards if ``self.drop_raw_data`` is set.

        :param doc: document that receives the new chunks in place
        """
        super().apply(doc)

        video_frames = []
        raw_field = doc.WhichOneof('raw_data')

        if not raw_field:
            self.logger.error('bad document: "raw_data" is empty!')
        else:
            if doc.raw_bytes:
                video_frames = video.capture_frames(input_data=doc.raw_bytes,
                                                    scale=self.frame_size,
                                                    fps=self.frame_rate,
                                                    vframes=self.vframes)
            elif type(getattr(doc, raw_field)) == gnes_pb2.NdArray:
                video_frames = blob2array(doc.raw_video)
                if self.vframes > 0:
                    video_frames = video_frames[0:self.vframes, :].copy()

            num_frames = len(video_frames)
            if num_frames == 0:
                self.logger.error(
                    'bad document: "raw_bytes" or "raw_video" is empty!')
            else:
                for ci, shot in enumerate(self.detect_shots(video_frames)):
                    chunk = doc.chunks.add()
                    chunk.doc_id = doc.doc_id
                    chunk.offset = ci
                    shot_len = len(shot)
                    # Weight reflects the full shot length, before subsampling.
                    chunk.weight = shot_len / num_frames

                    if 0 < self.sframes < shot_len:
                        if shot_len >= 2 * self.sframes:
                            shot = shot[::math.ceil(shot_len / self.sframes)]
                        else:
                            picks = np.sort(
                                np.random.choice(shot_len,
                                                 self.sframes,
                                                 replace=False))
                            shot = [shot[pick] for pick in picks]

                    chunk.blob.CopyFrom(array2blob(np.array(shot)))

        if self.drop_raw_data:
            self.logger.info("document raw data will be cleaned!")
            doc.ClearField('raw_data')
예제 #10
0
    def test_webp_encoder(self):
        """Encoding from ``raw_video`` and from a chunk blob must give the same bytes."""
        frames_blob = array2blob(self.video_frames)

        # Case 1: frames supplied as the document's raw video.
        video_doc = gnes_pb2.Document()
        video_doc.doc_type = gnes_pb2.Document.VIDEO
        video_doc.raw_video.CopyFrom(frames_blob)
        self.webp_encoder.apply(video_doc)
        encoded_from_video = copy.deepcopy(video_doc)

        # Case 2: the same frames supplied as a chunk blob.
        chunk_doc = gnes_pb2.Document()
        chunk_doc.doc_type = gnes_pb2.Document.VIDEO
        chunk_doc.chunks.add().blob.CopyFrom(frames_blob)
        self.webp_encoder.apply(chunk_doc)
        encoded_from_chunk = copy.deepcopy(chunk_doc)

        self.assertEqual(encoded_from_video.raw_bytes,
                         encoded_from_chunk.chunks[0].raw)
예제 #11
0
    def test_empty_service(self):
        """Send one image doc through an encoder service and check the reply."""
        yaml_spec = '!TestEncoder {gnes_config: {name: EncoderService, is_trained: true}}'
        service_args = set_encoder_parser().parse_args(['--yaml_path', yaml_spec])
        # The client's ports are the service's ports swapped.
        client_args = _set_client_parser().parse_args([
            '--port_in', str(service_args.port_out),
            '--port_out', str(service_args.port_in),
        ])

        with ServiceManager(EncoderService, service_args), \
                ZmqClient(client_args) as client:
            request = gnes_pb2.Message()
            image_doc = request.request.index.docs.add()
            image_doc.doc_type = gnes_pb2.Document.IMAGE

            image_chunk = image_doc.chunks.add()
            image_chunk.blob.CopyFrom(array2blob(self.test_numeric))

            client.send_message(request)
            reply = client.recv_message()
            self.assertEqual(len(reply.request.index.docs), 1)
            self.assertEqual(reply.response.index.status,
                             gnes_pb2.Response.SUCCESS)
예제 #12
0
    def test_dump_load(self):
        """A dumped-and-reloaded encoder must produce the same bytes as the original."""
        frames_blob = array2blob(self.video_frames)

        # Encode from raw video with the encoder under test.
        video_doc = gnes_pb2.Document()
        video_doc.doc_type = gnes_pb2.Document.VIDEO
        video_doc.raw_video.CopyFrom(frames_blob)
        self.mp4_encoder.apply(video_doc)
        expected = copy.deepcopy(video_doc)

        # Round-trip the encoder through its dump file.
        self.mp4_encoder.dump(self.dump_path)
        restored_encoder = BaseVideoPreprocessor.load(self.dump_path)

        # Encode the same frames from a chunk blob with the reloaded encoder.
        chunk_doc = gnes_pb2.Document()
        chunk_doc.doc_type = gnes_pb2.Document.VIDEO
        chunk_doc.chunks.add().blob.CopyFrom(frames_blob)
        restored_encoder.apply(chunk_doc)
        actual = copy.deepcopy(chunk_doc)

        self.assertEqual(expected.raw_bytes, actual.chunks[0].raw)
예제 #13
0
    def test_combine_score_fn(self):
        """Smoke-test ``query_and_score`` with each chunk score function.

        No result is asserted; the test passes as long as indexing and
        querying run without raising for every score function.
        """
        from gnes.indexer.chunk.helper import ListKeyIndexer
        from gnes.indexer.chunk.numpy import NumpyIndexer
        from gnes.proto import array2blob
        import numpy as np

        query = gnes_pb2.Chunk()
        query.doc_id = 2
        query.weight = 0.3
        query.offset = 0
        query.embedding.CopyFrom(array2blob(np.array([3, 3, 3])))

        score_fn_classes = (
            WeightedChunkOffsetScoreFn,
            CoordChunkScoreFn,
            TFIDFChunkScoreFn,
            BM25ChunkScoreFn,
        )
        for score_fn_cls in score_fn_classes:
            # A fresh indexer per score function keeps the runs independent.
            key_indexer = ListKeyIndexer()
            indexer = NumpyIndexer(helper_indexer=key_indexer,
                                   score_fn=score_fn_cls())
            indexer.add(keys=[(0, 1), (1, 2)],
                        vectors=np.array([[1, 1, 1], [2, 2, 2]]),
                        weights=[0.5, 0.8])
            indexer.query_and_score(q_chunks=[query], top_k=2)
예제 #14
0
    def apply(self, doc: 'gnes_pb2.Document') -> None:
        """Split the document's raw video into shots, one chunk per shot.

        Frames are read from ``doc.raw_bytes`` (via ``video.capture_frames``)
        or, when the ``raw_data`` oneof holds an ``NdArray``, from
        ``doc.raw_video``. In the latter case, a positive ``self.frame_num``
        makes the frames be sampled with a fixed integer stride so that at
        most ``self.frame_num`` of them remain. Every shot returned by
        ``self.detect_shots`` becomes a chunk whose weight is its share of all
        frames. The raw data is cleared afterwards if ``self.drop_raw_data``
        is set.

        :param doc: document that receives the new chunks in place
        """
        super().apply(doc)

        video_frames = []

        if doc.WhichOneof('raw_data'):
            raw_type = type(getattr(doc, doc.WhichOneof('raw_data')))
            if doc.raw_bytes:
                video_frames = video.capture_frames(input_data=doc.raw_bytes,
                                                    scale=self.frame_size,
                                                    fps=self.frame_rate,
                                                    vframes=self.frame_num)
            elif raw_type == gnes_pb2.NdArray:
                video_frames = blob2array(doc.raw_video)
                if self.frame_num > 0:
                    # A slice step must be a non-zero integer: true division
                    # would produce a float (TypeError). Use ceiling division,
                    # clamped to 1, so at most ``frame_num`` frames are kept.
                    # Assumes ``self.frame_num`` is an integer.
                    stepwise = max(
                        1, -(-len(video_frames) // self.frame_num))
                    video_frames = video_frames[0::stepwise, :]

            num_frames = len(video_frames)
            if num_frames > 0:
                shots = self.detect_shots(video_frames)
                for ci, frames in enumerate(shots):
                    c = doc.chunks.add()
                    c.doc_id = doc.doc_id
                    chunk_data = np.array(frames)
                    c.blob.CopyFrom(array2blob(chunk_data))
                    c.offset = ci
                    c.weight = len(frames) / num_frames
            else:
                self.logger.error(
                    'bad document: "raw_bytes" or "raw_video" is empty!')
        else:
            self.logger.error('bad document: "raw_data" is empty!')

        if self.drop_raw_data:
            self.logger.info("document raw data will be cleaned!")
            doc.ClearField('raw_data')
예제 #15
0
    def test_empty_service(self):
        """Index one chunk through an indexer service and expect a SUCCESS reply."""
        yaml_spec = '!BaseChunkIndexer {gnes_config: {name: IndexerService}}'
        service_args = set_indexer_parser().parse_args(
            ['--yaml_path', yaml_spec])
        # The client's ports are the service's ports swapped.
        client_args = _set_client_parser().parse_args([
            '--port_in', str(service_args.port_out),
            '--port_out', str(service_args.port_in),
        ])

        with ServiceManager(IndexerService, service_args), \
                ZmqClient(client_args) as client:
            request = gnes_pb2.Message()
            new_doc = request.request.index.docs.add()

            chunk = new_doc.chunks.add()
            chunk.doc_id = 0
            chunk.embedding.CopyFrom(array2blob(self.test_numeric))
            chunk.offset = 0
            chunk.weight = 1.0

            client.send_message(request)
            reply = client.recv_message()
            self.assertEqual(reply.response.index.status,
                             gnes_pb2.Response.SUCCESS)
예제 #16
0
 def test_array_proto(self):
     """A random 5x4 array must survive the array -> blob -> array round trip."""
     original = np.random.random([5, 4])
     restored = blob2array(array2blob(original))
     assert_array_equal(original, restored)