def test_concat_router(self):
    """Exercise the concat router with search and index requests.

    The same message is sent three times with ``num_part`` set to
    ``[1, 3]``; the test expects one reply whose ``num_part`` is ``[1]``
    and whose embeddings are three times as wide as the ones sent.
    """
    router_args = set_router_parser().parse_args([
        '--yaml_path', self.concat_router_yaml,
        '--socket_out', str(SocketType.PUSH_BIND),
    ])
    # The client ports mirror the router's: its input is the router's output.
    client_args = _set_client_parser().parse_args([
        '--port_in', str(router_args.port_out),
        '--port_out', str(router_args.port_in),
        '--socket_in', str(SocketType.PULL_CONNECT),
    ])

    def send_thrice_and_receive(client, message):
        # Three identical parts go out, one reply is read back.
        for _ in range(3):
            client.send_message(message)
        return client.recv_message()

    with RouterService(router_args), ZmqClient(client_args) as client:
        msg = gnes_pb2.Message()
        msg.request.search.query.chunk_embeddings.CopyFrom(
            array2blob(np.random.random([5, 2])))
        msg.envelope.num_part.extend([1, 3])

        # Search request: 3 parts of shape [5, 2] are expected back as [5, 6].
        reply = send_thrice_and_receive(client, msg)
        self.assertSequenceEqual(reply.envelope.num_part, [1])
        print(reply.envelope.routes)
        self.assertEqual(
            reply.request.search.query.chunk_embeddings.shape, [5, 6])

        # Index request: three docs with embedding widths 2, 4 and 6.
        for width in (2, 4, 6):
            new_doc = msg.request.index.docs.add()
            new_doc.chunk_embeddings.CopyFrom(
                array2blob(np.random.random([5, width])))

        reply = send_thrice_and_receive(client, msg)
        self.assertSequenceEqual(reply.envelope.num_part, [1])
        for pos in range(3):
            self.assertEqual(
                reply.request.index.docs[pos].chunk_embeddings.shape,
                [5, 6 * (pos + 1)])
def apply(self, doc: 'gnes_pb2.Document') -> None:
    """Select frames from every chunk and resize them to the target size.

    For each chunk the frame array is read from ``chunk.blob``:

    * no frames: a warning is logged and the blob is left untouched;
    * ``sframes == 1``: only the middle frame is kept;
    * ``sframes > 0`` and more frames than ``sframes``: frames are taken at
      a regular step when there are at least ``2 * sframes`` of them,
      otherwise ``sframes`` distinct frames are drawn at random and kept in
      their original order;
    * any other case: the blob is left untouched (and not resized).

    Selected frames are cast to ``uint8``, resized to
    ``(target_width, target_height)`` and written back to ``chunk.blob``.

    :param doc: document whose chunks are modified in place.
    """
    super().apply(doc)

    if len(doc.chunks) == 0:
        self.logger.error(
            'bad document: "doc.chunks" is empty!')
        return

    for chunk in doc.chunks:
        images = blob2array(chunk.blob)
        n_images = len(images)
        picked = None  # frames to resize and store; None means "leave blob as is"

        if n_images == 0:
            self.logger.warning("this chunk has no frame!")
        elif self.sframes == 1:
            # List index keeps the leading frame axis (length 1).
            picked = images[[int(n_images / 2)]]
        elif self.sframes > 0 and n_images > self.sframes:
            if n_images >= 2 * self.sframes:
                picked = images[::math.ceil(n_images / self.sframes)]
            else:
                keep = np.sort(
                    np.random.choice(n_images, self.sframes, replace=False))
                picked = images[keep]

        if picked is not None:
            resized = np.array([
                np.array(
                    Image.fromarray(frame.astype('uint8')).resize(
                        (self.target_width, self.target_height)))
                for frame in picked
            ])
            chunk.blob.CopyFrom(array2blob(resized))

        del images
def setUp(self) -> None:
    """Create ``self.doc`` with three chunks holding 2, 3 and 4 rows."""
    self.doc = gnes_pb2.Document()

    # Each chunk's blob is a prefix of this row table.
    rows = [[1, 2, 3], [2, 3, 4], [1, 2, 3], [2, 3, 4]]
    for n_rows in (2, 3, 4):
        chunk = self.doc.chunks.add()
        chunk.blob.CopyFrom(array2blob(np.array(rows[:n_rows])))
def apply(self, doc: 'gnes_pb2.Document') -> None:
    """Decode each chunk's animated WebP bytes into an array of frames.

    For every chunk, ``c.raw`` is decoded with ``webp.WebPAnimDecoder``, the
    last channel of each decoded frame is dropped, and the stacked frames
    are stored in ``c.blob``. ``c.offset`` is set to the chunk's position in
    ``doc.chunks``. Finally each chunk weight is divided by the total number
    of frames decoded over the whole document.

    If chunks exist but no frame could be decoded, the weights are left
    unchanged and a warning is logged instead of dividing by zero.

    :param doc: document whose chunks are modified in place.
    """
    super().apply(doc)
    self.logger.info("doc id is: %s" % str(doc.doc_id))

    total_frames = 0
    for offset, c in enumerate(doc.chunks):
        webp_data = webp.WebPData.from_buffer(c.raw)
        dec = webp.WebPAnimDecoder.new(webp_data)

        # Drop the last channel of every frame.
        # NOTE(review): presumably the decoder yields RGBA and this strips
        # the alpha channel - confirm against the webp package.
        image_list = [
            np.array(arr)[:, :, :-1].copy() for arr, _timestamp_ms in dec.frames()
        ]

        c.offset = offset
        c.blob.CopyFrom(array2blob(np.array(image_list)))
        total_frames += len(image_list)
        del dec

    self.logger.info("this doc has frames: %d! for doc %s" % (total_frames, str(doc.doc_id)))

    if total_frames == 0:
        # Nothing to normalise by; dividing would raise ZeroDivisionError.
        if len(doc.chunks) > 0:
            self.logger.warning(
                "no frame decoded for doc %s, chunk weights are left unchanged" % str(doc.doc_id))
        return

    for c in doc.chunks:
        c.weight /= total_frames
def build_msgs2(self, seed=0):
    """Build 20 reproducible index-request messages filled with random data.

    Both ``random`` and ``numpy.random`` are seeded with ``seed``. Every
    message has client id ``'abc'`` and 10-20 docs; every doc has 10-20
    chunks, each with a random ``[10, 20, 30]`` embedding and blob.

    :param seed: seed applied to both random generators.
    :return: list of the generated messages.
    """
    random.seed(seed)
    np.random.seed(seed)

    wanted = 20
    messages = []
    while len(messages) < wanted:
        message = gnes_pb2.Message()
        message.envelope.client_id = 'abc'

        n_docs = random.randint(10, 20)
        for _ in range(n_docs):
            doc = message.request.index.docs.add()
            # Original note: each doc is about 1MB to 10MB.
            n_chunks = random.randint(10, 20)
            for _ in range(n_chunks):
                chunk = doc.chunks.add()
                # Embedding is drawn before blob; the order fixes the data
                # produced for a given seed.
                chunk.embedding.CopyFrom(array2blob(np.random.random([10, 20, 30])))
                chunk.blob.CopyFrom(array2blob(np.random.random([10, 20, 30])))

        messages.append(message)
    return messages
def test_videoshot_indexer(self):
    """Add one VIDEO document with a single frame chunk under key 0."""
    video_doc = gnes_pb2.Document()
    video_doc.doc_type = gnes_pb2.Document.VIDEO

    only_chunk = video_doc.chunks.add()
    only_chunk.blob.CopyFrom(array2blob(self.video_frames))

    # No assertion: the test passes as long as adding does not raise.
    self.indexer.add([0], [video_doc])
def apply(self, doc: 'gnes_pb2.Document') -> None:
    """Reduce every chunk to a subset of its frames.

    For each chunk the frame array is read from ``chunk.blob``:

    * no frames: a warning is logged and the blob is left untouched;
    * ``sframes == 1``: only the middle frame is kept;
    * ``sframes > 0`` and more frames than ``sframes``: frames are taken at
      a regular step when there are at least ``2 * sframes`` of them,
      otherwise ``sframes`` distinct frames are drawn at random and kept in
      their original order;
    * any other case: the blob is left untouched.

    :param doc: document whose chunks are modified in place.
    """
    super().apply(doc)

    if len(doc.chunks) == 0:
        self.logger.error('bad document: "doc.chunks" is empty!')
        return

    for chunk in doc.chunks:
        images = blob2array(chunk.blob)
        n_images = len(images)

        if n_images == 0:
            self.logger.warning("this chunk has no frame!")
            continue

        if self.sframes == 1:
            # List index keeps the leading frame axis (length 1).
            selected = images[[int(n_images / 2)]]
        elif self.sframes > 0 and n_images > self.sframes:
            if n_images >= 2 * self.sframes:
                selected = images[::math.ceil(n_images / self.sframes)]
            else:
                keep = np.sort(
                    np.random.choice(n_images, self.sframes, replace=False))
                selected = images[keep]
        else:
            # Nothing to drop for this chunk.
            continue

        chunk.blob.CopyFrom(array2blob(selected))
def test_dump_load(self):
    """Dump the indexer, load it back and add a VIDEO document to the copy."""
    source_doc = gnes_pb2.Document()
    source_doc.doc_type = gnes_pb2.Document.VIDEO
    source_doc.chunks.add().blob.CopyFrom(array2blob(self.video_frames))
    doc_copy = copy.deepcopy(source_doc)

    self.indexer.dump(self.dump_path)
    restored = BaseDocIndexer.load(self.dump_path)

    # No assertion: the test passes if the reloaded indexer accepts the doc.
    restored.add([0], [doc_copy])
def apply(self, doc: 'gnes_pb2.Document') -> None:
    """Split a video document into shot chunks with sub-sampled frames.

    Frames come from ``doc.raw_bytes`` (via ``video.capture_frames``) or from
    the ``raw_video`` array, in which case only the first ``vframes`` frames
    are kept when ``vframes > 0``. ``detect_shots`` groups the frames into
    shots; each shot becomes one chunk whose weight is its share of all
    frames. A shot longer than ``sframes`` (when ``sframes > 0``) is reduced
    either at a regular step or to ``sframes`` randomly chosen frames kept in
    order.

    When ``drop_raw_data`` is set, the ``raw_data`` field is cleared at the
    end regardless of whether any chunk was produced.

    :param doc: document to chunk; modified in place.
    """
    super().apply(doc)

    video_frames = []
    raw_field = doc.WhichOneof('raw_data')

    if not raw_field:
        self.logger.error('bad document: "raw_data" is empty!')
    else:
        if doc.raw_bytes:
            video_frames = video.capture_frames(input_data=doc.raw_bytes,
                                                scale=self.frame_size,
                                                fps=self.frame_rate,
                                                vframes=self.vframes)
        elif type(getattr(doc, raw_field)) == gnes_pb2.NdArray:
            video_frames = blob2array(doc.raw_video)
            if self.vframes > 0:
                video_frames = video_frames[0:self.vframes, :].copy()

        num_frames = len(video_frames)
        if num_frames == 0:
            self.logger.error(
                'bad document: "raw_bytes" or "raw_video" is empty!')
        else:
            for position, shot in enumerate(self.detect_shots(video_frames)):
                new_chunk = doc.chunks.add()
                new_chunk.doc_id = doc.doc_id
                new_chunk.offset = position

                shot_len = len(shot)
                new_chunk.weight = shot_len / num_frames

                if self.sframes > 0 and shot_len > self.sframes:
                    if shot_len >= 2 * self.sframes:
                        shot = shot[::math.ceil(shot_len / self.sframes)]
                    else:
                        chosen = np.sort(
                            np.random.choice(shot_len, self.sframes, replace=False))
                        shot = [shot[i] for i in chosen]

                new_chunk.blob.CopyFrom(array2blob(np.array(shot)))

    if self.drop_raw_data:
        self.logger.info("document raw data will be cleaned!")
        doc.ClearField('raw_data')
def test_webp_encoder(self):
    """Encoding via ``raw_video`` and via a chunk blob must give equal bytes.

    The same frames are fed to the encoder once as the document's
    ``raw_video`` and once as the blob of a single chunk; the resulting
    ``raw_bytes`` and ``chunks[0].raw`` are expected to match.
    """
    frames_blob = array2blob(self.video_frames)

    # Frames supplied at document level.
    doc_level = gnes_pb2.Document()
    doc_level.doc_type = gnes_pb2.Document.VIDEO
    doc_level.raw_video.CopyFrom(frames_blob)
    self.webp_encoder.apply(doc_level)
    doc1 = copy.deepcopy(doc_level)

    # Frames supplied at chunk level.
    chunk_level = gnes_pb2.Document()
    chunk_level.doc_type = gnes_pb2.Document.VIDEO
    chunk_level.chunks.add().blob.CopyFrom(frames_blob)
    self.webp_encoder.apply(chunk_level)
    doc2 = copy.deepcopy(chunk_level)

    self.assertEqual(doc1.raw_bytes, doc2.chunks[0].raw)
def test_empty_service(self):
    """Send one IMAGE doc through an EncoderService and check the reply.

    The reply is expected to carry exactly one index doc and a SUCCESS
    index status.
    """
    service_args = set_encoder_parser().parse_args([
        '--yaml_path',
        '!TestEncoder {gnes_config: {name: EncoderService, is_trained: true}}',
    ])
    # The client ports mirror the service's: its input is the service's output.
    client_args = _set_client_parser().parse_args([
        '--port_in', str(service_args.port_out),
        '--port_out', str(service_args.port_in),
    ])

    with ServiceManager(EncoderService, service_args), ZmqClient(client_args) as client:
        request = gnes_pb2.Message()
        image_doc = request.request.index.docs.add()
        image_doc.doc_type = gnes_pb2.Document.IMAGE
        image_doc.chunks.add().blob.CopyFrom(array2blob(self.test_numeric))

        client.send_message(request)
        reply = client.recv_message()

        self.assertEqual(len(reply.request.index.docs), 1)
        self.assertEqual(reply.response.index.status, gnes_pb2.Response.SUCCESS)
def test_dump_load(self):
    """A reloaded encoder must produce the same bytes as the original one.

    The original encoder processes the frames given as ``raw_video``; the
    encoder loaded from the dump processes the same frames given as a chunk
    blob. ``raw_bytes`` of the first result is expected to equal
    ``chunks[0].raw`` of the second.
    """
    frames_blob = array2blob(self.video_frames)

    # Original encoder, frames supplied at document level.
    doc_level = gnes_pb2.Document()
    doc_level.doc_type = gnes_pb2.Document.VIDEO
    doc_level.raw_video.CopyFrom(frames_blob)
    self.mp4_encoder.apply(doc_level)
    doc1 = copy.deepcopy(doc_level)

    self.mp4_encoder.dump(self.dump_path)
    reloaded = BaseVideoPreprocessor.load(self.dump_path)

    # Reloaded encoder, frames supplied at chunk level.
    chunk_level = gnes_pb2.Document()
    chunk_level.doc_type = gnes_pb2.Document.VIDEO
    chunk_level.chunks.add().blob.CopyFrom(frames_blob)
    reloaded.apply(chunk_level)
    doc2 = copy.deepcopy(chunk_level)

    self.assertEqual(doc1.raw_bytes, doc2.chunks[0].raw)
def test_combine_score_fn(self):
    """Run ``query_and_score`` once with each chunk score function.

    A fresh ``NumpyIndexer`` holding two vectors is built per score function
    and queried with one chunk. The result is not inspected; the test only
    checks that the call completes.
    """
    from gnes.indexer.chunk.helper import ListKeyIndexer
    from gnes.indexer.chunk.numpy import NumpyIndexer
    from gnes.proto import array2blob
    import numpy as np

    query = gnes_pb2.Chunk()
    query.doc_id = 2
    query.weight = 0.3
    query.offset = 0
    query.embedding.CopyFrom(array2blob(np.array([3, 3, 3])))

    score_fn_classes = (
        WeightedChunkOffsetScoreFn,
        CoordChunkScoreFn,
        TFIDFChunkScoreFn,
        BM25ChunkScoreFn,
    )
    for score_fn_cls in score_fn_classes:
        indexer = NumpyIndexer(helper_indexer=ListKeyIndexer(),
                               score_fn=score_fn_cls())
        indexer.add(keys=[(0, 1), (1, 2)],
                    vectors=np.array([[1, 1, 1], [2, 2, 2]]),
                    weights=[0.5, 0.8])
        indexer.query_and_score(q_chunks=[query], top_k=2)
def apply(self, doc: 'gnes_pb2.Document') -> None:
    """Split a video document into one chunk per detected shot.

    Frames come from ``doc.raw_bytes`` (via ``video.capture_frames``) or from
    the ``raw_video`` array. For ``raw_video`` with ``frame_num > 0`` the
    frames are sub-sampled at a regular integer step so that at most
    ``frame_num`` of them remain. ``detect_shots`` groups the frames into
    shots; each shot becomes one chunk whose weight is its share of all
    frames.

    When ``drop_raw_data`` is set, the ``raw_data`` field is cleared at the
    end regardless of whether any chunk was produced.

    :param doc: document to chunk; modified in place.
    """
    super().apply(doc)
    video_frames = []

    raw_field = doc.WhichOneof('raw_data')
    if raw_field:
        raw_type = type(getattr(doc, raw_field))
        if doc.raw_bytes:
            video_frames = video.capture_frames(input_data=doc.raw_bytes,
                                                scale=self.frame_size,
                                                fps=self.frame_rate,
                                                vframes=self.frame_num)
        elif raw_type == gnes_pb2.NdArray:
            video_frames = blob2array(doc.raw_video)
            if self.frame_num > 0:
                # A slice step must be a non-zero integer: true division
                # gives a float (TypeError) and a floor could give 0
                # (ValueError). Ceiling division keeps at most frame_num
                # frames; max() covers an empty frame array.
                stepwise = max(1, -(-len(video_frames) // self.frame_num))
                video_frames = video_frames[0::stepwise, :]

        num_frames = len(video_frames)
        if num_frames > 0:
            shots = self.detect_shots(video_frames)
            for ci, frames in enumerate(shots):
                c = doc.chunks.add()
                c.doc_id = doc.doc_id
                chunk_data = np.array(frames)
                c.blob.CopyFrom(array2blob(chunk_data))
                c.offset = ci
                c.weight = len(frames) / num_frames
        else:
            self.logger.error(
                'bad document: "raw_bytes" or "raw_video" is empty!')
    else:
        self.logger.error('bad document: "raw_data" is empty!')

    if self.drop_raw_data:
        self.logger.info("document raw data will be cleaned!")
        doc.ClearField('raw_data')
def test_empty_service(self):
    """Send one embedded chunk to an IndexerService and expect SUCCESS."""
    service_args = set_indexer_parser().parse_args([
        '--yaml_path',
        '!BaseChunkIndexer {gnes_config: {name: IndexerService}}',
    ])
    # The client ports mirror the service's: its input is the service's output.
    client_args = _set_client_parser().parse_args([
        '--port_in', str(service_args.port_out),
        '--port_out', str(service_args.port_in),
    ])

    with ServiceManager(IndexerService, service_args), ZmqClient(client_args) as client:
        request = gnes_pb2.Message()
        chunk = request.request.index.docs.add().chunks.add()
        chunk.doc_id = 0
        chunk.embedding.CopyFrom(array2blob(self.test_numeric))
        chunk.offset = 0
        chunk.weight = 1.0

        client.send_message(request)
        reply = client.recv_message()

        self.assertEqual(reply.response.index.status, gnes_pb2.Response.SUCCESS)
def test_array_proto(self):
    """A random ``[5, 4]`` array must survive an array2blob/blob2array round trip."""
    original = np.random.random([5, 4])
    restored = blob2array(array2blob(original))
    assert_array_equal(original, restored)