def test_new_msg(self):
    """Smoke-test basic Message construction: status flag, extending the
    repeated ``docs`` field, clearing it and re-filling it."""
    msg = gnes_pb2.Message()
    msg.response.index.status = gnes_pb2.Response.SUCCESS
    print(msg)
    # fill the train request with two empty documents
    msg.request.train.docs.extend([gnes_pb2.Document() for _ in range(2)])
    print(msg)
    # clearing the repeated field and re-extending with three docs must work too
    msg.request.train.ClearField('docs')
    msg.request.train.docs.extend([gnes_pb2.Document() for _ in range(3)])
    print(msg)
def img_process_for_test(dirname):
    """Load test images from ``imgs/test.zip`` and run them through two
    pipeline preprocessors (unary and sliding-window), each followed by a
    resize step.

    :param dirname: directory containing ``imgs/test.zip``
    :return: one list per preprocessor, each holding the chunk blobs of
        every processed image converted via ``blob2array``
    """
    # fix: the ZipFile handle was never closed — use a context manager so
    # the archive is released even if reading a member raises
    with zipfile.ZipFile(os.path.join(dirname, 'imgs/test.zip')) as zipfile_:
        all_bytes = [zipfile_.open(v).read() for v in zipfile_.namelist()]
    test_img = []
    for raw_bytes in all_bytes:
        d = gnes_pb2.Document()
        d.raw_bytes = raw_bytes
        test_img.append(d)

    test_img_all_preprocessor = []
    pipline_prep1 = PipelinePreprocessor()
    pipline_prep1.components = lambda: [
        UnaryPreprocessor(doc_type=gnes_pb2.Document.IMAGE),
        ResizeChunkPreprocessor()
    ]
    pipline_prep2 = PipelinePreprocessor()
    pipline_prep2.components = lambda: [
        VanillaSlidingPreprocessor(),
        ResizeChunkPreprocessor()
    ]
    for preprocessor in [pipline_prep1, pipline_prep2]:
        # deep-copy so each preprocessor operates on pristine documents
        test_img_copy = copy.deepcopy(test_img)
        for img in test_img_copy:
            preprocessor.apply(img)
        test_img_all_preprocessor.append([
            blob2array(chunk.blob)
            for img in test_img_copy
            for chunk in img.chunks
        ])
    return test_img_all_preprocessor
def query(self, keys: List[int], *args, **kwargs) -> List['gnes_pb2.Document']:
    """
    :param keys: list of doc id
    :return: list of documents whose chunks field contain all the GIFs of this doc(one GIF per chunk)
    """
    res = []
    for k in keys:
        doc = gnes_pb2.Document()
        # each doc id maps to its own sub-directory under data_path
        target_dirs = os.path.join(self.data_path, str(k))
        if not os.path.exists(target_dirs):
            # doc not on disk: optionally append a placeholder so the
            # result list stays aligned with the input keys
            if self.keep_na_doc:
                res.append(self._NOT_FOUND)
        else:
            # the '.meta' file in the doc directory holds its meta_info bytes
            with open(os.path.join(target_dirs, '.meta'), 'rb') as f:
                doc.meta_info = f.read()
            for raw_file in os.listdir(target_dirs):
                # NOTE(review): isdir() is called on the bare filename, not
                # os.path.join(target_dirs, raw_file) — presumably the joined
                # path was intended; verify before relying on this filter
                if not os.path.isdir(raw_file):
                    c = doc.chunks.add()
                    c.doc_id = k
                    # chunk content loading is intentionally left unimplemented
                    self.logger.error(
                        "the query method has not been implemented!")
                    # with open(os.path.join(target_dirs, raw_file),
                    #           'rb') as raw:
                    #     c.raw = raw.read()
            res.append(doc)
    return res
def test_preprocessor_service_echo(self):
    """Round-trip index and train requests through a PreprocessorService."""
    args = set_preprocessor_service_parser().parse_args([])
    # wire the client's in/out ports to the service's out/in ports
    c_args = _set_client_parser().parse_args([
        '--port_in', str(args.port_out),
        '--port_out', str(args.port_in)
    ])
    with PreprocessorService(args), ZmqClient(c_args) as client:
        msg = gnes_pb2.Message()
        msg.request.index.docs.extend([gnes_pb2.Document() for _ in range(5)])
        client.send_message(msg)
        print(client.recv_message())
        # the same message, extended with a train request, must echo as well
        msg.request.train.docs.extend([gnes_pb2.Document() for _ in range(5)])
        client.send_message(msg)
        print(client.recv_message())
def test_webp_encoder(self):
    """Encoding via raw_video and via a single chunk blob must agree."""
    raw_data = array2blob(self.video_frames)

    # path 1: frames provided through the document's raw_video field
    video_doc = gnes_pb2.Document()
    video_doc.doc_type = gnes_pb2.Document.VIDEO
    video_doc.raw_video.CopyFrom(raw_data)
    self.webp_encoder.apply(video_doc)
    doc1 = copy.deepcopy(video_doc)

    # path 2: the same frames inside one chunk's blob
    chunk_doc = gnes_pb2.Document()
    chunk_doc.doc_type = gnes_pb2.Document.VIDEO
    chunk_doc.chunks.add().blob.CopyFrom(raw_data)
    self.webp_encoder.apply(chunk_doc)
    doc2 = copy.deepcopy(chunk_doc)

    self.assertEqual(doc1.raw_bytes, doc2.chunks[0].raw)
def test_videoshot_indexer(self):
    """The indexer should accept a VIDEO document with one blob chunk."""
    video_doc = gnes_pb2.Document()
    video_doc.doc_type = gnes_pb2.Document.VIDEO
    new_chunk = video_doc.chunks.add()
    new_chunk.blob.CopyFrom(array2blob(self.video_frames))
    self.indexer.add([0], [video_doc])
def init_db(self):
    """Create a DirectoryIndexer and populate it with one preprocessed
    video document built from the first raw video byte string."""
    self.db = DirectoryIndexer(self.data_path)
    self.d = gnes_pb2.Document()
    self.d.doc_id = 0
    self.d.raw_bytes = self.video_bytes[0]
    # run the yaml-configured preprocessing pipeline over the document
    prep = BasePreprocessor.load_yaml(self.pipeline_yml_path)
    prep.apply(self.d)
    keys = list(range(len(self.video_bytes)))
    self.db.add(keys, [self.d])
def test_dump_load(self):
    """An encoder reloaded from disk must produce the same output as the
    original instance."""
    raw_data = array2blob(self.video_frames)

    # encode with the live instance, frames given via raw_video
    first_doc = gnes_pb2.Document()
    first_doc.doc_type = gnes_pb2.Document.VIDEO
    first_doc.raw_video.CopyFrom(raw_data)
    self.mp4_encoder.apply(first_doc)
    doc1 = copy.deepcopy(first_doc)

    # round-trip the encoder itself through dump/load
    self.mp4_encoder.dump(self.dump_path)
    encoder = BaseVideoPreprocessor.load(self.dump_path)

    # encode the same frames with the restored instance, via a chunk blob
    second_doc = gnes_pb2.Document()
    second_doc.doc_type = gnes_pb2.Document.VIDEO
    second_doc.chunks.add().blob.CopyFrom(raw_data)
    encoder.apply(second_doc)
    doc2 = copy.deepcopy(second_doc)

    self.assertEqual(doc1.raw_bytes, doc2.chunks[0].raw)
def test_gif_pipelinepreproces(self):
    """FFmpegVideoSegmentor and the yaml pipeline should yield the same
    number of chunks for the same input video."""
    original = gnes_pb2.Document()
    original.raw_bytes = self.video_bytes[0]
    duplicate = copy.deepcopy(original)

    segmentor = FFmpegVideoSegmentor.load_yaml(self.ffmpeg_yaml_path)
    segmentor.apply(original)
    pipeline = BasePreprocessor.load_yaml(self.pipeline_path)
    pipeline.apply(duplicate)

    self.assertEqual(len(original.chunks), len(duplicate.chunks))
def setUp(self) -> None:
    """Build a document with three chunks holding 2x3, 3x3 and 4x3 blobs
    made of alternating [1,2,3] / [2,3,4] rows."""
    self.doc = gnes_pb2.Document()
    base_rows = [[1, 2, 3], [2, 3, 4]]
    for n_rows in (2, 3, 4):
        chunk = self.doc.chunks.add()
        rows = [base_rows[i % 2] for i in range(n_rows)]
        chunk.blob.CopyFrom(array2blob(np.array(rows)))
def test_dump_load(self):
    """An indexer restored from a dump must still accept new documents."""
    frame_blob = array2blob(self.video_frames)
    video_doc = gnes_pb2.Document()
    video_doc.doc_type = gnes_pb2.Document.VIDEO
    video_doc.chunks.add().blob.CopyFrom(frame_blob)
    doc1 = copy.deepcopy(video_doc)

    # persist the indexer, reload it, and add through the restored copy
    self.indexer.dump(self.dump_path)
    restored = BaseDocIndexer.load(self.dump_path)
    restored.add([0], [doc1])
def test_pipelinepreproces(self):
    """A P1+P2 pipeline applied twice should take doc_id from 1 to 6,
    then (after a yaml round-trip) from 6 to 21."""
    pipeline = PipelinePreprocessor()
    pipeline.components = lambda: [P1(), P2()]

    doc = gnes_pb2.Document()
    doc.doc_id = 1
    pipeline.apply(doc)
    self.assertEqual(doc.doc_id, 6)

    # persist the pipeline (yaml + binary), reload, and apply again
    pipeline.name = self.p3_name
    pipeline.dump_yaml()
    pipeline.dump()
    reloaded = BasePreprocessor.load_yaml(pipeline.yaml_full_path)
    reloaded.apply(doc)
    self.assertEqual(doc.doc_id, 21)
def img_process_for_test(dirname):
    """Load test images from ``imgs/test.zip`` and run them through a
    singleton preprocessor and a sliding-window preprocessor.

    :param dirname: directory containing ``imgs/test.zip``
    :return: one list per preprocessor, each holding the chunk blobs of
        every processed image converted via ``blob2array``
    """
    # fix: the ZipFile handle was never closed — a context manager
    # guarantees release even if reading a member raises
    with zipfile.ZipFile(os.path.join(dirname, 'imgs/test.zip'), "r") as zipfile_:
        all_bytes = [zipfile_.open(v).read() for v in zipfile_.namelist()]
    test_img = []
    for raw_bytes in all_bytes:
        d = gnes_pb2.Document()
        d.raw_bytes = raw_bytes
        test_img.append(d)

    test_img_all_preprocessor = []
    for preprocessor in [BaseSingletonPreprocessor(doc_type=gnes_pb2.Document.IMAGE),
                         VanillaSlidingPreprocessor()]:
        # fresh copies so each preprocessor sees unmodified documents
        test_img_copy = copy.deepcopy(test_img)
        for img in test_img_copy:
            preprocessor.apply(img)
        test_img_all_preprocessor.append([blob2array(chunk.blob)
                                          for img in test_img_copy
                                          for chunk in img.chunks])
    return test_img_all_preprocessor
def test_map_router(self):
    """The batch router should split 5 docs into messages of 2, 2 and 1."""
    args = set_router_parser().parse_args([
        '--yaml_path', self.batch_router_yaml,
    ])
    c_args = _set_client_parser().parse_args([
        '--port_in', str(args.port_out),
        '--port_out', str(args.port_in),
    ])
    with RouterService(args), ZmqClient(c_args) as c1:
        msg = gnes_pb2.Message()
        msg.request.index.docs.extend([gnes_pb2.Document() for _ in range(5)])
        c1.send_message(msg)
        # 5 docs batched by 2 -> three outgoing messages of sizes 2, 2, 1
        for expected_size in (2, 2, 1):
            received = c1.recv_message()
            self.assertEqual(len(received.request.index.docs), expected_size)
def line2pb_doc(line: str, doc_id: int = 0, deliminator: str = r'[.。!?!?]+') -> 'gnes_pb2.Document':
    """Convert one line of text into a TEXT Document.

    With a non-empty *deliminator* the line is split into sentence chunks
    (blank segments skipped, offsets follow the raw split positions);
    otherwise the whole line becomes a single chunk at offset 0.
    """
    doc = gnes_pb2.Document()
    doc.doc_id = doc_id
    doc.doc_type = gnes_pb2.Document.TEXT
    doc.meta_info = line.encode()
    if not deliminator:
        # no delimiter: the entire line is one chunk
        c = doc.chunks.add()
        c.doc_id = doc_id
        c.text = line
        c.offset_1d = 0
        return doc
    # enumerate over the raw split output so offset_1d counts empty
    # segments too, matching the original split positions
    for ci, piece in enumerate(re.split(deliminator, line)):
        if not piece.strip():
            continue
        c = doc.chunks.add()
        c.doc_id = doc_id
        c.text = piece
        c.offset_1d = ci
    return doc
def test_publish_router(self):
    """A PUB router must fan the same message out to every subscriber."""
    args = set_router_parser().parse_args([
        '--yaml_path', self.publish_router_yaml,
        '--socket_out', str(SocketType.PUB_BIND)
    ])
    c_args = _set_client_parser().parse_args([
        '--port_in', str(args.port_out),
        '--port_out', str(args.port_in),
        '--socket_in', str(SocketType.SUB_CONNECT)
    ])
    with RouterService(args), ZmqClient(c_args) as c1, ZmqClient(c_args) as c2:
        msg = gnes_pb2.Message()
        msg.request.index.docs.extend([gnes_pb2.Document() for _ in range(5)])
        msg.envelope.num_part.append(1)
        c1.send_message(msg)
        # both subscribers receive the message with num_part grown to [1, 2]
        for subscriber in (c1, c2):
            received = subscriber.recv_message()
            self.assertSequenceEqual(received.envelope.num_part, [1, 2])
def query(self, keys: List[int], *args, **kwargs) -> List['gnes_pb2.Document']:
    """Fetch the stored serialized content for each key and parse it back
    into a Document protobuf, preserving key order."""
    # NOTE(review): logging the queried keys at ERROR level looks like a
    # leftover debug statement — consider demoting; kept to preserve behavior
    self.logger.error(keys)
    result = []
    for k in keys:
        result.append(Parse(self._content[k], gnes_pb2.Document()))
    return result
def test_empty_doc(self):
    """The encoder must tolerate a VIDEO document with no frames/chunks."""
    empty_doc = gnes_pb2.Document()
    empty_doc.doc_type = gnes_pb2.Document.VIDEO
    self.mp4_encoder.apply(empty_doc)
def test_emtpy_document(self):
    """FrameSelectPreprocessor(sframes=-1) must handle an empty document.

    (Method name typo 'emtpy' kept: renaming would change the discovered
    test identifier.)
    """
    selector = FrameSelectPreprocessor(sframes=-1)
    selector.apply(gnes_pb2.Document())
def test_doc_combine_score_fn(self):
    """Feed two batches of chunk-level topk results through a
    Chunk2DocTopkReducer router, then score the reduced doc-level results
    with a DictIndexer using CoordDocScoreFn."""
    from gnes.indexer.doc.dict import DictIndexer

    # build 3 documents (ids 1..3), each with 3 chunks of weight 1/3
    document_list = []
    document_id_list = []
    for j in range(1, 4):
        d = gnes_pb2.Document()
        for i in range(1, 4):
            c = d.chunks.add()
            c.doc_id = j
            c.offset = i
            c.weight = 1 / 3
        document_id_list.append(j)
        document_list.append(d)

    self.chunk_router_yaml = 'Chunk2DocTopkReducer'
    args = set_router_parser().parse_args([
        '--yaml_path', self.chunk_router_yaml,
        '--socket_out', str(SocketType.PUB_BIND)
    ])
    c_args = _set_client_parser().parse_args([
        '--port_in', str(args.port_out),
        '--port_out', str(args.port_in),
        '--socket_in', str(SocketType.SUB_CONNECT)
    ])
    with RouterService(args), ZmqClient(c_args) as c1:
        # first part: three chunk results pointing at doc ids 1, 2, 1
        msg = gnes_pb2.Message()
        s = msg.response.search.topk_results.add()
        s.score.value = 0.1
        s.score.explained = '"1-c1"'
        s.chunk.doc_id = 1
        s = msg.response.search.topk_results.add()
        s.score.value = 0.2
        s.score.explained = '"1-c2"'
        s.chunk.doc_id = 2
        s = msg.response.search.topk_results.add()
        s.score.value = 0.3
        s.score.explained = '"1-c3"'
        s.chunk.doc_id = 1
        # num_part [1, 2] tells the reducer to wait for two parts
        msg.envelope.num_part.extend([1, 2])
        c1.send_message(msg)

        # second part: reuse the message, replacing the topk results
        msg.response.search.ClearField('topk_results')
        s = msg.response.search.topk_results.add()
        s.score.value = 0.2
        s.score.explained = '"2-c1"'
        s.chunk.doc_id = 1
        s = msg.response.search.topk_results.add()
        s.score.value = 0.2
        s.score.explained = '"2-c2"'
        s.chunk.doc_id = 2
        s = msg.response.search.topk_results.add()
        s.score.value = 0.3
        s.score.explained = '"2-c3"'
        s.chunk.doc_id = 3
        c1.send_message(msg)

        # the router emits one reduced, doc-level message
        r = c1.recv_message()
        doc_indexer = DictIndexer(score_fn=CoordDocScoreFn())
        doc_indexer.add(keys=document_id_list, docs=document_list)
        queried_result = doc_indexer.query_and_score(
            docs=r.response.search.topk_results, top_k=2)