def create_document(doc_id, text, weight, length):
    """Build a ``Document`` with a fixed 16-char id, utf-8 payload, weight and length.

    NOTE(review): this writes the underlying proto id directly via
    ``_document`` and repeats/truncates the id to exactly 16 characters —
    presumably a legacy fixed-length-id requirement; confirm against the
    Document proto before changing.
    """
    doc = Document()
    doc._document.id = (str(doc_id) * 16)[:16]
    doc.buffer = text.encode('utf8')
    doc.weight = weight
    doc.length = length
    return doc
def search_generator(path: str, buffer: bytes):
    """Yield a single query ``Document`` built from *buffer* and/or *path*.

    Either field is skipped when falsy; an empty Document is yielded if both are.
    """
    query = Document()
    if buffer:
        query.buffer = buffer
    if path:
        query.content = path
    yield query
def input_index_data(num_docs=None, batch_size=8, dataset_type='f30k'):
    """Yield (image, caption) ``Document`` pairs from the test data loader.

    For every item in a batch an image Document and a caption Document
    (id = sha1 of the image bytes) are yielded. Iteration stops once
    roughly *num_docs* items have been produced, when given.
    """
    caption_file = 'dataset_flickr30k.json' if dataset_type == 'f30k' else 'captions.txt'
    # toy-data lives next to the tests; everything else under data/
    base_folder = '.' if dataset_type == 'toy-data' else 'data'
    loader = get_data_loader(
        root=os.path.join(cur_dir, f'{base_folder}/{dataset_type}/images'),
        captions=os.path.join(cur_dir, f'{base_folder}/{dataset_type}/{caption_file}'),
        split='test',
        batch_size=batch_size,
        dataset_type=dataset_type,
    )
    for batch_idx, (images, texts) in enumerate(loader):
        for image, caption in zip(images, texts):
            hashed = hashlib.sha1(image).hexdigest()
            img_doc = Document()
            img_doc.buffer = image
            img_doc.modality = 'image'
            img_doc.mime_type = 'image/jpeg'
            cap_doc = Document(id=hashed)
            cap_doc.text = caption
            cap_doc.modality = 'text'
            cap_doc.mime_type = 'text/plain'
            cap_doc.tags['id'] = caption
            yield img_doc
            yield cap_doc
        # stop once enough whole batches have been emitted
        if num_docs and (batch_idx + 1) * batch_size >= num_docs:
            break
def create_document(doc_id, text, weight, length):
    """Build a ``Document`` carrying an id, utf-8 encoded text, weight and length."""
    doc = Document()
    doc.id = doc_id
    doc.buffer = text.encode('utf8')
    doc.weight = weight
    doc.length = length
    return doc
def create(self):
    """Return a ground-truth ``Document`` whose payload matches *field_type*.

    NOTE(review): ``field_type`` is a free variable captured from the
    enclosing fixture scope — confirm it is bound before this is called.
    """
    doc = Document()
    if field_type == 'blob':
        doc.blob = np.array([1, 1, 1, 1])
    elif field_type == 'buffer':
        doc.buffer = b'\x01\x02\x03\x04'
    elif field_type == 'text':
        doc.text = 'aaaa'
    return doc
def create(self):
    """Return a query ``Document`` whose payload matches *field_type*.

    NOTE(review): ``field_type`` is a free variable captured from the
    enclosing fixture scope — confirm it is bound before this is called.
    """
    result = Document()
    if result is not None:  # always true; kept trivial — populate by field kind
        if field_type == 'blob':
            result.blob = np.array([1, 1, 1])
        elif field_type == 'buffer':
            result.buffer = b'\x01\x02\x03'
        elif field_type == 'text':
            result.text = 'aaa'
    return result
def request(field_type):
    """Build an index ``Request`` of 10 doc/groundtruth pairs.

    Each pair gets one chunk at granularity 1; the doc chunk carries a
    slightly shorter payload than the groundtruth chunk so evaluators
    have something to compare (text / buffer / blob per *field_type*).
    """
    num_docs = 10
    req = jina_pb2.RequestProto()
    for _ in range(num_docs):
        doc = req.index.docs.add()
        gt = req.index.groundtruths.add()
        doc_chunk = Document(doc.chunks.add())
        gt_chunk = Document(gt.chunks.add())
        doc_chunk.granularity = 1
        gt_chunk.granularity = 1
        if field_type == 'text':
            doc_chunk.text = 'aaa'
            gt_chunk.text = 'aaaa'
        elif field_type == 'buffer':
            doc_chunk.buffer = b'\x01\x02\x03'
            gt_chunk.buffer = b'\x01\x02\x03\x04'
        elif field_type == 'blob':
            doc_chunk.blob = np.array([1, 1, 1])
            gt_chunk.blob = np.array([1, 1, 1, 1])
    return Request(req).as_typed_request('index')
def create_document(doc_id, text, weight):
    """Build a ``Document`` with a stringified id, utf-8 encoded text and a weight."""
    doc = Document()
    doc.id = str(doc_id)
    doc.buffer = text.encode('utf8')
    doc.weight = weight
    return doc