def test_convert_uri_to_data_uri(uri, mimetype):
    """Converting a URI to a data URI embeds the MIME type in the URI and preserves it on the doc."""
    document = Document(uri=uri, mime_type=mimetype)
    document.convert_uri_to_datauri()
    expected_prefix = f'data:{mimetype}'
    assert document.uri.startswith(expected_prefix)
    assert document.mime_type == mimetype
def add_chunk(doc):
    """Create a chunk one granularity level below ``doc``, attach it, and return it."""
    with Document() as fresh_chunk:
        # a chunk lives one granularity level deeper than its parent
        fresh_chunk.granularity = doc.granularity + 1
        fresh_chunk.adjacency = doc.adjacency
        doc.chunks.append(fresh_chunk)
        return fresh_chunk
def index(self, docs, **kwargs):
    """Append five empty matches to every document reached via the configured traversal paths."""
    for docs_on_path in docs.traverse(self._traversal_paths):
        for document in docs_on_path:
            for _ in range(5):
                document.matches.append(Document())
def document_generator(num_doc):
    """Yield ``num_doc`` pairs of (document, groundtruth document), both with content 'hello'."""
    for _ in range(num_doc):
        yield Document(content='hello'), Document(content='hello')
def test_crud_in_readme(mocker): docs = [ Document(id='🐲', embedding=np.array([0, 0]), tags={ 'guardian': 'Azure Dragon', 'position': 'East' }), Document(id='🐦', embedding=np.array([1, 0]), tags={ 'guardian': 'Vermilion Bird', 'position': 'South' }), Document(id='🐢', embedding=np.array([0, 1]), tags={ 'guardian': 'Black Tortoise', 'position': 'North' }), Document(id='🐯', embedding=np.array([1, 1]), tags={ 'guardian': 'White Tiger', 'position': 'West' }) ] # create mock = mocker.Mock() with Flow().add(uses='_index') as f: f.index(docs, on_done=mock) mock.assert_called_once() # read def validate(req): assert len(req.docs[0].matches) == 3 for match in req.docs[0].matches: assert match.id != '🐯' assert 'position' in match.tags assert 'guardian' in match.tags assert match.score.ref_id == req.docs[0].id mock = mocker.Mock() with f: f.search(docs[0], top_k=3, on_done=mock) validate_callback(mock, validate) # update mock = mocker.Mock() d = docs[0] d.embedding = np.array([1, 1]) with f: f.update(d, on_done=mock) mock.assert_called_once() # search again def validate(req): assert len(req.docs[0].matches) == 1 req.docs[0].matches[0].id = req.docs[0].id # embeddings are removed in the CompoundIndexer via ExcludeQL np.testing.assert_array_equal(req.docs[0].matches[0].embedding, np.array(None)) mock = mocker.Mock() with f: f.search(docs[0], top_k=1, on_done=mock) validate_callback(mock, validate) # delete mock = mocker.Mock() with f: f.delete(['🐦', '🐲'], on_done=mock) mock.assert_called_once() # search again def validate(req): assert len(req.docs[0].matches) == 2 mock = mocker.Mock() with f: f.search(docs[0], top_k=4, on_done=mock) validate_callback(mock, validate)
async def async_inputs():
    """Asynchronously produce twenty request documents for client 0."""
    remaining = 20
    while remaining:
        yield Document(text='client0-Request')
        remaining -= 1
def test_crud_in_readme(mocker):
    """CRUD round-trip against a `_index` Flow: index, search, update, search,
    delete, search — validating the matches at every read step.

    :param mocker: pytest-mock fixture used to build response callbacks
    """
    # four docs with 2-d embeddings at the unit-square corners; tags carry metadata
    docs = [
        Document(id='🐲', embedding=np.array([0, 0]), tags={
            'guardian': 'Azure Dragon', 'position': 'East'
        }),
        Document(id='🐦', embedding=np.array([1, 0]), tags={
            'guardian': 'Vermilion Bird', 'position': 'South'
        }),
        Document(id='🐢', embedding=np.array([0, 1]), tags={
            'guardian': 'Black Tortoise', 'position': 'North'
        }),
        Document(id='🐯', embedding=np.array([1, 1]), tags={
            'guardian': 'White Tiger', 'position': 'West'
        })
    ]
    # create
    m = mocker.Mock()
    with Flow().add(uses='_index') as f:
        f.index(docs, on_done=m)
    m.assert_called_once()

    # read: three nearest neighbours, never the farthest corner '🐯'
    def validate(req):
        assert len(req.docs[0].matches) == 3
        for m in req.docs[0].matches:
            assert m.id != '🐯'
            assert 'position' in m.tags
            assert 'guardian' in m.tags

    # BUGFIX: the original passed `wrap=validate`, which merely sets a Mock
    # attribute named `wrap`; the correct keyword is `wraps`, without which
    # `validate` is never executed and the assertions silently never run.
    m = mocker.Mock(wraps=validate)
    with f:
        f.search(docs[0], top_k=3, on_done=m)
    m.assert_called_once()

    # update: move '🐲' onto the same corner as '🐯'
    m = mocker.Mock()
    d = docs[0]
    d.embedding = np.array([1, 1])
    with f:
        f.update(d, on_done=m)
    m.assert_called_once()

    # search again after the update
    def validate(req):
        assert len(req.docs[0].matches) == 1
        # NOTE(review): this assigns instead of asserting the match id — kept
        # as-is to avoid changing test semantics; confirm whether an
        # `assert ... ==` was intended.
        req.docs[0].matches[0].id = req.docs[0].id
        np.testing.assert_array_equal(req.docs[0].matches[0].embedding,
                                      docs[0].embedding)

    m = mocker.Mock(wraps=validate)  # BUGFIX: was `wrap=`
    with f:
        f.search(docs[0], top_k=1, on_done=m)
    m.assert_called_once()

    # delete two of the four documents
    m = mocker.Mock()
    with f:
        f.delete(['🐦', '🐲'], on_done=m)
    m.assert_called_once()

    # search again: only two docs remain, even with top_k=4
    def validate(req):
        assert len(req.docs[0].matches) == 2

    m = mocker.Mock(wraps=validate)  # BUGFIX: was `wrap=`
    with f:
        f.search(docs[0], top_k=4, on_done=m)
    m.assert_called_once()
def craft(self, docs: DocumentArray, **kwargs):
    """Attach a deep copy of each document to itself as a chunk, tagged with the parent's id."""
    for parent in docs:
        chunk_copy = Document(parent, copy=True, tags={'root_doc_id': parent.id})
        parent.chunks.append(chunk_copy)
    return docs
def craft(self, docs: DocumentArray, **kwargs):
    """Load each document's image URI into its blob and attach the blob as an image chunk."""
    for parent in docs:
        parent.convert_image_uri_to_blob()
        image_chunk = Document(blob=parent.blob, mime_type='image/*')
        parent.chunks.append(image_chunk)
    return docs
def test_image_normalize(shape, channel_axis):
    """Normalizing an image blob keeps rank and shape but converts the dtype to float32."""
    random_image = np.random.randint(0, 255, shape, dtype=np.uint8)
    document = Document(content=random_image)
    document.set_image_blob_normalization(channel_axis=channel_axis)
    normalized = document.blob
    assert normalized.ndim == 3
    assert normalized.shape == shape
    assert normalized.dtype == np.float32
def test_simple_routing():
    """A single-executor Flow returns the text written by SimplExecutor."""
    flow = Flow().add(uses=SimplExecutor)
    with flow:
        responses = flow.post(on='/index', inputs=[Document()], return_results=True)
        assert responses[0].docs[0].text == 'Hello World!'
def test_datauri_to_blob():
    """convert_uri_to_datauri rewrites the URI only: the blob stays unset, MIME is detected."""
    png_doc = Document(uri=os.path.join(cur_dir, 'test.png'))
    png_doc.convert_uri_to_datauri()
    assert not png_doc.blob
    assert png_doc.mime_type == 'image/png'
def test_uri_to_blob():
    """Loading a PNG URI produces an ndarray blob with the file's h/w/c shape and MIME type."""
    png_doc = Document(uri=os.path.join(cur_dir, 'test.png'))
    png_doc.convert_uri_to_image_blob()
    assert isinstance(png_doc.blob, np.ndarray)
    assert png_doc.mime_type == 'image/png'
    # test.png is 152x85 with 3 channels, loaded as (height, width, channels)
    assert png_doc.blob.shape == (85, 152, 3)
def test_glb_converters():
    """A .glb mesh is sampled into a point cloud with the requested number of 3-d points."""
    num_points = 2000
    mesh_doc = Document(uri=os.path.join(cur_dir, 'test.glb'))
    mesh_doc.convert_uri_to_point_cloud_blob(num_points)
    assert mesh_doc.blob.shape == (num_points, 3)
def random_docs(start, end, embed_dim=10):
    """Yield documents with ids in ``[start, end)``, each carrying random
    lowercase text, a random embedding of size ``embed_dim`` and five chunks.

    :param start: first (inclusive) numeric id
    :param end: last (exclusive) numeric id
    :param embed_dim: dimensionality of the random embeddings
    """
    for j in range(start, end):
        d = Document()
        d.id = f'{j:0>16}'
        d.tags['id'] = j
        for i in range(5):
            c = Document()
            # NOTE(review): chunks share the parent's id here — verify whether
            # downstream tests rely on that or distinct chunk ids were intended.
            c.id = f'{j:0>16}'
            # BUGFIX: the original assigned text/embedding to the parent ``d``
            # inside this loop (values immediately overwritten after the loop),
            # leaving every chunk empty; the chunk ``c`` is the intended target.
            c.text = ''.join(
                random.choice(string.ascii_lowercase) for _ in range(10)).encode('utf8')
            c.embedding = np.random.random([embed_dim])
            d.chunks.append(c)
        d.text = ''.join(
            random.choice(string.ascii_lowercase) for _ in range(10)).encode('utf8')
        d.embedding = np.random.random([embed_dim])
        yield d
def segment(self, docs: DocumentArray, **kwargs):
    """Split each document's text on newlines into plain-text chunks carrying the root doc id."""
    for parent in docs:
        root_id = parent.tags['root_doc_id']
        new_chunks = [
            Document(text=line, mime_type='text/plain', tags={'root_doc_id': root_id})
            for line in parent.text.split('\n')
        ]
        parent.chunks += new_chunks
    return docs
def test_single_document_flow_index():
    """Indexing accepts both a bare Document and a callable that produces one."""
    single = Document()
    with Flow().add() as f:
        f.index(single)
        f.index(lambda: single)
def test_crud(tmpdir, rest):
    """CRUD cycle against a Flow loaded from flow.yml, exercised over either
    the HTTP/REST gateway or the gRPC client depending on ``rest``.

    Each phase (index, search, delete, search, update, search) opens a fresh
    Flow so persistence across restarts is exercised too.

    :param tmpdir: pytest fixture, used as the indexer workspace
    :param rest: if truthy, use the REST gateway via rest_post; else gRPC Client
    """
    # the Flow config reads these env vars to pick protocol and workspace
    os.environ['RESTFUL'] = 'http' if rest else 'grpc'
    os.environ['WORKSPACE'] = str(tmpdir)

    # index 10 chunk-less documents
    with Flow.load_config('flow.yml') as f:
        c = Client(port=f.port_expose)
        original_docs = list(random_docs(10, chunks_per_doc=0))
        if rest:
            rest_post(f, 'index', original_docs)
        else:
            c.post(
                on='/index',
                inputs=original_docs,
            )

    # search: all 10 indexed docs should come back as matches
    with Flow.load_config('flow.yml') as f:
        c = Client(port=f.port_expose)
        inputs = list(random_docs(1))
        if rest:
            results = rest_post(f, 'search', inputs)
            # REST responses are plain dicts; matches live under data[0]
            matches = results['data'][0]['matches']
            for doc in results['data']:
                assert Document.from_dict(doc).text == 'hello world'
        else:
            results = c.post(on='/search', inputs=inputs,
                             parameters=PARAMS, return_results=True)
            matches = results[0].docs[0].matches
            for doc in results[0].docs:
                assert doc.text == 'hello world'
        assert len(matches) == 10

    # delete the first 5 documents
    with Flow.load_config('flow.yml') as f:
        c = Client(port=f.port_expose)
        inputs = list(random_docs(5, chunks_per_doc=0))
        if rest:
            rest_post(f, 'delete', inputs)
        else:
            c.post(on='/delete', inputs=inputs)

    # search again: only the 5 remaining docs can match
    with Flow.load_config('flow.yml') as f:
        c = Client(port=f.port_expose)
        inputs = list(random_docs(1))
        if rest:
            results = rest_post(f, 'search', inputs)
            matches = results['data'][0]['matches']
        else:
            results = c.post(on='/search', inputs=inputs,
                             parameters=PARAMS, return_results=True)
            matches = results[0].docs[0].matches
        assert len(matches) == 5

    # update the surviving docs (ids 5..9) with new text and 5 chunks each
    updated_docs = list(
        random_docs(5, chunks_per_doc=5, start_id=5, text='hello again'))
    with Flow.load_config('flow.yml') as f:
        c = Client(port=f.port_expose)
        if rest:
            rest_post(f, 'update', updated_docs)
        else:
            c.post(on='/update', inputs=updated_docs)

    # final search: matches must mirror the updated docs exactly
    with Flow.load_config('flow.yml') as f:
        c = Client(port=f.port_expose)
        inputs = list(random_docs(1))
        if rest:
            results = rest_post(f, 'search', inputs)
            # sort by id so matches line up with updated_docs for the zip below
            matches = sorted(results['data'][0]['matches'],
                             key=lambda match: match['id'])
        else:
            results = c.post(on='/search', inputs=inputs,
                             parameters=PARAMS, return_results=True)
            matches = sorted(results[0].docs[0].matches,
                             key=lambda match: match.id)
        assert len(matches) == 5
        for match, updated_doc in zip(matches, updated_docs):
            # REST matches arrive as dicts; normalize to Document first
            if isinstance(match, dict):
                match = Document.from_dict(match)
            assert updated_doc.id == match.id
            assert updated_doc.text == match.text
            np.testing.assert_array_equal(updated_doc.embedding, match.embedding)
            assert len(match.chunks) == 5
            assert len(match.chunks) == len(updated_doc.chunks)
            for match_chunk, updated_doc_chunk in zip(match.chunks,
                                                      updated_doc.chunks):
                assert match_chunk.text == updated_doc_chunk.text
                np.testing.assert_array_equal(match_chunk.embedding,
                                              updated_doc_chunk.embedding)
def add(self, **kwargs):
    """Return a DocumentArray of 100 documents, each marked by this executor."""
    marked = [Document(text='executor was here') for _ in range(100)]
    return DocumentArray(marked)
def test_traverse_flat_root_itself():
    """Traversing with the root path 'r' must return the very same DocumentArray object."""
    docs = DocumentArray([Document() for _ in range(100)])
    traversed = docs.traverse_flat('r')
    # identity, not just equality: 'r' must not produce a copy
    assert traversed is docs
def get_doc(i):
    """Build a document whose text names ``i`` and whose embedding repeats ``i`` five times."""
    embedding = np.array([i] * 5)
    text = f'doc {i}'
    return Document(text=text, embedding=embedding)
def random_docs_only_tags(nr_docs, start=0):
    """Yield ``nr_docs`` documents carrying only tags: a numeric id and a string payload."""
    for idx in range(start, start + nr_docs):
        tagged = Document()
        tagged.tags['id'] = idx
        tagged.tags['something'] = f'abcdef {idx}'
        yield tagged
def doc_to_index():
    """Return a single document with the text 'test'."""
    indexed = Document()
    indexed.text = 'test'
    return indexed
def random_docs_with_shapes(nr_docs, emb_shape, start=0):
    """Yield documents with sequential integer ids and random embeddings of ``emb_shape``."""
    for doc_id in range(start, start + nr_docs):
        with Document() as built:
            built.id = doc_id
            built.embedding = np.random.random(emb_shape)
            yield built
def create(self, idx, text):
    """Build a document with id ``idx`` (mirrored into the tags) and the given text."""
    with Document(id=idx) as created:
        created.tags['id'] = idx
        created.text = text
        return created
def docs():
    """Return a one-element list holding a document whose tags are preset to INPUT_TAGS."""
    tagged = Document()
    tagged.tags = INPUT_TAGS
    return [tagged]
def add_match(doc):
    """Create a match one adjacency level beyond ``doc``, attach it, and return it."""
    with Document() as fresh_match:
        # matches stay at the parent's granularity but one adjacency level further
        fresh_match.granularity = doc.granularity
        fresh_match.adjacency = doc.adjacency + 1
        doc.matches.add(fresh_match)
        return fresh_match
def test_copy_tags(docs):
    """Constructing a Document from another document's tags yields equal tags."""
    for source in docs:
        cloned = Document(tags=source.tags)
        assert cloned.tags == source.tags
def doc_with_multimodal_chunks_wrong(embeddings):
    """Build a MultimodalDocument with two 'visual' chunks and one 'textual' chunk.

    Deliberately invalid fixture: the chunk modalities are not unique.
    """
    doc = MultimodalDocument()
    modalities = ('visual', 'visual', 'textual')
    built_chunks = []
    for modality, embedding in zip(modalities, embeddings):
        chunk = Document()
        chunk.modality = modality
        chunk.embedding = embedding
        chunk.update_id()
        built_chunks.append(chunk)
    doc.update_id()
    for chunk in built_chunks:
        doc.chunks.append(chunk)
    return doc
def test_convert_content_to_uri():
    """ndarray content has no URI representation, so the conversion must raise."""
    blob_doc = Document(content=np.random.random([10, 10]))
    with pytest.raises(NotImplementedError):
        blob_doc.convert_content_to_uri()