def test_batching_text_one_argument(stack, crafter):
    """Crafting on the single 'text' field appends '-crafted' to every doc's text."""
    doc_array = DocumentArray([Document(text=f'text-{n}') for n in range(15)])
    contents, _ = doc_array.extract_docs('text', stack_contents=stack)
    results = crafter.craft(contents)
    for idx, result in enumerate(results):
        assert result['text'] == f'text-{idx}-crafted'
def test_match2docranker_batching_flow(ranker, mocker):
    """Ranker inside a Flow re-scores matches so they return sorted by descending score."""
    NUM_DOCS_QUERIES = 15
    NUM_MATCHES = 10
    queries = DocumentArray([])
    for q_idx in range(NUM_DOCS_QUERIES):
        query = Document(id=f'query-{q_idx}')
        for m_idx in range(NUM_MATCHES):
            query.matches.append(
                Document(id=f'match-{q_idx}-{m_idx}', tags={'dummy_score': m_idx}))
        queries.append(query)

    def validate_response(resp):
        assert len(resp.search.docs) == NUM_DOCS_QUERIES
        for q_idx, query in enumerate(resp.search.docs):
            # matches must come back ordered by descending score
            for rank, match in enumerate(query.matches, 1):
                assert match.id == f'match-{q_idx}-{NUM_MATCHES - rank}'
                assert match.score.value == NUM_MATCHES - rank

    mock = mocker.Mock()
    with Flow().add(name='ranker', uses=ranker) as f:
        f.search(inputs=queries, on_done=mock)
    mock.assert_called_once()
    validate_callback(mock, validate_response)
def test_get_content_multiple_fields_merge(stack, num_rows):
    """Extracting 'embedding' and 'text' together yields one contents batch per field."""
    fields = ['embedding', 'text']
    batch_size = 10
    embed_size = 20
    # one shared kwargs dict: every doc carries the same embedding array and text
    kwargs = {}
    for field in fields:
        if field == 'embedding':
            kwargs[field] = np.random.random((num_rows, embed_size))
        else:
            kwargs[field] = 'text'
    docs = DocumentArray([Document(**kwargs) for _ in range(batch_size)])
    contents, pts = docs.extract_docs(*fields, stack_contents=stack)
    assert len(contents) == len(fields)
    assert isinstance(contents, list)
    if stack:
        assert isinstance(contents[0], np.ndarray)
        assert isinstance(contents[1], np.ndarray)
    for content in contents:
        assert len(content) == batch_size
    if stack:
        assert contents[0].shape == (batch_size, num_rows, embed_size)
        assert contents[1].shape == (batch_size,)
    else:
        assert len(contents[0]) == batch_size
        assert len(contents[1]) == batch_size
        for embedding in contents[0]:
            assert embedding.shape == (num_rows, embed_size)
def test_batching_blob_one_argument(stack, crafter):
    """Crafting on the 'blob' field leaves each doc's 2x5 blob unchanged."""
    doc_array = DocumentArray(
        [Document(blob=np.array([[n] * 5, [n] * 5])) for n in range(15)])
    blobs, _ = doc_array.extract_docs('blob', stack_contents=stack)
    for idx, crafted in enumerate(crafter.craft(blobs)):
        np.testing.assert_equal(crafted['blob'], np.array([[idx] * 5, [idx] * 5]))
def test_batching_text_one_argument(segmenter):
    """Segmenter yields NUM_CHUNKS chunks per doc, each tagged with its doc and chunk index."""
    doc_array = DocumentArray([Document(text=f'text-{n}') for n in range(15)])
    contents, _ = doc_array.extract_docs('text')
    for doc_idx, chunks in enumerate(segmenter.segment(contents)):
        assert len(chunks) == NUM_CHUNKS
        for chunk_idx, chunk in enumerate(chunks):
            assert chunk['text'] == f'text-{doc_idx}-chunk-{chunk_idx}'
def test_union(docarray, document_factory):
    """'+' concatenates two DocumentArrays, preserving order and ids of both operands."""
    extra = DocumentArray([])
    for n in range(4, 10):
        extra.add(document_factory.create(n, f'test {n}'))
    combined = docarray + extra
    # first three entries come from the original fixture array
    for n in range(0, 3):
        assert combined[n].id == docarray[n].id
    # remaining six entries come from the freshly built array
    for n in range(0, 6):
        assert combined[n + 3].id == extra[n].id
def test_batching_text_multi(stack, crafter):
    """Crafting on ('text', 'id') appends '-crafted' to both fields of every doc."""
    doc_array = DocumentArray(
        [Document(text=f'text-{n}', id=f'id-{n}') for n in range(15)])
    extracted, _ = doc_array.extract_docs('text', 'id', stack_contents=stack)
    for idx, crafted in enumerate(crafter.craft(*extracted)):
        assert crafted['text'] == f'text-{idx}-crafted'
        assert crafted['id'] == f'id-{idx}-crafted'
def test_batching_mix_multi(stack, crafter):
    """Crafting on ('text', 'embedding') crafts the text but leaves embeddings intact."""
    doc_array = DocumentArray([
        Document(text=f'text-{n}', embedding=np.array([n] * 5))
        for n in range(15)
    ])
    extracted, _ = doc_array.extract_docs('text', 'embedding', stack_contents=stack)
    for idx, crafted in enumerate(crafter.craft(*extracted)):
        assert crafted['text'] == f'text-{idx}-crafted'
        np.testing.assert_equal(crafted['embedding'], np.array([idx] * 5))
def test_get_content_text_fields(stack, field):
    """A text field extracts to 'text' per doc; when stacked it becomes a 1-D ndarray."""
    batch_size = 10
    kwargs = {field: 'text'}
    docs = DocumentArray([Document(**kwargs) for _ in range(batch_size)])
    contents, pts = docs.extract_docs(field, stack_contents=stack)
    if stack:
        assert isinstance(contents, np.ndarray)
        assert contents.shape == (batch_size,)
    assert len(contents) == batch_size
    for content in contents:
        assert content == 'text'
def test_get_content_bytes_fields(stack, bytes_input, field):
    """Bytes fields always extract as a plain list of bytes, never stacked into an ndarray."""
    batch_size = 10
    kwargs = {field: bytes_input}
    docs = DocumentArray([Document(**kwargs) for _ in range(batch_size)])
    contents, pts = docs.extract_docs(field, stack_contents=stack)
    assert len(contents) == batch_size
    assert isinstance(contents, list)
    for content in contents:
        assert isinstance(content, bytes)
        assert content == bytes_input
def test_batching_encode_text(encoder):
    """Encoder maps 15 text docs to a (15, 10) embedding matrix."""
    doc_array = DocumentArray([Document(text=f'text-{n}') for n in range(15)])
    contents, _ = doc_array.all_contents
    assert encoder.encode(contents).shape == (15, 10)
def test_as_blob_driver():
    """Prediction2DocBlobDriver writes a 3-element blob onto every doc."""
    doc_array = DocumentArray(random_docs(2))
    MockPrediction2DocBlobDriver()._apply_all(doc_array)
    for doc in doc_array:
        assert NdArray(doc.blob).value.shape == (3,)
def test_batching_blob_multi(stack, crafter):
    """Crafting on ('blob', 'embedding') leaves both array fields unchanged."""
    doc_array = DocumentArray([
        Document(
            blob=np.array([[n] * 5, [n] * 5]),
            embedding=np.array([n] * 5),
        )
        for n in range(15)
    ])
    extracted, _ = doc_array.extract_docs('blob', 'embedding', stack_contents=stack)
    for idx, crafted in enumerate(crafter.craft(*extracted)):
        np.testing.assert_equal(crafted['blob'], np.array([[idx] * 5, [idx] * 5]))
        np.testing.assert_equal(crafted['embedding'], np.array([idx] * 5))
def test_segment_driver(segment_driver, text_segmenter_executor):
    """Segmenting a text doc produces three chunks with expected ids, blobs, weights, mimes."""
    doc = Document()
    doc.text = 'valid'
    doc.mime_type = 'image/png'
    segment_driver.attach(executor=text_segmenter_executor, runtime=None)
    segment_driver._apply_all(DocumentArray([doc]))
    # (tag id, weight / blob fill value, mime type) per expected chunk, in order
    expected_chunks = [
        (3, 0.0, 'text/plain'),
        (4, 1.0, 'image/png'),
        (5, 2.0, 'image/png'),
    ]
    for position, (tag_id, weight, mime) in enumerate(expected_chunks):
        chunk = doc.chunks[position]
        assert chunk.tags['id'] == tag_id
        assert chunk.parent_id == doc.id
        np.testing.assert_equal(chunk.blob, np.array([weight] * 3))
        assert chunk.weight == weight
        assert chunk.mime_type == mime
def test_get_content(stack, num_rows, field):
    """An ndarray field stacks to (batch, rows, embed); docs lacking the field are skipped."""
    batch_size = 10
    embed_size = 20
    # single kwargs dict so every doc shares the same random array (one RNG draw)
    kwargs = {field: np.random.random((num_rows, embed_size))}
    docs = DocumentArray([Document(**kwargs) for _ in range(batch_size)])
    # an empty doc must not contribute to the extracted contents
    docs.append(Document())
    contents, pts = docs.extract_docs(field, stack_contents=stack)
    if stack:
        assert isinstance(contents, np.ndarray)
        assert contents.shape == (batch_size, num_rows, embed_size)
    else:
        assert len(contents) == batch_size
        for content in contents:
            assert content.shape == (num_rows, embed_size)
def test_multi_label_predict_driver():
    """Multi-label driver stores a ListValue subset of labels; all-label driver stores them all."""
    docs = DocumentArray(random_docs(2))
    MockMultiLabelPredictDriver(labels=['cat', 'dog', 'human'])._apply_all(docs)
    for doc in docs:
        assert isinstance(doc.tags['prediction'], ListValue)
        for predicted in doc.tags['prediction']:
            assert predicted in {'cat', 'dog', 'human'}

    docs = DocumentArray(random_docs(2))
    MockAllLabelPredictDriver(labels=['cat', 'dog', 'human'])._apply_all(docs)
    for doc in docs:
        assert isinstance(doc.tags['prediction'], ListValue)
        assert list(doc.tags['prediction']) == ['cat', 'dog', 'human']
def test_batching_encode_blob(encoder):
    """Encoder maps 15 blobs of shape (10, 20) to a (15, 10) embedding matrix."""
    doc_array = DocumentArray(
        [Document(blob=np.random.random((10, 20))) for _ in range(15)])
    blobs, _ = doc_array.all_contents
    assert encoder.encode(blobs).shape == (15, 10)
def test_binary_predict_driver():
    """Binary predict driver tags every doc and every chunk with 'yes' or 'no'."""
    docs = DocumentArray(random_docs(2))
    MockBinaryPredictDriver()._apply_all(docs)
    for doc in docs:
        assert doc.tags['prediction'] in {'yes', 'no'}
        for chunk in doc.chunks:
            assert chunk.tags['prediction'] in {'yes', 'no'}
def test_one_hot_predict_driver():
    """One-hot predict driver tags every doc and chunk with exactly one of the labels."""
    docs = DocumentArray(random_docs(2))
    MockOneHotPredictDriver(labels=['cat', 'dog', 'human'])._apply_all(docs)
    for doc in docs:
        assert doc.tags['prediction'] in {'cat', 'dog', 'human'}
        for chunk in doc.chunks:
            assert chunk.tags['prediction'] in {'cat', 'dog', 'human'}
def test_doc_array_from_generator():
    """DocumentArray can be constructed directly from a generator of Documents."""
    NUM_DOCS = 100

    def make_docs():
        # lazily yield documents so DocumentArray consumes a generator, not a list
        for _ in range(NUM_DOCS):
            yield Document()

    assert len(DocumentArray(make_docs())) == NUM_DOCS
def test_broken_document(segment_driver, text_segmenter_executor):
    """A document with a non-string id makes the segment driver raise AttributeError."""
    segment_driver.attach(executor=text_segmenter_executor, runtime=None)
    broken = Document()
    broken.id = 1
    broken.text = 'invalid'
    with pytest.raises(AttributeError):
        # NOTE(review): the DocumentArray is wrapped in an extra list here, unlike
        # the other driver tests in this file — confirm whether that is intentional
        segment_driver._apply_all([DocumentArray([broken])])
def test_get_content_multiple_fields_text(stack, fields):
    """Each requested text field extracts to one batch of per-doc strings."""
    batch_size = 10
    kwargs = {field: f'text-{field}' for field in fields}
    docs = DocumentArray([Document(**kwargs) for _ in range(batch_size)])
    contents, pts = docs.extract_docs(*fields, stack_contents=stack)
    assert len(contents) == len(fields)
    assert isinstance(contents, list)
    if stack:
        assert isinstance(contents[0], np.ndarray)
        assert isinstance(contents[1], np.ndarray)
    for content in contents:
        assert len(content) == batch_size
        if stack:
            assert content.shape == (batch_size,)
def test_image_segmenter(segment_driver, image_segmenter_executor):
    """Image segmenter yields exactly one chunk per doc carrying the original blob."""
    first_blob = np.random.random((1, 32, 64))
    second_blob = np.random.random((1, 64, 32))
    docs = DocumentArray([Document(blob=first_blob), Document(blob=second_blob)])
    segment_driver.attach(executor=image_segmenter_executor, runtime=None)
    segment_driver._apply_all(docs)
    for doc in docs:
        assert len(doc.chunks) == 1
    np.testing.assert_equal(docs[0].chunks[0].blob, first_blob)
    np.testing.assert_equal(docs[1].chunks[0].blob, second_blob)
def docarray_with_scipy_sparse_embedding(docs):
    """Assign the same 1x10 scipy COO sparse embedding to every doc, return a DocumentArray."""
    # duplicate (row, col) coordinates are allowed in COO and are kept as-is
    sparse_embedding = coo_matrix(
        (
            np.array([1, 2, 3, 4, 5, 6]),
            (np.array([0, 0, 0, 0, 0, 0]), np.array([0, 2, 2, 0, 1, 2])),
        ),
        shape=(1, 10),
    )
    for doc in docs:
        doc.embedding = sparse_embedding
    return DocumentArray(docs)
def test_get_content_multiple_fields_text_buffer(stack, bytes_input):
    """Extracting ('id', 'buffer'): ids may stack to an ndarray, buffers stay a list of bytes."""
    batch_size = 10
    fields = ['id', 'buffer']
    kwargs = {'id': 'text', 'buffer': bytes_input}
    docs = DocumentArray([Document(**kwargs) for _ in range(batch_size)])
    contents, pts = docs.extract_docs(*fields, stack_contents=stack)
    assert len(contents) == len(fields)
    assert isinstance(contents, list)
    assert len(contents[0]) == batch_size
    if stack:
        assert isinstance(contents[0], np.ndarray)
        assert contents[0].shape == (batch_size,)
    # bytes never stack, regardless of the stack flag
    assert isinstance(contents[1], list)
    assert isinstance(contents[1][0], bytes)
    for content in contents:
        assert len(content) == batch_size
def test_array_get_from_slice_success(docs, document_factory):
    """Slicing a 3-doc DocumentArray behaves like list slicing, clamping out-of-range bounds."""
    doc_array = DocumentArray(docs)
    # prefix slices: [:stop]
    for stop, expected_len in ((1, 1), (2, 2), (3, 3), (100, 3)):
        assert len(doc_array[:stop]) == expected_len
    # suffix slices: [start:]
    for start, expected_len in ((1, 2), (2, 1), (3, 0), (100, 0)):
        assert len(doc_array[start:]) == expected_len
def test_match2docranker_batching(ranker):
    """Ranker.score re-scores matches in batch; after re-sorting they are score-descending."""
    NUM_DOCS_QUERIES = 15
    NUM_MATCHES = 10
    old_matches_scores = []
    queries_metas = []
    matches_metas = []
    queries = DocumentArray([])
    for q_idx in range(NUM_DOCS_QUERIES):
        old_scores = []
        metas = []
        query = Document(id=f'query-{q_idx}')
        for m_idx in range(NUM_MATCHES):
            match = Document(id=f'match-{q_idx}-{m_idx}', tags={'dummy_score': m_idx})
            query.matches.append(match)
            old_scores.append(0)
            metas.append(match.get_attrs('tags__dummy_score'))
        queries.append(query)
        old_matches_scores.append(old_scores)
        queries_metas.append(None)
        matches_metas.append(metas)

    queries_scores = ranker.score(old_matches_scores, queries_metas, matches_metas)
    assert len(queries_scores) == NUM_DOCS_QUERIES
    for q_idx, (query, match_scores) in enumerate(zip(queries, queries_scores)):
        assert len(match_scores) == NUM_MATCHES
        for m_idx, (match, score) in enumerate(zip(query.matches, match_scores)):
            # write the score back onto the match, then verify the ranker agreed
            match.score = NamedScore(value=m_idx)
            assert score == m_idx
        query.matches.sort(key=lambda m: m.score.value, reverse=True)
        for rank, match in enumerate(query.matches, 1):
            assert match.id == f'match-{q_idx}-{NUM_MATCHES - rank}'
            assert match.score.value == NUM_MATCHES - rank
def test_collect_matches2doc_ranker_driver_mock_ranker():
    """Collect-matches rank driver orders matches by the mock length score, descending."""
    doc = create_document_to_score_same_depth_level()
    driver = SimpleCollectMatchesRankDriver(docs=DocumentArray([doc]))
    driver.attach(executor=MockLengthRanker(), runtime=None)
    driver()
    matches = list(doc.matches)
    assert len(matches) == 2
    assert matches[0].id == '20'
    assert matches[0].score.value == 3.0
    assert matches[1].id == '30'
    assert matches[1].score.value == 2.0
    for match in matches:
        # every match score is computed with respect to the query doc's id
        assert match.score.ref_id == doc.id
def test_batching_text_one_argument_flow(crafter, mocker):
    """Indexing through a Flow with the crafter appends '-crafted' to every doc's text."""
    NUM_DOCS = 15

    def validate_response(resp):
        assert len(resp.index.docs) == NUM_DOCS
        for idx, doc in enumerate(resp.index.docs):
            assert doc.text == f'text-{idx}-crafted'

    inputs = DocumentArray([Document(text=f'text-{n}') for n in range(NUM_DOCS)])
    mock = mocker.Mock()
    with Flow().add(name='crafter', uses=crafter) as f:
        f.index(inputs=inputs, on_done=mock)
    mock.assert_called_once()
    validate_callback(mock, validate_response)
def test_chunks_exist_already(segment_driver, text_segmenter_executor):
    """Segmenting a doc that already has chunks appends new ones and refreshes sibling counts."""
    doc = Document(
        text='valid',
        chunks=[Document(text='test2'), Document(text='test3')])
    # before segmentation: only the two pre-existing chunks
    assert len(doc.chunks) == 2
    for chunk in doc.chunks:
        assert chunk.parent_id == doc.id
        assert chunk.siblings == 2
    segment_driver.attach(executor=text_segmenter_executor, runtime=None)
    segment_driver._apply_all(DocumentArray([doc]))
    # after segmentation: three new chunks appended; sibling count updated on all five
    assert len(doc.chunks) == 5
    for chunk in doc.chunks:
        assert chunk.parent_id == doc.id
        assert chunk.siblings == 5