def test_get_content_multiple_fields_merge(stack, num_rows):
    """Extract 'embedding' and 'text' in one call and verify the layout of
    the returned contents (stacked ndarrays vs. plain lists)."""
    fields = ['embedding', 'text']
    batch_size = 10
    embed_size = 20
    # one shared kwargs dict: every Document gets the same embedding array
    shared_kwargs = {
        'embedding': np.random.random((num_rows, embed_size)),
        'text': 'text',
    }
    docs = DocumentArray([Document(**shared_kwargs) for _ in range(batch_size)])

    contents, pts = docs.extract_docs(*fields, stack_contents=stack)

    assert len(contents) == len(fields)
    assert isinstance(contents, list)
    if stack:
        assert isinstance(contents[0], np.ndarray)
        assert isinstance(contents[1], np.ndarray)
    for per_field in contents:
        assert len(per_field) == batch_size
    if stack:
        assert contents[0].shape == (batch_size, num_rows, embed_size)
        assert contents[1].shape == (batch_size,)
    else:
        assert len(contents[0]) == batch_size
        assert len(contents[1]) == batch_size
        for embedding in contents[0]:
            assert embedding.shape == (num_rows, embed_size)
def test_batching_text_one_argument(stack, crafter):
    """Craft a batch of extracted texts; each result carries the
    '-crafted' suffix appended by the crafter fixture."""
    docs = DocumentArray([Document(text=f'text-{i}') for i in range(15)])
    texts, _ = docs.extract_docs('text', stack_contents=stack)
    for idx, crafted in enumerate(crafter.craft(texts)):
        assert crafted['text'] == f'text-{idx}-crafted'
def test_batching_blob_one_argument(stack, crafter):
    """Craft a batch of extracted blobs; each blob round-trips through the
    crafter unchanged."""
    docs = DocumentArray(
        [Document(blob=np.array([[i] * 5, [i] * 5])) for i in range(15)]
    )
    blobs, _ = docs.extract_docs('blob', stack_contents=stack)
    for idx, crafted in enumerate(crafter.craft(blobs)):
        expected = np.array([[idx] * 5, [idx] * 5])
        np.testing.assert_equal(crafted['blob'], expected)
def test_batching_text_one_argument(segmenter):
    """Segment a batch of extracted texts; every document yields NUM_CHUNKS
    chunks whose texts follow the 'text-<i>-chunk-<j>' pattern."""
    docs = DocumentArray([Document(text=f'text-{i}') for i in range(15)])
    texts, _ = docs.extract_docs('text')
    for doc_idx, chunks in enumerate(segmenter.segment(texts)):
        assert len(chunks) == NUM_CHUNKS
        for chunk_idx, chunk in enumerate(chunks):
            assert chunk['text'] == f'text-{doc_idx}-chunk-{chunk_idx}'
def test_batching_text_multi(stack, crafter):
    """Extract 'text' and 'id' together and craft both; each field gets the
    '-crafted' suffix."""
    docs = DocumentArray(
        [Document(text=f'text-{i}', id=f'id-{i}') for i in range(15)]
    )
    required_keys = ['text', 'id']
    extracted, _ = docs.extract_docs(*required_keys, stack_contents=stack)
    for idx, crafted in enumerate(crafter.craft(*extracted)):
        assert crafted['text'] == f'text-{idx}-crafted'
        assert crafted['id'] == f'id-{idx}-crafted'
def test_batching_mix_multi(stack, crafter):
    """Craft a mixed extraction (text + embedding): the text is suffixed
    with '-crafted' while the embedding passes through unchanged."""
    docs = DocumentArray(
        [
            Document(text=f'text-{i}', embedding=np.array([i] * 5))
            for i in range(15)
        ]
    )
    required_keys = ['text', 'embedding']
    extracted, _ = docs.extract_docs(*required_keys, stack_contents=stack)
    for idx, crafted in enumerate(crafter.craft(*extracted)):
        assert crafted['text'] == f'text-{idx}-crafted'
        np.testing.assert_equal(crafted['embedding'], np.array([idx] * 5))
def test_get_content_bytes_fields(stack, bytes_input, field):
    """Extracting a bytes-typed field returns a plain list of the raw bytes,
    one entry per document, regardless of `stack`."""
    batch_size = 10
    docs = DocumentArray(
        [Document(**{field: bytes_input}) for _ in range(batch_size)]
    )
    contents, pts = docs.extract_docs(field, stack_contents=stack)
    assert len(contents) == batch_size
    assert isinstance(contents, list)
    for item in contents:
        assert isinstance(item, bytes)
        assert item == bytes_input
def test_get_content_text_fields(stack, field):
    """Extracting a text-typed field yields one 'text' entry per document;
    with stacking enabled the result is a 1-D ndarray."""
    batch_size = 10
    docs = DocumentArray(
        [Document(**{field: 'text'}) for _ in range(batch_size)]
    )
    contents, pts = docs.extract_docs(field, stack_contents=stack)
    if stack:
        assert isinstance(contents, np.ndarray)
        assert contents.shape == (batch_size,)
    assert len(contents) == batch_size
    for item in contents:
        assert item == 'text'
def test_batching_blob_multi(stack, crafter):
    """Craft a two-field extraction (blob + embedding); both arrays pass
    through the crafter unchanged."""
    docs = DocumentArray(
        [
            Document(
                blob=np.array([[i] * 5, [i] * 5]),
                embedding=np.array([i] * 5),
            )
            for i in range(15)
        ]
    )
    required_keys = ['blob', 'embedding']
    extracted, _ = docs.extract_docs(*required_keys, stack_contents=stack)
    for idx, crafted in enumerate(crafter.craft(*extracted)):
        expected_blob = np.array([[idx] * 5, [idx] * 5])
        np.testing.assert_equal(crafted['blob'], expected_blob)
        np.testing.assert_equal(crafted['embedding'], np.array([idx] * 5))
def test_get_content(stack, num_rows, field):
    """Extract a single ndarray-valued field from a batch of documents and
    check the resulting shapes for both stacked and unstacked modes."""
    batch_size = 10
    embed_size = 20
    docs = DocumentArray(
        [
            Document(**{field: np.random.random((num_rows, embed_size))})
            for _ in range(batch_size)
        ]
    )
    # NOTE(review): an empty Document is appended but the assertions below
    # still expect batch_size entries — presumably extract_docs skips docs
    # that lack the field; confirm against its implementation.
    docs.append(Document())
    contents, pts = docs.extract_docs(field, stack_contents=stack)
    if stack:
        assert isinstance(contents, np.ndarray)
        assert contents.shape == (batch_size, num_rows, embed_size)
    else:
        assert len(contents) == batch_size
        for item in contents:
            assert item.shape == (num_rows, embed_size)
def test_get_content_multiple_fields_text(stack, fields):
    """Extract several text fields at once; each field contributes one
    batch_size-long content collection (1-D ndarray when stacked)."""
    batch_size = 10
    text_kwargs = {field: f'text-{field}' for field in fields}
    docs = DocumentArray([Document(**text_kwargs) for _ in range(batch_size)])
    contents, pts = docs.extract_docs(*fields, stack_contents=stack)

    assert len(contents) == len(fields)
    assert isinstance(contents, list)
    if stack:
        assert isinstance(contents[0], np.ndarray)
        assert isinstance(contents[1], np.ndarray)
    for per_field in contents:
        assert len(per_field) == batch_size
        if stack:
            assert per_field.shape == (batch_size,)
def test_get_content_multiple_fields_text_buffer(stack, bytes_input):
    """Extract a text field ('id') and a bytes field ('buffer') together:
    text may be stacked into an ndarray, bytes always stay a plain list."""
    batch_size = 10
    fields = ['id', 'buffer']
    docs = DocumentArray(
        [Document(id='text', buffer=bytes_input) for _ in range(batch_size)]
    )
    contents, pts = docs.extract_docs(*fields, stack_contents=stack)

    assert len(contents) == len(fields)
    assert isinstance(contents, list)
    assert len(contents[0]) == batch_size
    if stack:
        assert isinstance(contents[0], np.ndarray)
        assert contents[0].shape == (batch_size,)
    # the bytes field is never stacked
    assert isinstance(contents[1], list)
    assert isinstance(contents[1][0], bytes)
    for per_field in contents:
        assert len(per_field) == batch_size