示例#1
0
def test_buffer_dam_add_or_update(tmpdir):
    """Exercise BufferPool.add_or_update end to end: in-place update of an
    existing doc, plain add, LRU eviction when the pool is full, and reuse
    of a slot freed by deletion.
    """
    dam = DocumentArrayMemmap(tmpdir, buffer_pool_size=6)
    docs = list(random_docs(8))
    dam.extend(docs[:5])

    doc1 = docs[0]
    doc1.content = 'new'

    # doc1 already exists => update (buffer size stays at 5)
    dam.buffer_pool.add_or_update(doc1.id, doc1)
    assert dam[0].content == doc1.content
    assert len(dam.buffer_pool.buffer) == 5

    # doc does not exist => add to buffer
    dam.buffer_pool.add_or_update(docs[5].id, docs[5])
    assert len(dam.buffer_pool.buffer) == 6

    # buffer is full => remove the LRU (docs[1], because docs[0] was used before)
    dam.buffer_pool.add_or_update(docs[6].id, docs[6])
    assert docs[6].id in dam.buffer_pool
    assert docs[1].id not in dam.buffer_pool

    del dam.buffer_pool[docs[4].id]

    # spot number 4 becomes empty; the next add should reuse it
    assert 4 in dam.buffer_pool._empty
    dam.buffer_pool.add_or_update(docs[7].id, docs[7])
    # doc_map maps doc id -> (slot index, ...); docs[7] landed in slot 4
    assert dam.buffer_pool.doc_map[docs[7].id][0] == 4
示例#2
0
class MyIndexer(Executor):
    """Simple indexer that stores documents in a memmap-backed array and
    retrieves the single closest match by cosine distance."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # persist indexed documents under this executor's workspace
        self._docs = DocumentArrayMemmap(self.workspace + '/indexer')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Add all incoming documents to the index.

        :param docs: documents to store
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', **kwargs):
        """Append the best match to each document in docs.

        :param docs: documents that are searched
        :param kwargs: other keyword arguments
        """
        # fix: the old docstring documented a ``parameters`` argument that
        # this method does not accept
        docs.match(
            self._docs,
            metric='cosine',
            normalization=(1, 0),
            limit=1,
        )
示例#3
0
def test_setter_wrong_len(tmpdir):
    """Assigning ``tags`` with fewer entries than documents must raise."""
    memmap = DocumentArrayMemmap(tmpdir)
    memmap.extend([Document() for _ in range(100)])
    single_tag = [{'1': 2}]

    with pytest.raises(ValueError, match='the number of tags in the'):
        memmap.tags = single_tag
示例#4
0
def test_blobs_wrong_len(tmpdir):
    """Assigning blobs whose first dimension mismatches the array length raises."""
    memmap = DocumentArrayMemmap(tmpdir)
    memmap.extend([Document() for _ in range(100)])
    mismatched_blobs = np.ones((2, 10, 10))

    with pytest.raises(ValueError):
        memmap.blobs = mismatched_blobs
示例#5
0
def test_texts_wrong_len(tmpdir):
    """Assigning ``texts`` with fewer strings than documents must raise."""
    memmap = DocumentArrayMemmap(tmpdir)
    memmap.extend([Document() for _ in range(100)])
    too_few_texts = ['hello']

    with pytest.raises(ValueError):
        memmap.texts = too_few_texts
示例#6
0
class MyIndexer(Executor):
    """Exact-search executor using cosine distance over a memmap store."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # documents live on disk, inside the executor workspace
        self._docs = DocumentArrayMemmap(self.workspace + '/indexer')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Add every incoming document to the backing store.

        :param docs: DocumentArray containing Documents
        :param kwargs: other keyword arguments
        """
        self._docs.extend(docs)

    @requests(on=['/search', '/eval'])
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        """Attach the top-k cosine matches to every query document.

        :param docs: documents that are searched
        :param parameters: dictionary of pairs (parameter,value)
        :param kwargs: other keyword arguments
        """
        top_k = int(parameters['top_k'])
        docs.match(
            self._docs,
            metric='cosine',
            normalization=(1, 0),
            limit=top_k,
        )
示例#7
0
def test_scipy_dist(docarrays_for_embedding_distance_computation,
                    normalization, metric, tmpdir, only_id):
    """Matching against a memmap must produce the same scores as in-memory."""
    da_left, da_right = docarrays_for_embedding_distance_computation
    da_left_copy = copy.deepcopy(da_left)
    da_right_copy = copy.deepcopy(da_right)

    # baseline: in-memory matching
    da_left.match(da_right,
                  metric=metric,
                  limit=3,
                  normalization=normalization,
                  use_scipy=True)
    scores_in_memory = [
        m.scores[metric].value for d in da_left for m in d.matches
    ]

    # same matching, but the right side is backed by a memmap
    memmap_right = DocumentArrayMemmap(tmpdir)
    memmap_right.extend(da_right_copy)
    da_left_copy.match(
        memmap_right,
        metric=metric,
        limit=3,
        normalization=normalization,
        use_scipy=True,
        only_id=only_id,
    )
    scores_memmap = [
        m.scores[metric].value for d in da_left_copy for m in d.matches
    ]

    np.testing.assert_equal(scores_in_memory, scores_memmap)
示例#8
0
def test_match_handle_different_limit(get_two_docarray, limit, tmpdir):
    """``limit=None``/``-1`` means all matches; otherwise exactly ``limit``."""
    queries, index_docs = get_two_docarray
    memmap = DocumentArrayMemmap(tmpdir)
    memmap.extend(index_docs)
    queries.match(memmap, limit=limit)
    if limit is None or limit == -1:
        expected_length = len(index_docs)
    else:
        expected_length = limit
    assert len(queries[0].matches) == expected_length
示例#9
0
def test_embeddings_wrong_len(tmpdir):
    """Assigning embeddings with the wrong number of rows must raise."""
    memmap = DocumentArrayMemmap(tmpdir)
    memmap.extend([Document() for _ in range(100)])
    mismatched_embeddings = np.ones((2, 10, 10))

    with pytest.raises(ValueError, match='the number of rows in the'):
        memmap.embeddings = mismatched_embeddings
示例#10
0
def test_texts_getter_dam(tmpdir):
    """``.texts`` must agree with ``get_attributes('text')``."""
    memmap = DocumentArrayMemmap(tmpdir)
    memmap.extend([Document(text='hello') for _ in range(100)])
    assert len(memmap.texts) == 100
    via_property = memmap.texts
    via_getter = memmap.get_attributes('text')
    assert via_property == via_getter
示例#11
0
def test_blobs_setter_dam(tmpdir):
    """Bulk-assigned blobs round-trip both via ``.blobs`` and per document."""
    expected = np.random.random((100, 10, 10))
    memmap = DocumentArrayMemmap(tmpdir)
    memmap.extend([Document() for _ in expected])
    memmap.blobs = expected
    np.testing.assert_almost_equal(memmap.blobs, expected)
    for blob, doc in zip(expected, memmap):
        np.testing.assert_almost_equal(blob, doc.blob)
示例#12
0
def test_buffer_dam_clear(tmpdir):
    """After ``clear()`` no previously cached doc remains in the buffer pool."""
    memmap = DocumentArrayMemmap(tmpdir, buffer_pool_size=5)
    originals = list(random_docs(5))
    memmap.extend(originals)

    memmap.buffer_pool.clear()
    assert all(doc.id not in memmap.buffer_pool for doc in originals)
示例#13
0
def test_traverse(tmpdir, mocker):
    """Traversing on path 'c' yields granularity-1 docs at least once."""
    memmap = DocumentArrayMemmap(tmpdir)
    memmap.extend(random_docs(100))
    visit_spy = mocker.Mock()
    for chunk in memmap.traverse_flat(['c']):
        assert chunk.granularity == 1
        visit_spy()
    # the traversal must not have been empty
    visit_spy.assert_called()
示例#14
0
def test_embeddings_setter_dam(tmpdir):
    """Bulk-assigned embeddings round-trip via ``.embeddings`` and per doc."""
    expected = np.random.random((100, 128))
    memmap = DocumentArrayMemmap(tmpdir)
    memmap.extend([Document() for _ in range(100)])
    memmap.embeddings = expected
    np.testing.assert_almost_equal(memmap.embeddings, expected)

    for row, doc in zip(expected, memmap):
        np.testing.assert_almost_equal(row, doc.embedding)
示例#15
0
def test_tags_setter_dam(tmpdir):
    """Bulk-assigned tags round-trip both via ``.tags`` and per document."""
    memmap = DocumentArrayMemmap(tmpdir)
    expected_tags = [{'a': 2, 'c': 'd'} for _ in range(100)]
    memmap.extend([Document() for _ in range(100)])
    memmap.tags = expected_tags
    assert memmap.tags == expected_tags

    for tag, doc in zip(expected_tags, memmap):
        assert doc.tags == tag
示例#16
0
def test_memmap_update_in_memory(tmpdir):
    """With a buffer covering every doc, mutating the originals in place is
    reflected when reading back from the memmap."""
    memmap = DocumentArrayMemmap(tmpdir, buffer_pool_size=100)
    originals = list(random_docs(100))
    memmap.extend(originals)
    for position, original in enumerate(originals):
        original.content = f'new content {position}'

    for position, stored in enumerate(memmap):
        assert stored.content == f'new content {position}'
示例#17
0
def test_texts_setter_dam(tmpdir):
    """Bulk-assigned texts round-trip both via ``.texts`` and per document."""
    memmap = DocumentArrayMemmap(tmpdir)
    memmap.extend([Document() for _ in range(100)])
    expected_texts = ['text' for _ in range(100)]
    memmap.texts = expected_texts
    assert memmap.texts == expected_texts

    for text, doc in zip(expected_texts, memmap):
        assert doc.text == text
示例#18
0
def test_memmap_update_document(tmpdir):
    """Updating docs by index assignment persists the new content."""
    memmap = DocumentArrayMemmap(tmpdir)
    originals = list(random_docs(100))
    memmap.extend(originals)
    for position, original in enumerate(originals):
        original.content = f'new content {position}'
        memmap[position] = original

    for position, stored in enumerate(memmap):
        assert stored.content == f'new content {position}'
示例#19
0
def test_memmap_buffer_synched(tmpdir):
    """The buffer pool stays consistent with the memmap under assignment."""
    all_docs = list(random_docs(100))
    memmap = DocumentArrayMemmap(tmpdir)
    memmap.extend(all_docs[:50])

    for position, replacement in enumerate(all_docs[50:]):
        memmap[position] = replacement
        # the assigned doc must be the one cached in the buffer pool
        assert memmap._buffer_pool[replacement.id].id == memmap[position].id
        replacement.content = 'new'
        # in-place mutation is visible through key access
        assert memmap[replacement.id].content == 'new'
示例#20
0
def test_error(tmpdir):
    """Access and deletion on an empty memmap raise KeyError for string keys
    and IndexError for integer indices."""
    memmap = DocumentArrayMemmap(tmpdir)
    memmap.clear()
    # string keys -> KeyError
    with pytest.raises(KeyError):
        memmap['12']
    with pytest.raises(KeyError):
        del memmap['12']
    # integer indices -> IndexError
    with pytest.raises(IndexError):
        memmap[1]
    with pytest.raises(IndexError):
        del memmap[1]
示例#21
0
def test_shuffle(tmpdir):
    """``shuffle()`` returns a DocumentArray that is a permutation of the
    original: same size, same ids, different order."""
    memmap = DocumentArrayMemmap(tmpdir)
    source = list(random_docs(100))
    memmap.extend(source)
    permuted = memmap.shuffle()
    assert len(permuted) == len(memmap)
    assert isinstance(permuted, DocumentArray)
    original_ids = [doc.id for doc in memmap]
    permuted_ids = [doc.id for doc in permuted]
    assert original_ids != permuted_ids
    assert sorted(original_ids) == sorted(permuted_ids)
示例#22
0
def test_convert_dm_to_dam(tmpdir, mocker):
    """Docs copied into a memmap survive clearing the source array."""
    memmap = DocumentArrayMemmap(tmpdir)
    in_memory = DocumentArray(random_docs(100))
    memmap.extend(in_memory)
    in_memory.clear()
    iteration_spy = mocker.Mock()
    for doc in memmap:
        assert doc
        iteration_spy()
    iteration_spy.assert_called()
    assert len(in_memory) == 0
    assert len(memmap) == 100
示例#23
0
def test_buffer_dam_delete(tmpdir):
    """Deleting an evicted doc raises; ``delete_if_exists`` tolerates it."""
    memmap = DocumentArrayMemmap(tmpdir, buffer_pool_size=5)
    originals = list(random_docs(6))
    memmap.extend(originals)

    evicted = originals[0]

    # six docs into a 5-slot pool => the first doc was evicted
    with pytest.raises(KeyError):
        del memmap.buffer_pool[evicted.id]

    # the lenient variant must not raise for a missing key
    memmap.buffer_pool.delete_if_exists(evicted.id)
示例#24
0
def doc_lists_to_doc_arrays(doc_lists, tmpdir, first_memmap, second_memmap,
                            buffer_pool_size):
    """Build two document containers from two doc lists, each either an
    in-memory DocumentArray or a DocumentArrayMemmap depending on the flags."""
    doc_list1, doc_list2 = doc_lists

    def _build(docs, use_memmap, workspace):
        # one-line helper: construct the requested container kind and fill it
        if use_memmap:
            container = DocumentArrayMemmap(
                workspace, buffer_pool_size=buffer_pool_size)
        else:
            container = DocumentArray()
        container.extend(docs)
        return container

    D1 = _build(doc_list1, first_memmap, tmpdir / '1')
    D2 = _build(doc_list2, second_memmap, tmpdir / '2')
    return D1, D2
示例#25
0
def test_batch_iterator_dam(tmpdir):
    """Batching a memmap of 4 docs into pairs yields all ids in order."""
    memmap = DocumentArrayMemmap(tmpdir)
    for doc_id in range(4):
        memmap.append(Document(id=doc_id))
    batches = batch_iterator(memmap, 2)
    expected_ids = iter(range(4))
    for batch in batches:
        for doc in batch:
            assert int(doc.id) == next(expected_ids)

    # every expected id must have been consumed
    with pytest.raises(StopIteration):
        next(expected_ids)
示例#26
0
def test_buffer_dam_getitem(tmpdir):
    """The buffer pool supports lookup by doc id only — not by int or slice."""
    memmap = DocumentArrayMemmap(tmpdir)
    originals = list(random_docs(10))
    memmap.extend(originals)
    for original in originals:
        cached = memmap.buffer_pool[original.id]
        # key access must return the same document
        assert cached.content_hash == original.content_hash
        assert cached.id == original.id

    with pytest.raises(TypeError):
        memmap.buffer_pool[1:5]

    with pytest.raises(TypeError):
        memmap.buffer_pool[0]
示例#27
0
class KeyValueIndexer(Executor):
    """Key-value lookup executor backed by a memmap document store."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # documents are keyed by id and persisted under the workspace
        self._docs = DocumentArrayMemmap(self.workspace + '/kv-idx')

    @requests(on='/index')
    def index(self, docs: DocumentArray, **kwargs):
        """Store all incoming documents."""
        self._docs.extend(docs)

    @requests(on='/search')
    def query(self, docs: DocumentArray, **kwargs):
        """Enrich each match with the stored document keyed by its parent id."""
        for doc in docs:
            for match in doc.matches:
                match.update(self._docs[match.parent_id])
示例#28
0
def memmap_for_split(tmpdir):
    """Return a memmap holding five docs with 'category' tags c, c, b, a, a."""
    da = DocumentArrayMemmap(tmpdir)
    for category in ('c', 'c', 'b', 'a', 'a'):
        da.append(Document(tags={'category': category}))
    return da
示例#29
0
def test_memmap_delete_by_slice(tmpdir):
    """Slice deletion removes exactly the sliced docs from the memmap."""
    memmap = DocumentArrayMemmap(tmpdir)
    originals = list(random_docs(100))
    for doc in originals:
        doc.id = f'id_{doc.id}'
    memmap.extend(originals)
    assert len(memmap) == 100
    del memmap[-5:]
    assert len(memmap) == 95
    del memmap[:5]
    assert len(memmap) == 90

    # none of the deleted docs may still be present
    remaining_ids = {doc.id for doc in memmap}
    for removed in originals[:5] + originals[-5:]:
        assert removed.id not in remaining_ids
示例#30
0
class DocVectorIndexer(Executor):
    """Exact vector-search executor over a memmap-backed document store."""

    def __init__(self, index_file_name: str, **kwargs):
        super().__init__(**kwargs)
        # each instance stores its documents in a named file in the workspace
        self._docs = DocumentArrayMemmap(self.workspace + f'/{index_file_name}')

    @requests(on='/index')
    def index(self, docs: 'DocumentArray', **kwargs):
        """Add all incoming documents to the index."""
        self._docs.extend(docs)

    @requests(on='/search')
    def search(self, docs: 'DocumentArray', parameters: Dict, **kwargs):
        """Attach the top-k cosine matches to every query document."""
        top_k = int(parameters['top_k'])
        docs.match(
            self._docs,
            metric='cosine',
            normalization=(1, 0),
            limit=top_k,
        )