Example #1
 def test_chunks_are_chopped_by_byte_size(self):
     self.assertEqual(
         100,
         len(
             list(helpers._chunk_actions(self.actions, 100000, 1, JSONSerializer()))
         ),
     )
Example #2
 def test_chunks_are_chopped_by_chunk_size(self):
     self.assertEqual(
         10,
         len(
             list(
                 helpers._chunk_actions(self.actions, 10, 99999999,
                                        JSONSerializer()))))
Example #3
async def bulk(client,
               actions,
               chunk_size=500,
               max_retries=0,
               max_chunk_bytes=100 * 1024 * 1024,
               expand_action_callback=expand_action,
               initial_backoff=2,
               max_backoff=600,
               stats_only=False,
               **kwargs):
    actions = map(expand_action_callback, actions)

    finish_count = 0
    if stats_only:
        fail_datas = 0
    else:
        fail_datas = []

    chunk_action_iter = _chunk_actions(actions, chunk_size, max_chunk_bytes,
                                       client.transport.serializer)

    for bulk_data, bulk_action in chunk_action_iter:
        coroutine = _process_bulk(client, bulk_data, bulk_action, **kwargs)
        count, fails = await _retry_handler(client, coroutine, max_retries,
                                            initial_backoff, max_backoff,
                                            **kwargs)

        finish_count += count
        if stats_only:
            fail_datas += len(fails)
        else:
            fail_datas.extend(fails)

    return finish_count, fail_datas
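
A minimal usage sketch for the bulk coroutine above, which chunks the expanded actions, runs each chunk through _retry_handler, and aggregates the results into a running count plus either a failure count (stats_only=True) or a list of failed items. The index name, document shape, and client object here are illustrative assumptions, not part of the original:

async def index_documents(client, docs):
    actions = (
        {"_op_type": "index", "_index": "my-index", "_source": doc}
        for doc in docs
    )
    # With stats_only=True the second return value is a failure count
    # instead of a list of the failed items.
    done, failed = await bulk(client, actions, chunk_size=500, stats_only=True)
    return done, failed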
Example #4
 def test_chunks_are_chopped_by_byte_size(self):
     self.assertEqual(
         100,
         len(
             list(helpers._chunk_actions(self.actions, 100000, 1, JSONSerializer()))
         ),
     )
 def test_chunks_are_chopped_by_byte_size_properly(self):
     max_byte_size = 170
     chunks = list(helpers._chunk_actions(self.actions, 100000, max_byte_size, JSONSerializer()))
     self.assertEqual(25, len(chunks))
     for chunk_data, chunk_actions in chunks:
         chunk = u''.join(chunk_actions)
         chunk = chunk if isinstance(chunk, str) else chunk.encode('utf-8')
         self.assertLessEqual(len(chunk), max_byte_size)
Example #6
 def test_chunks_are_chopped_by_byte_size_properly(self):
     max_byte_size = 170
     chunks = list(
         helpers._chunk_actions(self.actions, 100000, max_byte_size,
                                JSONSerializer()))
     self.assertEqual(25, len(chunks))
     for chunk_data, chunk_actions in chunks:
         chunk = u"".join(chunk_actions)
         chunk = chunk if isinstance(chunk, str) else chunk.encode("utf-8")
         self.assertLessEqual(len(chunk), max_byte_size)
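
Taken together, these tests pin down the chunking contract: a chunk is closed once it holds chunk_size actions or once adding the next action would push its serialized size past max_chunk_bytes, and a single oversized action still gets a chunk of its own (which is why max_chunk_bytes=1 in the first example produces 100 single-action chunks). A simplified sketch of that behaviour on pre-serialized action lines, not the actual helpers._chunk_actions implementation:

def chunk_by_count_and_bytes(lines, chunk_size, max_chunk_bytes):
    # lines: already-serialized action strings; yields lists of lines where
    # each list stays within both the count and the byte budget, except that
    # a single oversized line is emitted as its own one-element chunk.
    chunk, size = [], 0
    for line in lines:
        line_bytes = len(line.encode("utf-8"))
        if chunk and (len(chunk) >= chunk_size
                      or size + line_bytes > max_chunk_bytes):
            yield chunk
            chunk, size = [], 0
        chunk.append(line)
        size += line_bytes
    if chunk:
        yield chunk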
Example #7
def parallel_bulk(client,
                  actions,
                  thread_count=4,
                  chunk_size=500,
                  max_chunk_bytes=100 * 1024 * 1024,
                  expand_action_callback=es_helpers.expand_action,
                  **kwargs):
    """ es_helpers.parallel_bulk rewritten with imap_fixed_output_buffer
    instead of Pool.imap, which consumed unbounded memory if the generator
    outruns the upload (which usually happens).
    """
    actions = map(expand_action_callback, actions)
    for result in imap_fixed_output_buffer(
            lambda chunk: list(
                es_helpers._process_bulk_chunk(client, chunk, **kwargs)),
            es_helpers._chunk_actions(actions, chunk_size, max_chunk_bytes,
                                      client.transport.serializer),
            threads=thread_count,
    ):
        for item in result:
            yield item
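
The docstring above states the motivation: Pool.imap keeps consuming the input generator even when results are not being drained, so its output buffer can grow without bound. The helper it relies on, imap_fixed_output_buffer, is not shown in this example; one possible shape of such a helper, assuming ordered results and a buffer of at most `threads` in-flight chunks are what is wanted:

from collections import deque
from concurrent.futures import ThreadPoolExecutor

def imap_fixed_output_buffer(func, iterable, threads=4):
    # Like Pool.imap, but never keeps more than `threads` submitted chunks
    # in flight: once the buffer is full we block on the oldest result
    # before pulling the next item from the input generator.
    with ThreadPoolExecutor(max_workers=threads) as executor:
        pending = deque()
        for item in iterable:
            pending.append(executor.submit(func, item))
            if len(pending) >= threads:
                yield pending.popleft().result()
        while pending:
            yield pending.popleft().result()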
Example #8
async def concurrency_bulk(client,
                           actions,
                           concurrency_count=4,
                           chunk_size=500,
                           max_retries=0,
                           max_chunk_bytes=100 * 1024 * 1024,
                           expand_action_callback=expand_action,
                           initial_backoff=2,
                           max_backoff=600,
                           **kwargs):
    async def concurrency_wrapper(action_iter):
        p_count = p_fails = 0
        for bulk_data, bulk_action in action_iter:
            coroutine = _process_bulk(client, bulk_data, bulk_action, **kwargs)
            count, fails = await _retry_handler(client, coroutine, max_retries,
                                                initial_backoff, max_backoff,
                                                **kwargs)
            p_count += count
            p_fails += len(fails)
        return p_count, p_fails

    actions = map(expand_action_callback, actions)
    chunk_action_iter = _chunk_actions(actions, chunk_size, max_chunk_bytes,
                                       client.transport.serializer)

    tasks = []
    for i in range(concurrency_count):
        tasks.append(concurrency_wrapper(chunk_action_iter))

    results = await asyncio.gather(*tasks, loop=client.loop)

    finish_count = 0
    fail_count = 0
    for p_finish, p_fail in results:
        finish_count += p_finish
        fail_count += p_fail

    return finish_count, fail_count
Example #9
async def streaming_bulk(client,
                         actions,
                         chunk_size=500,
                         max_chunk_bytes=100 * 1024 * 1024,
                         raise_on_error=True,
                         expand_action_callback=expand_action,
                         raise_on_exception=True,
                         max_retries=0,
                         initial_backoff=2,
                         max_backoff=600,
                         yield_ok=True,
                         *args,
                         **kwargs):
    """
    Streaming bulk consumes actions from the iterable passed in and yields
    results per action. For non-streaming use cases use
    :func:`~elasticsearch.helpers.bulk` which is a wrapper around streaming
    bulk that returns summary information about the bulk operation once the
    entire input is consumed and sent.

    If you specify ``max_retries`` it will also retry any documents that were
    rejected with a ``429`` status code. To do this it awaits ``asyncio.sleep``
    (so the event loop is not blocked) for ``initial_backoff`` seconds and then,
    on every subsequent rejection of the same chunk, doubles the wait, up to
    ``max_backoff`` seconds.

    :arg client: instance of :class:`~elasticsearch.Elasticsearch` to use
    :arg actions: iterable containing the actions to be executed
    :arg chunk_size: number of docs in one chunk sent to es (default: 500)
    :arg max_chunk_bytes: the maximum size of the request in bytes (default: 100MB)
    :arg raise_on_error: raise ``BulkIndexError`` containing errors (as `.errors`)
        from the execution of the last chunk when some occur. By default we raise.
    :arg raise_on_exception: if ``False`` then don't propagate exceptions from
        call to ``bulk`` and just report the items that failed as failed.
    :arg expand_action_callback: callback executed on each action passed in,
        should return a tuple containing the action line and the data line
        (`None` if data line should be omitted).
    :arg max_retries: maximum number of times a document will be retried when
        ``429`` is received, set to 0 (default) for no retries on ``429``
    :arg initial_backoff: number of seconds we should wait before the first
        retry. Any subsequent retries will wait ``initial_backoff *
        2**retry_number`` seconds
    :arg max_backoff: maximum number of seconds a retry will wait
    :arg yield_ok: if set to False will skip successful documents in the output
    """
    actions = map(expand_action_callback, actions)

    for bulk_data, bulk_actions in _chunk_actions(actions, chunk_size,
                                                  max_chunk_bytes,
                                                  client.transport.serializer):

        for attempt in range(max_retries + 1):
            to_retry, to_retry_data = [], []
            if attempt:
                await asyncio.sleep(
                    min(max_backoff, initial_backoff * 2**(attempt - 1)))

            try:
                async for ok, info in _process_bulk_chunk(
                        client, bulk_actions, bulk_data, raise_on_exception,
                        raise_on_error, *args, **kwargs):
                    if not ok:
                        action, info = info.popitem()
                        # retry if retries enabled, we get 429, and we are not
                        # in the last attempt
                        if max_retries \
                                and info['status'] == 429 \
                                and (attempt + 1) <= max_retries:
                            # _process_bulk_chunk expects strings so we need to
                            # re-serialize the data
                            to_retry.extend(
                                map(client.transport.serializer.dumps,
                                    bulk_data))
                            to_retry_data.append(bulk_data)
                        else:
                            yield ok, {action: info}
                    elif yield_ok:
                        yield ok, info

            except TransportError as e:
                # suppress 429 errors since we will retry them
                if attempt == max_retries or e.status_code != 429:
                    raise
            else:
                if not to_retry:
                    break
                # retry only subset of documents that didn't succeed
                bulk_actions, bulk_data = to_retry, to_retry_data
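
As the docstring describes, documents rejected with a 429 are retried with exponential backoff: before each retry the coroutine awaits min(max_backoff, initial_backoff * 2**(attempt - 1)) seconds, i.e. 2s, 4s, 8s, ... with the defaults. A minimal usage sketch; the index name and document shape are illustrative assumptions:

async def stream_index(client, docs):
    actions = ({"_index": "my-index", "_source": doc} for doc in docs)
    # yield_ok=False means only failed items are yielded back to the caller.
    async for ok, item in streaming_bulk(client, actions, max_retries=3,
                                         yield_ok=False):
        print("failed:", item)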