async def _chunk(self, actions, chunk_size, params):
    futures = []
    async with stream.chunks(actions, chunk_size).stream() as chunks:
        async for chunk in chunks:
            logger.debug('Elasticsearch bulk chunk size: %d' % len(chunk))
            # yield the results of all futures that have already completed
            for future in [future for future in futures if future.done()]:
                yield future.result()
                # drop the future from the list once its result is yielded
                futures.remove(future)
            logger.debug('Elasticsearch async helper bulk semaphore value: %d' % self.semaphore._value)
            await self.semaphore.acquire()
            future = async_helpers.bulk(
                client=self.es,
                actions=chunk,
                chunk_size=self.chunk_size,
                max_retries=3,
                initial_backoff=0.3,
                max_backoff=3,
                params=params,
                semaphore=self.semaphore,
            )
            futures.append(asyncio.ensure_future(future))
    # wait for all remaining futures to complete
    if futures:
        done, _ = await asyncio.wait(futures)
        for future in done:
            yield future.result()
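
# --- Hedged usage sketch (not from the source): the bounded-concurrency
# pattern _chunk relies on, with a dummy do_bulk coroutine standing in for
# elasticsearch's async bulk helper. The names do_bulk/bounded_bulk are made
# up for illustration; only aiostream and asyncio are real APIs here.
import asyncio
from aiostream import stream


async def do_bulk(chunk, semaphore):
    try:
        await asyncio.sleep(0.1)  # pretend to index the chunk
        return len(chunk)
    finally:
        semaphore.release()  # release so the producer can schedule the next bulk


async def bounded_bulk(actions, chunk_size=2, max_concurrency=3):
    semaphore = asyncio.Semaphore(max_concurrency)
    futures = []
    async with stream.chunks(stream.iterate(actions), chunk_size).stream() as chunks:
        async for chunk in chunks:
            # drain already-finished bulks before scheduling more work
            for future in [f for f in futures if f.done()]:
                yield future.result()
                futures.remove(future)
            await semaphore.acquire()  # blocks once max_concurrency bulks are in flight
            futures.append(asyncio.ensure_future(do_bulk(chunk, semaphore)))
    if futures:
        done, _ = await asyncio.wait(futures)
        for future in done:
            yield future.result()


async def _demo_bounded_bulk():
    async for n_indexed in bounded_bulk(range(10)):
        print('indexed:', n_indexed)

asyncio.run(_demo_bounded_bulk())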
async def _f(self, pop_paths, cursor, batch_size):
    async with stream.chunks(cursor, batch_size).stream() as chunks:
        async for chunk in chunks:
            # populate every path on the whole batch before yielding single docs
            l_docs = chunk
            for pop_path in pop_paths:
                l_docs = await self._populate(l_docs, pop_path)
            for doc in l_docs:
                yield doc
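
# --- Hedged usage sketch (not from the source): the batch-populate pattern
# _f implements, with fake_cursor and populate_author as made-up stand-ins
# for the real async cursor and self._populate.
import asyncio
from aiostream import stream


async def fake_cursor():
    for i in range(5):
        yield {'_id': i, 'author': f'user-{i}'}


async def populate_author(docs, path):
    # pretend to resolve each reference at `path` into the referenced document
    return [{**doc, path: {'name': doc[path]}} for doc in docs]


async def populated(cursor, batch_size=2):
    # populating per batch amortizes the lookup cost over batch_size documents
    async with stream.chunks(cursor, batch_size).stream() as chunks:
        async for chunk in chunks:
            docs = await populate_author(chunk, 'author')
            for doc in docs:
                yield doc


async def _demo_populated():
    async for doc in populated(fake_cursor()):
        print(doc)

asyncio.run(_demo_populated())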
async def run_until_complete(
    self,
    iterable: InputSequenceType,
    n_total: Optional[int] = None,
) -> Any:
    try:
        self._n_total = n_total or len(iterable)  # type: ignore
    except Exception:
        pass

    assert self._start_time is None  # can't reuse Job instances
    self._start_time = time.time()

    try:
        chunk_stream = stream.chunks(stream.iterate(iterable), self.chunk_size)
        async with self.mapper as mapper_url, chunk_stream.stream() as chunk_gen:
            async for response in utils.limited_as_completed_from_async_coro_gen(
                (self._request(mapper_url, chunk) async for chunk in chunk_gen),
                self.mapper.n_mappers,
            ):
                response_tuple = await response
                self._reduce_chunk(*response_tuple)

        if self._n_total is None:
            self._n_total = self._n_successful + self._n_failed
        else:
            assert self._n_total == self._n_successful + self._n_failed

        self.reducer.finish()
        return self.result
    finally:
        self._end_time = time.time()
        if self.owns_session:
            await self.session.close()
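
# --- Hedged sketch (not from the source): one plausible shape for a helper
# like utils.limited_as_completed_from_async_coro_gen, which is not shown
# above. It keeps at most `limit` coroutines from an async generator in
# flight and yields finished tasks as they complete, matching the
# `await response` the caller performs on each yielded item.
import asyncio


async def limited_as_completed(coro_gen, limit):
    pending = set()
    async for coro in coro_gen:
        pending.add(asyncio.ensure_future(coro))
        while len(pending) >= limit:
            done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
            for task in done:
                yield task
    # the generator is exhausted; drain whatever is still in flight
    while pending:
        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            yield task


async def _fetch(i):
    await asyncio.sleep(0.05 * (i % 3))
    return i


async def _coro_gen():
    for i in range(6):
        yield _fetch(i)


async def _demo_limited():
    async for task in limited_as_completed(_coro_gen(), limit=2):
        print('done:', task.result())  # awaiting the yielded task also works

asyncio.run(_demo_limited())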
async def merge_graph_process(
    db: GraphDB,
    event_sender: AnalyticsEventSender,
    args: Namespace,
    content: AsyncGenerator[Union[bytes, Json], None],
    max_wait: timedelta,
    maybe_batch: Optional[str],
) -> GraphUpdate:
    change_id = maybe_batch if maybe_batch else uuid_str()
    write = Queue()  # type: ignore
    read = Queue()  # type: ignore
    # the process reads from our write queue and vice versa
    updater = DbUpdaterProcess(write, read, args)
    # consider the communication dead after this amount of time
    stale = timedelta(seconds=5).total_seconds()
    deadline = utc() + max_wait
    dead_adjusted = False

    async def send_to_child(pa: ProcessAction) -> bool:
        alive = updater.is_alive()
        if alive:
            await run_async(write.put, pa, True, stale)
        return alive

    def read_results() -> Task:  # type: ignore # pypy
        async def read_forever() -> GraphUpdate:
            nonlocal deadline
            nonlocal dead_adjusted
            while utc() < deadline:
                # after the updater exits: adjust the deadline once
                if not updater.is_alive() and not dead_adjusted:
                    log.debug("Import process done or dead. Adjust deadline.")
                    deadline = utc() + timedelta(seconds=30)
                    dead_adjusted = True
                try:
                    action = await run_async(read.get, True, stale)
                    if isinstance(action, EmitAnalyticsEvent):
                        await event_sender.capture(action.event)
                    elif isinstance(action, Result):
                        return action.get_value()
                except Empty:
                    # empty is fine
                    pass
            raise ImportAborted(f"Import process died. (ExitCode: {updater.exitcode})")

        return asyncio.create_task(read_forever())

    task: Optional[Task] = None  # type: ignore # pypy
    result: Optional[GraphUpdate] = None
    try:
        # other libraries might have tampered with the value in the meantime
        reset_process_start_method()
        updater.start()
        task = read_results()  # concurrently read the result queue
        chunked: Stream = stream.chunks(content, BatchSize)
        async with chunked.stream() as streamer:  # pylint: disable=no-member
            async for lines in streamer:
                if not await send_to_child(ReadElement(lines)):
                    # in case the child is dead, we should stop
                    break
        await send_to_child(MergeGraph(db.name, change_id, maybe_batch is not None))
        result = cast(GraphUpdate, await task)  # wait for the final result
        return result
    finally:
        if task is not None and not task.done():
            task.cancel()
        if not result:
            # make sure the change is aborted in case of an open transaction
            log.info(f"Abort update manually: {change_id}")
            await db.abort_update(change_id)
        await send_to_child(PoisonPill())
        await run_async(updater.join, stale)
        if updater.is_alive():
            log.warning(f"Process is still alive after poison pill. Terminate process {updater.pid}")
            with suppress(Exception):
                updater.terminate()
            await asyncio.sleep(3)
        if updater.is_alive():
            log.warning(f"Process is still alive after terminate. Kill process {updater.pid}")
            with suppress(Exception):
                updater.kill()
            await asyncio.sleep(3)
        if not updater.is_alive():
            with suppress(Exception):
                updater.close()
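
# --- Hedged sketch (not from the source): the run_async pattern that
# merge_graph_process leans on — offloading blocking multiprocessing.Queue
# calls to a thread so the event loop stays responsive. The real run_async
# lives elsewhere in the codebase; run_async_sketch is only a plausible
# shape for it, built from stdlib APIs.
import asyncio
import functools
from multiprocessing import Queue
from queue import Empty


async def run_async_sketch(fn, *args):
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, functools.partial(fn, *args))


async def _demo_run_async():
    q: Queue = Queue()
    q.put('hello')
    # blocking get with block=True and a 1s timeout, executed off the event loop
    print(await run_async_sketch(q.get, True, 1.0))
    try:
        await run_async_sketch(q.get, True, 0.1)
    except Empty:
        # Empty on timeout is how the read loop above detects a stale child
        print('queue drained; Empty is fine')

asyncio.run(_demo_run_async())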