示例#1
0
    async def _restart_recovery(self) -> None:
        """Run one recovery pass every time a recovery start is signalled.

        Loops until ``self.should_stop`` is true.  Each iteration waits on
        ``signal_recovery_start`` and then, inside an optional tracing span:

        1. sleeps for ``recovery_delay``;
        2. if there are no tables (or the configured store equals
           ``URL("aerospike:")``) only resumes streams and goes back to
           waiting;
        3. flushes buffers and the producer, builds highwaters and offsets
           for the active partitions, builds offsets for the standby
           partitions and seeks the active partitions;
        4. goes back to waiting if ``signal_recovery_start`` was set again
           in the meantime;
        5. if ``need_recovery()`` is true, resumes the active partitions and
           waits on ``signal_recovery_end``; otherwise resumes flow and
           calls ``_set_recovery_ended()``;
        6. seeks, checks and resumes the standby partitions, if any;
        7. awaits ``on_recovery_completed(generation_id)``.

        ``RebalanceAgain`` and ``IllegalStateError`` abandon the current
        pass and wait for the next signal, ``ServiceStopped`` ends the loop,
        and any other exception is re-raised.  The tracing spans collected
        for the pass are finished on every one of those exits.

        Raises:
            ConsistencyError: if ``recovery_consistency_check`` is enabled
                and an offset is greater than the matching highwater.
        """
        consumer = self.app.consumer
        # Looked up once, before the loop, and reused by every pass.
        active_tps = self.active_tps
        standby_tps = self.standby_tps
        standby_offsets = self.standby_offsets
        standby_highwaters = self.standby_highwaters
        assigned_active_tps = self.active_tps
        assigned_standby_tps = self.standby_tps
        active_offsets = self.active_offsets
        active_highwaters = self.active_highwaters
        while not self.should_stop:
            self.log.dev("WAITING FOR NEXT RECOVERY TO START")
            if await self.wait_for_stopped(self.signal_recovery_start):
                self.signal_recovery_start.clear()
                break  # service was stopped
            self.signal_recovery_start.clear()
            # Captured up front so the whole pass reports the same generation.
            generation_id = self._generation_id
            span: Any = None
            spans: list = []
            tracer: Optional[opentracing.Tracer] = None
            if self.app.tracer:
                tracer = self.app.tracer.get_tracer("_faust")
            if tracer is not None and self._recovery_span:
                span = tracer.start_span("recovery-thread",
                                         child_of=self._recovery_span)
                self.app._span_add_default_tags(span)
                spans.extend([span, self._recovery_span])
            T = traced_from_parent_span(span)

            try:
                await self._wait(T(asyncio.sleep)(self.recovery_delay))

                if not self.tables or self.app.conf.store == URL("aerospike:"):
                    # If there are no tables -- simply resume streams
                    await T(self._resume_streams)(generation_id=generation_id)
                    # ``continue`` bypasses the ``else`` clause below, so
                    # the spans are finished explicitly.
                    for _span in spans:
                        finish_span(_span)
                    continue

                self._set_recovery_started()
                self.standbys_pending = True
                # Must flush any buffers before starting rebalance.
                T(self.flush_buffers)()
                producer = cast(_App, self.app)._producer
                if producer is not None:
                    await self._wait(
                        T(producer.flush)(),
                        timeout=self.app.conf.broker_request_timeout,
                    )

                self.log.dev("Build highwaters for active partitions")
                await self._wait(
                    T(self._build_highwaters)(consumer, assigned_active_tps,
                                              active_highwaters, "active"),
                    timeout=self.app.conf.broker_request_timeout,
                )

                self.log.dev("Build offsets for active partitions")
                await self._wait(
                    T(self._build_offsets)(consumer, assigned_active_tps,
                                           active_offsets, "active"),
                    timeout=self.app.conf.broker_request_timeout,
                )
                if self.app.conf.recovery_consistency_check:
                    for tp in assigned_active_tps:
                        # Falsy offsets/highwaters (None or 0) are skipped.
                        if (active_offsets[tp] and active_highwaters[tp] and
                                active_offsets[tp] > active_highwaters[tp]):
                            raise ConsistencyError(
                                E_PERSISTED_OFFSET.format(
                                    tp,
                                    active_offsets[tp],
                                    active_highwaters[tp],
                                ), )

                self.log.dev("Build offsets for standby partitions")
                await self._wait(
                    T(self._build_offsets)(consumer, assigned_standby_tps,
                                           standby_offsets, "standby"),
                    timeout=self.app.conf.broker_request_timeout,
                )

                self.log.dev("Seek offsets for active partitions")
                await self._wait(
                    T(self._seek_offsets)(consumer, assigned_active_tps,
                                          active_offsets, "active"),
                    timeout=self.app.conf.broker_request_timeout,
                )
                if self.signal_recovery_start.is_set():
                    logger.info("Restarting Recovery")
                    # As with the no-tables path, ``continue`` skips the
                    # ``else`` clause, so finish the spans here; otherwise
                    # they would never be finished on this exit.
                    for _span in spans:
                        finish_span(_span)
                    continue

                if self.need_recovery():
                    self._set_recovery_started()
                    self.standbys_pending = True
                    self.log.info("Restoring state from changelog topics...")
                    T(consumer.resume_partitions)(active_tps)
                    # Resume partitions and start fetching.
                    self.log.info("Resuming flow...")
                    T(self.app.flow_control.resume)()
                    T(consumer.resume_flow)()
                    await T(cast(_App, self.app)._fetcher.maybe_start)()

                    # Wait for actives to be up to date.
                    # This signal will be set by _slurp_changelogs
                    if tracer is not None and span:
                        self._actives_span = tracer.start_span(
                            "recovery-actives",
                            child_of=span,
                            tags={"Active-Stats": self.active_stats()},
                        )
                        # Tag the child span that was just created; the
                        # parent already got its default tags when it was
                        # started.
                        self.app._span_add_default_tags(self._actives_span)
                    try:
                        await self._wait(self.signal_recovery_end.wait())
                    except Exception as exc:
                        # NOTE(review): the error is recorded on the span
                        # but not re-raised, so anything ``_wait`` raises
                        # here is absorbed and execution falls through to
                        # the code below -- confirm this is intended.
                        finish_span(self._actives_span, error=exc)
                    else:
                        finish_span(self._actives_span)
                    finally:
                        self._actives_span = None

                    # recovery done.
                    self.log.info("Done reading from changelog topics")
                    T(consumer.pause_partitions)(active_tps)
                else:
                    self.log.info("Resuming flow...")
                    T(self.app.flow_control.resume)()
                    T(consumer.resume_flow)()
                    self._set_recovery_ended()

                # The changelog partitions only in the active_tps set need to be resumed
                active_only_partitions = active_tps - standby_tps
                if active_only_partitions:
                    # Support for the specific scenario where recovery_buffer=1
                    tps_resuming = [
                        tp for tp in active_only_partitions
                        if self.tp_to_table[tp].recovery_buffer_size == 1
                    ]
                    if tps_resuming:
                        T(consumer.resume_partitions)(tps_resuming)
                        T(self.app.flow_control.resume)()
                        T(consumer.resume_flow)()

                self.log.info("Recovery complete")
                if span:
                    span.set_tag("Recovery-Completed", True)

                if standby_tps:
                    self.log.info("Starting standby partitions...")

                    self.log.dev("Seek standby offsets")
                    await self._wait(
                        T(self._seek_offsets)(consumer, standby_tps,
                                              standby_offsets, "standby"),
                        timeout=self.app.conf.broker_request_timeout,
                    )

                    self.log.dev("Build standby highwaters")
                    await self._wait(
                        T(self._build_highwaters)(
                            consumer,
                            standby_tps,
                            standby_highwaters,
                            "standby",
                        ),
                        timeout=self.app.conf.broker_request_timeout,
                    )
                    if self.app.conf.recovery_consistency_check:
                        for tp in standby_tps:
                            if (standby_offsets[tp] and standby_highwaters[tp]
                                    and standby_offsets[tp] >
                                    standby_highwaters[tp]):
                                raise ConsistencyError(
                                    E_PERSISTED_OFFSET.format(
                                        tp,
                                        standby_offsets[tp],
                                        standby_highwaters[tp],
                                    ), )

                    if tracer is not None and span:
                        self._standbys_span = tracer.start_span(
                            "recovery-standbys",
                            child_of=span,
                            tags={"Standby-Stats": self.standby_stats()},
                        )
                        # Tag the new standby span, not the parent again.
                        self.app._span_add_default_tags(self._standbys_span)
                    self.log.dev("Resume standby partitions")
                    T(consumer.resume_partitions)(standby_tps)
                    T(self.app.flow_control.resume)()
                    T(consumer.resume_flow)()

                # Pause all our topic partitions,
                # to make sure we don't fetch any more records from them.
                await self._wait(T(self.on_recovery_completed)(generation_id))
            except (RebalanceAgain, IllegalStateError) as exc:
                # Both are handled the same way: drop this pass and wait
                # for the next recovery signal.
                self.log.dev("RAISED REBALANCE AGAIN")
                for _span in spans:
                    finish_span(_span, error=exc)
                continue  # another rebalance started
            except ServiceStopped as exc:
                self.log.dev("RAISED SERVICE STOPPED")
                for _span in spans:
                    finish_span(_span, error=exc)
                break  # service was stopped
            except Exception as exc:
                for _span in spans:
                    finish_span(_span, error=exc)
                raise
            else:
                for _span in spans:
                    finish_span(_span)
示例#2
0
    async def _restart_recovery(self) -> None:
        """Run one recovery pass every time a recovery start is signalled.

        Loops until ``self.should_stop`` is true.  Each iteration clears
        ``signal_recovery_reset``, resets ``in_recovery`` to ``False``,
        waits on ``signal_recovery_start`` and then, inside an optional
        tracing span:

        1. sleeps for ``recovery_delay``;
        2. if there are no tables only resumes streams and goes back to
           waiting;
        3. sets ``in_recovery``/``standbys_pending``, flushes buffers and
           the producer, builds highwaters and offsets for the active
           partitions, builds offsets for the standby partitions and seeks
           the active partitions;
        4. if ``need_recovery()`` is true, resumes the active partitions and
           waits on ``signal_recovery_end``; otherwise just resumes flow;
        5. seeks, checks and resumes the standby partitions, if any;
        6. awaits ``on_recovery_completed()``.

        ``RebalanceAgain`` abandons the current pass and waits for the next
        signal, ``ServiceStopped`` ends the loop, and any other exception is
        re-raised; the spans collected for the pass are finished in each
        case.  ``in_recovery`` is ``False`` when the loop ends normally
        (by ``should_stop`` or ``break``).

        Raises:
            ConsistencyError: if an offset is greater than the matching
                highwater for an active or standby partition.
        """
        consumer = self.app.consumer
        # Looked up once, before the loop, and reused by every pass.
        active_tps = self.active_tps
        standby_tps = self.standby_tps
        standby_offsets = self.standby_offsets
        standby_highwaters = self.standby_highwaters
        assigned_active_tps = self.active_tps
        assigned_standby_tps = self.standby_tps
        active_offsets = self.active_offsets
        active_highwaters = self.active_highwaters

        while not self.should_stop:
            self.log.dev('WAITING FOR NEXT RECOVERY TO START')
            self.signal_recovery_reset.clear()
            self.in_recovery = False
            if await self.wait_for_stopped(self.signal_recovery_start):
                self.signal_recovery_start.clear()
                break  # service was stopped
            self.signal_recovery_start.clear()

            span: Any = None
            spans: list = []
            tracer: Optional[opentracing.Tracer] = None
            if self.app.tracer:
                tracer = self.app.tracer.get_tracer('_faust')
            if tracer is not None and self._recovery_span:
                span = tracer.start_span('recovery-thread',
                                         child_of=self._recovery_span)
                self.app._span_add_default_tags(span)
                spans.extend([span, self._recovery_span])
            T = traced_from_parent_span(span)

            try:
                await self._wait(T(asyncio.sleep)(self.recovery_delay))

                if not self.tables:
                    # If there are no tables -- simply resume streams
                    await T(self._resume_streams)()
                    # ``continue`` bypasses the ``else`` clause below, so
                    # the spans are finished explicitly.
                    for _span in spans:
                        finish_span(_span)
                    continue

                self.in_recovery = True
                self.standbys_pending = True
                # Must flush any buffers before starting rebalance.
                T(self.flush_buffers)()
                producer = cast(_App, self.app)._producer
                if producer is not None:
                    await self._wait(T(producer.flush)())

                self.log.dev('Build highwaters for active partitions')
                await self._wait(
                    T(self._build_highwaters)(consumer, assigned_active_tps,
                                              active_highwaters, 'active'))

                self.log.dev('Build offsets for active partitions')
                await self._wait(
                    T(self._build_offsets)(consumer, assigned_active_tps,
                                           active_offsets, 'active'))

                for tp in assigned_active_tps:
                    if active_offsets[tp] > active_highwaters[tp]:
                        raise ConsistencyError(
                            E_PERSISTED_OFFSET.format(
                                tp,
                                active_offsets[tp],
                                active_highwaters[tp],
                            ), )

                self.log.dev('Build offsets for standby partitions')
                await self._wait(
                    T(self._build_offsets)(consumer, assigned_standby_tps,
                                           standby_offsets, 'standby'))

                self.log.dev('Seek offsets for active partitions')
                await self._wait(
                    T(self._seek_offsets)(consumer, assigned_active_tps,
                                          active_offsets, 'active'))

                if self.need_recovery():
                    self.log.info('Restoring state from changelog topics...')
                    T(consumer.resume_partitions)(active_tps)
                    # Resume partitions and start fetching.
                    self.log.info('Resuming flow...')
                    T(consumer.resume_flow)()
                    await T(cast(_App, self.app)._fetcher.maybe_start)()
                    T(self.app.flow_control.resume)()

                    # Wait for actives to be up to date.
                    # This signal will be set by _slurp_changelogs
                    if tracer is not None and span:
                        self._actives_span = tracer.start_span(
                            'recovery-actives',
                            child_of=span,
                            tags={'Active-Stats': self.active_stats()},
                        )
                        # Tag the child span that was just created; the
                        # parent already got its default tags when it was
                        # started.
                        self.app._span_add_default_tags(self._actives_span)
                    try:
                        # Clear first so the wait only returns on a signal
                        # set after this point.
                        self.signal_recovery_end.clear()
                        await self._wait(self.signal_recovery_end)
                    except Exception as exc:
                        # NOTE(review): the error is recorded on the span
                        # but not re-raised, so anything ``_wait`` raises
                        # here is absorbed and execution falls through to
                        # the code below -- confirm this is intended.
                        finish_span(self._actives_span, error=exc)
                    else:
                        finish_span(self._actives_span)
                    finally:
                        self._actives_span = None

                    # recovery done.
                    self.log.info('Done reading from changelog topics')
                    T(consumer.pause_partitions)(active_tps)
                else:
                    self.log.info('Resuming flow...')
                    T(consumer.resume_flow)()
                    T(self.app.flow_control.resume)()

                self.log.info('Recovery complete')
                if span:
                    span.set_tag('Recovery-Completed', True)
                self.in_recovery = False

                if standby_tps:
                    self.log.info('Starting standby partitions...')

                    self.log.dev('Seek standby offsets')
                    await self._wait(
                        T(self._seek_offsets)(consumer, standby_tps,
                                              standby_offsets, 'standby'))

                    self.log.dev('Build standby highwaters')
                    await self._wait(
                        T(self._build_highwaters)(
                            consumer,
                            standby_tps,
                            standby_highwaters,
                            'standby',
                        ), )

                    for tp in standby_tps:
                        if standby_offsets[tp] > standby_highwaters[tp]:
                            raise ConsistencyError(
                                E_PERSISTED_OFFSET.format(
                                    tp,
                                    standby_offsets[tp],
                                    standby_highwaters[tp],
                                ), )

                    if tracer is not None and span:
                        self._standbys_span = tracer.start_span(
                            'recovery-standbys',
                            child_of=span,
                            tags={'Standby-Stats': self.standby_stats()},
                        )
                        # Tag the new standby span, not the parent again.
                        self.app._span_add_default_tags(self._standbys_span)
                    self.log.dev('Resume standby partitions')
                    T(consumer.resume_partitions)(standby_tps)

                # Pause all our topic partitions,
                # to make sure we don't fetch any more records from them.
                await self._wait(asyncio.sleep(0.1))  # still needed?
                await self._wait(T(self.on_recovery_completed)())
            except RebalanceAgain as exc:
                self.log.dev('RAISED REBALANCE AGAIN')
                for _span in spans:
                    finish_span(_span, error=exc)
                continue  # another rebalance started
            except ServiceStopped as exc:
                self.log.dev('RAISED SERVICE STOPPED')
                for _span in spans:
                    finish_span(_span, error=exc)
                break  # service was stopped
            except Exception as exc:
                for _span in spans:
                    finish_span(_span, error=exc)
                raise
            else:
                for _span in spans:
                    finish_span(_span)
            # restart - wait for next rebalance.
        self.in_recovery = False
示例#3
0
    async def _restart_recovery(self) -> None:
        """Run one recovery pass every time a recovery start is signalled.

        Loops until ``self.should_stop`` is true.  Each iteration clears
        ``signal_recovery_reset``, resets ``in_recovery`` to ``False``,
        waits on ``signal_recovery_start`` and then:

        1. sleeps for ``recovery_delay``;
        2. if there are no tables only resumes streams and goes back to
           waiting;
        3. sets ``in_recovery``/``standbys_pending``, flushes buffers and
           the producer (when one is set), builds highwaters and offsets
           for the active partitions, builds offsets for the standby
           partitions and seeks the active partitions;
        4. if ``need_recovery()`` is true, resumes the active partitions and
           waits on ``signal_recovery_end``; otherwise just resumes flow;
        5. seeks, checks and resumes the standby partitions, if any;
        6. awaits ``on_recovery_completed()``.

        ``RebalanceAgain`` abandons the current pass and waits for the next
        signal; ``ServiceStopped`` ends the loop.  Other exceptions are not
        handled here and propagate to the caller.

        Raises:
            ConsistencyError: if an offset is greater than the matching
                highwater for an active or standby partition.
        """
        consumer = self.app.consumer
        # Looked up once, before the loop, and reused by every pass.
        active_tps = self.active_tps
        standby_tps = self.standby_tps
        standby_offsets = self.standby_offsets
        standby_highwaters = self.standby_highwaters
        assigned_active_tps = self.active_tps
        assigned_standby_tps = self.standby_tps
        active_offsets = self.active_offsets
        active_highwaters = self.active_highwaters

        while not self.should_stop:
            self.log.dev('WAITING FOR NEXT RECOVERY TO START')
            self.signal_recovery_reset.clear()
            self.in_recovery = False
            if await self.wait_for_stopped(self.signal_recovery_start):
                self.signal_recovery_start.clear()
                break  # service was stopped
            self.signal_recovery_start.clear()

            try:
                await self._wait(asyncio.sleep(self.recovery_delay))

                if not self.tables:
                    # If there are no tables -- simply resume streams
                    await self._resume_streams()
                    continue

                self.in_recovery = True
                self.standbys_pending = True
                # Must flush any buffers before starting rebalance.
                self.flush_buffers()
                # Only flush when a producer is set, so an unset producer
                # does not fail with AttributeError on ``None.flush``.
                # NOTE(review): assumes ``_producer`` may be None before
                # first use -- confirm against the app implementation.
                producer = self.app._producer
                if producer is not None:
                    await self._wait(producer.flush())

                self.log.dev('Build highwaters for active partitions')
                await self._wait(self._build_highwaters(
                    consumer, assigned_active_tps,
                    active_highwaters, 'active'))

                self.log.dev('Build offsets for active partitions')
                await self._wait(self._build_offsets(
                    consumer, assigned_active_tps, active_offsets, 'active'))

                for tp in assigned_active_tps:
                    if active_offsets[tp] > active_highwaters[tp]:
                        raise ConsistencyError(
                            E_PERSISTED_OFFSET.format(
                                tp,
                                active_offsets[tp],
                                active_highwaters[tp],
                            ),
                        )

                self.log.dev('Build offsets for standby partitions')
                await self._wait(self._build_offsets(
                    consumer, assigned_standby_tps,
                    standby_offsets, 'standby'))

                self.log.dev('Seek offsets for active partitions')
                await self._wait(self._seek_offsets(
                    consumer, assigned_active_tps, active_offsets, 'active'))

                if self.need_recovery():
                    self.log.info('Restoring state from changelog topics...')
                    consumer.resume_partitions(active_tps)
                    # Resume partitions and start fetching.
                    self.log.info('Resuming flow...')
                    consumer.resume_flow()
                    await self.app._fetcher.maybe_start()
                    self.app.flow_control.resume()

                    # Wait for actives to be up to date.
                    # This signal will be set by _slurp_changelogs
                    # Clear first so the wait only returns on a signal set
                    # after this point.
                    self.signal_recovery_end.clear()
                    await self._wait(self.signal_recovery_end)

                    # recovery done.
                    self.log.info('Done reading from changelog topics')
                    consumer.pause_partitions(active_tps)
                else:
                    self.log.info('Resuming flow...')
                    consumer.resume_flow()
                    self.app.flow_control.resume()

                self.log.info('Recovery complete')
                self.in_recovery = False

                if standby_tps:
                    self.log.info('Starting standby partitions...')

                    self.log.dev('Seek standby offsets')
                    await self._wait(
                        self._seek_offsets(
                            consumer, standby_tps, standby_offsets, 'standby'))

                    self.log.dev('Build standby highwaters')
                    await self._wait(
                        self._build_highwaters(
                            consumer,
                            standby_tps,
                            standby_highwaters,
                            'standby',
                        ),
                    )

                    for tp in standby_tps:
                        if standby_offsets[tp] > standby_highwaters[tp]:
                            raise ConsistencyError(
                                E_PERSISTED_OFFSET.format(
                                    tp,
                                    standby_offsets[tp],
                                    standby_highwaters[tp],
                                ),
                            )

                    self.log.dev('Resume standby partitions')
                    consumer.resume_partitions(standby_tps)

                # Pause all our topic partitions,
                # to make sure we don't fetch any more records from them.
                await self._wait(asyncio.sleep(0.1))  # still needed?
                await self._wait(self.on_recovery_completed())
            except RebalanceAgain:
                self.log.dev('RAISED REBALANCE AGAIN')
                continue  # another rebalance started
            except ServiceStopped:
                self.log.dev('RAISED SERVICE STOPPED')
                break  # service was stopped
            # restart - wait for next rebalance.
        self.in_recovery = False