Exemplo n.º 1
0
    def reschedule_routers_from_down_agents(self):
        """Reschedule routers from down l3 agents if admin state is up."""

        # give agents extra time to handle transient failures
        agent_dead_limit = cfg.CONF.agent_down_time * 2

        # check for an abrupt clock change since last check. if a change is
        # detected, sleep for a while to let the agents check in.
        tdelta = timeutils.utcnow() - getattr(self, '_clock_jump_canary',
                                              timeutils.utcnow())
        if timeutils.total_seconds(tdelta) > cfg.CONF.agent_down_time:
            LOG.warn(
                _LW("Time since last L3 agent reschedule check has "
                    "exceeded the interval between checks. Waiting "
                    "before check to allow agents to send a heartbeat "
                    "in case there was a clock adjustment."))
            time.sleep(agent_dead_limit)
        self._clock_jump_canary = timeutils.utcnow()

        context = n_ctx.get_admin_context()
        cutoff = timeutils.utcnow() - datetime.timedelta(
            seconds=agent_dead_limit)
        down_bindings = (context.session.query(RouterL3AgentBinding).join(
            agents_db.Agent).filter(
                agents_db.Agent.heartbeat_timestamp < cutoff,
                agents_db.Agent.admin_state_up
            ).outerjoin(
                l3_attrs_db.RouterExtraAttributes,
                l3_attrs_db.RouterExtraAttributes.router_id ==
                RouterL3AgentBinding.router_id).filter(
                    sa.or_(
                        l3_attrs_db.RouterExtraAttributes.ha == sql.false(),
                        l3_attrs_db.RouterExtraAttributes.ha == sql.null())))
        try:
            for binding in down_bindings:
                LOG.warn(
                    _LW("Rescheduling router %(router)s from agent %(agent)s "
                        "because the agent did not report to the server in "
                        "the last %(dead_time)s seconds."), {
                            'router': binding.router_id,
                            'agent': binding.l3_agent_id,
                            'dead_time': agent_dead_limit
                        })
                try:
                    self.reschedule_router(context, binding.router_id)
                except (l3agentscheduler.RouterReschedulingFailed,
                        messaging.RemoteError):
                    # Catch individual router rescheduling errors here
                    # so one broken one doesn't stop the iteration.
                    LOG.exception(_LE("Failed to reschedule router %s"),
                                  binding.router_id)
        except db_exc.DBError:
            # Catch DB errors here so a transient DB connectivity issue
            # doesn't stop the loopingcall.
            LOG.exception(
                _LE("Exception encountered during router "
                    "rescheduling."))
Exemplo n.º 2
0
    def reschedule_routers_from_down_agents(self):
        """Reschedule routers from down l3 agents if admin state is up."""

        # give agents extra time to handle transient failures
        agent_dead_limit = cfg.CONF.agent_down_time * 2

        # check for an abrupt clock change since last check. if a change is
        # detected, sleep for a while to let the agents check in.
        tdelta = timeutils.utcnow() - getattr(self, '_clock_jump_canary',
                                              timeutils.utcnow())
        if timeutils.total_seconds(tdelta) > cfg.CONF.agent_down_time:
            LOG.warn(_LW("Time since last L3 agent reschedule check has "
                         "exceeded the interval between checks. Waiting "
                         "before check to allow agents to send a heartbeat "
                         "in case there was a clock adjustment."))
            time.sleep(agent_dead_limit)
        self._clock_jump_canary = timeutils.utcnow()

        context = n_ctx.get_admin_context()
        cutoff = timeutils.utcnow() - datetime.timedelta(
            seconds=agent_dead_limit)
        down_bindings = (
            context.session.query(RouterL3AgentBinding).
            join(agents_db.Agent).
            filter(agents_db.Agent.heartbeat_timestamp < cutoff,
                   agents_db.Agent.admin_state_up).
            outerjoin(l3_attrs_db.RouterExtraAttributes,
                      l3_attrs_db.RouterExtraAttributes.router_id ==
                      RouterL3AgentBinding.router_id).
            filter(sa.or_(l3_attrs_db.RouterExtraAttributes.ha == sql.false(),
                          l3_attrs_db.RouterExtraAttributes.ha == sql.null())))
        try:
            for binding in down_bindings:
                LOG.warn(_LW(
                    "Rescheduling router %(router)s from agent %(agent)s "
                    "because the agent did not report to the server in "
                    "the last %(dead_time)s seconds."),
                    {'router': binding.router_id,
                     'agent': binding.l3_agent_id,
                     'dead_time': agent_dead_limit})
                try:
                    self.reschedule_router(context, binding.router_id)
                except (l3agentscheduler.RouterReschedulingFailed,
                        messaging.RemoteError):
                    # Catch individual router rescheduling errors here
                    # so one broken one doesn't stop the iteration.
                    LOG.exception(_LE("Failed to reschedule router %s"),
                                  binding.router_id)
        except db_exc.DBError:
            # Catch DB errors here so a transient DB connectivity issue
            # doesn't stop the loopingcall.
            LOG.exception(_LE("Exception encountered during router "
                              "rescheduling."))
Exemplo n.º 3
0
    def _is_exact_match(cron, ts):
        """Handle edge in case when both parameters are equal.

        Handle edge case where if the timestamp is the same as the
        cron point in time to the minute, croniter returns the previous
        start, not the current. We can check this by first going one
        step back and then one step forward and check if we are
        at the original point in time.
        """
        cron.get_prev()
        diff = timeutils.total_seconds(ts - cron.get_next(datetime.datetime))
        return abs(diff) < 60  # minute precision
Exemplo n.º 4
0
 def wait_down_agents(self, agent_type, agent_dead_limit):
     """Gives chance for agents to send a heartbeat."""
     # check for an abrupt clock change since last check. if a change is
     # detected, sleep for a while to let the agents check in.
     tdelta = timeutils.utcnow() - getattr(self, '_clock_jump_canary',
                                           timeutils.utcnow())
     if timeutils.total_seconds(tdelta) > cfg.CONF.agent_down_time:
         LOG.warn(_LW("Time since last %s agent reschedule check has "
                      "exceeded the interval between checks. Waiting "
                      "before check to allow agents to send a heartbeat "
                      "in case there was a clock adjustment."), agent_type)
         time.sleep(agent_dead_limit)
     self._clock_jump_canary = timeutils.utcnow()
Exemplo n.º 5
0
 def test_total_seconds(self):
     delta = datetime.timedelta(days=1, hours=2, minutes=3, seconds=4.5)
     self.assertAlmostEquals(93784.5,
                             timeutils.total_seconds(delta))
Exemplo n.º 6
0
def service_is_up(service):
    """Check whether a service is up based on last heartbeat."""
    last_heartbeat = service['updated_at'] or service['created_at']
    # Timestamps in DB are UTC.
    elapsed = timeutils.total_seconds(timeutils.utcnow() - last_heartbeat)
    return abs(elapsed) <= CONF.service_down_time
Exemplo n.º 7
0
 def test_total_seconds(self):
     delta = datetime.timedelta(days=1, hours=2, minutes=3, seconds=4.5)
     self.assertAlmostEquals(93784.5, timeutils.total_seconds(delta))
Exemplo n.º 8
0
def service_is_up(service):
    """Check whether a service is up based on last heartbeat."""
    last_heartbeat = service['updated_at'] or service['created_at']
    # Timestamps in DB are UTC.
    elapsed = timeutils.total_seconds(timeutils.utcnow() - last_heartbeat)
    return abs(elapsed) <= CONF.service_down_time