def stop(self, worker_context):
    """Destroy the backing instance and wait for it to disappear.

    Asks Nova (through ``worker_context.nova_client``) to destroy the
    instance, then polls every ``cfg.CONF.retry_delay`` seconds, for up
    to ``cfg.CONF.boot_timeout`` seconds, until Nova no longer returns
    the instance.

    :param worker_context: object exposing the ``nova_client`` used to
                           destroy and look up the instance
    :returns: ``self.state`` once the instance is gone; ``None`` when
              there was no instance to destroy or the timeout expired
    """
    self._ensure_cache(worker_context)
    self.log.info(_LI('Destroying instance'))

    if not self.instance_info:
        self.log.info(_LI('Instance already destroyed.'))
        return

    try:
        worker_context.nova_client.destroy_instance(self.instance_info)
    except Exception:
        # The failure is only logged; the polling loop below still runs.
        self.log.exception(_LE('Error deleting router instance'))

    began_at = time.time()
    while time.time() - began_at < cfg.CONF.boot_timeout:
        still_present = worker_context.nova_client.get_instance_by_id(
            self.instance_info.id_)
        if still_present:
            self.log.debug('Router has not finished stopping')
            time.sleep(cfg.CONF.retry_delay)
            continue

        # A GONE resource keeps that state; anything else becomes DOWN.
        if self.state != states.GONE:
            self.state = states.DOWN
        return self.state

    self.log.error(_LE('Router failed to stop within %d secs'),
                   cfg.CONF.boot_timeout)
def _wait_child(self): try: # Don't block if no child processes have exited pid, status = os.waitpid(0, os.WNOHANG) if not pid: return None except OSError as exc: if exc.errno not in (errno.EINTR, errno.ECHILD): raise return None if os.WIFSIGNALED(status): sig = os.WTERMSIG(status) LOG.info(_LI('Child %(pid)d killed by signal %(sig)d'), dict(pid=pid, sig=sig)) else: code = os.WEXITSTATUS(status) LOG.info(_LI('Child %(pid)s exited with status %(code)d'), dict(pid=pid, code=code)) if pid not in self.children: LOG.warning(_LW('pid %d not in child list'), pid) return None wrap = self.children.pop(pid) wrap.children.remove(pid) return wrap
def send_message(self, message):
    """Queue an incoming message's CRUD for this state machine.

    Called when the worker puts a message in the state machine queue.

    :param message: message exposing ``crud`` and, for REBUILD
                    messages, a ``body`` mapping
    :returns: ``True`` if the CRUD was appended to the queue, ``False``
              if the message was ignored
    """
    if self.deleted:
        # Ignore any more incoming messages
        self.log.debug("deleted state machine, ignoring incoming message %s",
                       message)
        return False

    # NOTE(dhellmann): This check is largely redundant with the
    # one in CalcAction.transition() but it may allow us to avoid
    # adding poll events to the queue at all, and therefore cut
    # down on the number of times a worker thread wakes up to
    # process something on a router that isn't going to actually
    # do any work.
    polling_errored_router = (
        message.crud == POLL and
        self.instance.state == instance_manager.ERROR
    )
    if polling_errored_router:
        self.log.info(_LI("Router status is ERROR, ignoring POLL message: %s"),
                      message)
        return False

    if message.crud == REBUILD:
        custom_image = message.body.get("router_image_uuid")
        if custom_image:
            self.log.info(_LI("Router is being REBUILT with custom image %s"),
                          custom_image)
            self.router_image_uuid = custom_image
        else:
            # No image requested: fall back to the configured one.
            self.router_image_uuid = cfg.CONF.router_image_uuid

    self._queue.append(message.crud)
    depth = len(self._queue)
    # Escalate to a warning once the queue grows past the threshold.
    emit = (self.log.warning if depth > self._queue_warning_threshold
            else self.log.debug)
    emit(_LW("incoming message brings queue length to %s"), depth)
    return True
def _ensure_local_port(self, network_id, subnet_id, network_type,
                       ip_address):
    """Find or create this host's local port on a network and plug it.

    Looks for an existing port owned by ``DEVICE_OWNER_RUG`` whose
    device id is derived from the local hostname and creates one when
    none is listed. It then plugs the local device through the interface
    driver if the device does not exist yet, and initialises L3 on it
    with ``ip_address``.

    :param network_id: id of the network the port belongs to
    :param subnet_id: id of the subnet used for the port's fixed IP
    :param network_type: label used in the port name and log messages
    :param ip_address: address string; anything after a ``/`` is dropped
                       for the fixed IP, while the full value is passed
                       to ``init_l3``
    :returns: the ``Port`` that was found or created
    """
    iface_driver = importutils.import_object(self.conf.interface_driver,
                                             self.conf)
    host_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, socket.gethostname()))
    port_name = 'AKANDA:RUG:%s' % network_type.upper()

    existing = self.api_client.list_ports(
        device_owner=DEVICE_OWNER_RUG,
        device_id=host_id,
        name=port_name,
        network_id=network_id,
    )['ports']

    if existing:
        port = Port.from_dict(existing[0])
        LOG.info(_LI('already have local %s port, using %r'),
                 network_type, port)
    else:
        LOG.info(_LI('creating a new local %s port'), network_type)
        fixed_ip = {
            'ip_address': ip_address.split('/')[0],
            'subnet_id': subnet_id,
        }
        new_port = {
            'admin_state_up': True,
            'network_id': network_id,
            'device_owner': DEVICE_OWNER_ROUTER_INT,  # lying here for IP
            'name': port_name,
            'device_id': host_id,
            'fixed_ips': [fixed_ip],
            'binding:host_id': socket.gethostname(),
        }
        created = self.api_client.create_port({'port': new_port})
        port = Port.from_dict(created['port'])

        # remove lie that enabled us pick IP on slaac subnet
        self.api_client.update_port(
            port.id,
            {'port': {'device_owner': DEVICE_OWNER_RUG}},
        )
        port.device_owner = DEVICE_OWNER_RUG

        LOG.info(_LI('new local %s port: %r'), network_type, port)

    # create the tap interface if it doesn't already exist
    if not ip_lib.device_exists(iface_driver.get_device_name(port)):
        iface_driver.plug(
            port.network_id,
            port.id,
            iface_driver.get_device_name(port),
            port.mac_address,
        )
        # add sleep to ensure that port is setup before use
        time.sleep(1)

    iface_driver.init_l3(iface_driver.get_device_name(port), [ip_address])
    return port
def update_state(self, worker_context, silent=False):
    """Refresh and return the router's state by probing its instance.

    The management address is checked with ``router_api.is_alive`` up to
    ``cfg.CONF.max_retries`` times, sleeping ``cfg.CONF.retry_delay``
    seconds after each failed check. If no check succeeds, the boot
    timeout is evaluated and Nova is asked whether the instance still
    exists.

    :param worker_context: object exposing ``nova_client``
    :param silent: when true, failed alive checks are not logged
    :returns: the resulting ``self.state``
    """
    self._ensure_cache(worker_context)
    if self.state == GONE:
        self.log.debug("not updating state of deleted router")
        return self.state

    if self.instance_info is None:
        self.log.debug("no backing instance, marking router as down")
        self.state = DOWN
        return self.state

    addr = self.instance_info.management_address
    # range() iterates exactly like xrange() here and also exists on
    # Python 3.
    for i in range(cfg.CONF.max_retries):
        if router_api.is_alive(addr, cfg.CONF.akanda_mgt_service_port):
            if self.state != CONFIGURED:
                self.state = UP
            break
        if not silent:
            self.log.debug("Alive check failed. Attempt %d of %d",
                           i, cfg.CONF.max_retries)
        time.sleep(cfg.CONF.retry_delay)
    else:
        old_state = self.state
        self._check_boot_timeout()

        # If the router isn't responding, make sure Nova knows about it
        instance = worker_context.nova_client.get_instance_for_obj(
            self.router_id)
        if instance is None and self.state != ERROR:
            self.log.info(_LI("No instance was found; rebooting"))
            self.state = DOWN
            self.instance_info = None

        # update_state() is called from Alive() to check the
        # status of the router. If we can't talk to the API at
        # that point, the router should be considered missing and
        # we should reboot it, so mark it down if we think it was
        # configured before.
        if old_state == CONFIGURED and self.state != ERROR:
            self.log.debug("Did not find router alive, marking it as down")
            self.state = DOWN

    # The unresponsive branch above clears instance_info when Nova no
    # longer knows the instance; there is no boot to confirm in that
    # case, and dereferencing None would raise AttributeError.
    if self.instance_info is None:
        return self.state

    # After the router is all the way up, record how long it took
    # to boot and accept a configuration.
    if self.instance_info.booting and self.state == CONFIGURED:
        # If we didn't boot the server (because we were restarted
        # while it remained running, for example), we won't have a
        # duration to log.
        self.instance_info.confirm_up()
        if self.instance_info.boot_duration:
            self.log.info(
                _LI("Router booted in %s seconds after %s attempts"),
                self.instance_info.boot_duration.total_seconds(),
                self._boot_counter.count,
            )
        # Always reset the boot counter, even if we didn't boot
        # the server ourself, so we don't accidentally think we
        # have an erroring router.
        self._boot_counter.reset()
    return self.state
def __init__(self, id_, name, tenant_id, network_id, ip_version, cidr,
             gateway_ip, enable_dhcp, dns_nameservers, host_routes,
             ipv6_ra_mode):
    """Store the subnet's attributes, parsing its CIDR and gateway.

    ``cidr`` is parsed with ``netaddr.IPNetwork`` and ``gateway_ip``
    with ``netaddr.IPAddress``. A gateway that cannot be parsed is
    logged and stored as ``None``; every other argument is stored
    unchanged.

    :raises ValueError: if ``cidr`` cannot be parsed
    """
    # Plain attributes are stored as given.
    self.id = id_
    self.name = name
    self.tenant_id = tenant_id
    self.network_id = network_id
    self.ip_version = ip_version
    self.enable_dhcp = enable_dhcp
    self.dns_nameservers = dns_nameservers
    self.host_routes = host_routes
    self.ipv6_ra_mode = ipv6_ra_mode

    try:
        self.cidr = netaddr.IPNetwork(cidr)
    except (TypeError, netaddr.AddrFormatError) as e:
        details = (cidr, id_, network_id, e)
        raise ValueError(
            _('Invalid CIDR %r for subnet %s of network %s: %s') % details
        )

    try:
        self.gateway_ip = netaddr.IPAddress(gateway_ip)
    except (TypeError, netaddr.AddrFormatError) as e:
        # A bad gateway does not invalidate the subnet.
        self.gateway_ip = None
        LOG.info(_LI('Bad gateway_ip on subnet %s: %r (%s)'),
                 id_, gateway_ip, e)
def __init__(self, id_, name, tenant_id, network_id, ip_version, cidr,
             gateway_ip, enable_dhcp, dns_nameservers, host_routes,
             ipv6_ra_mode):
    """Record a subnet, validating its CIDR and gateway address.

    The CIDR must be parseable by ``netaddr.IPNetwork``. A gateway that
    ``netaddr.IPAddress`` rejects is logged and kept as ``None``. All
    remaining arguments are stored on the instance as passed.

    :raises ValueError: when ``cidr`` is rejected by ``netaddr``
    """
    # Pass-through attributes.
    self.id = id_
    self.name = name
    self.tenant_id = tenant_id
    self.network_id = network_id
    self.ip_version = ip_version
    self.enable_dhcp = enable_dhcp
    self.dns_nameservers = dns_nameservers
    self.host_routes = host_routes
    self.ipv6_ra_mode = ipv6_ra_mode

    # Parsed attributes.
    try:
        parsed_cidr = netaddr.IPNetwork(cidr)
    except (TypeError, netaddr.AddrFormatError) as e:
        raise ValueError(
            _('Invalid CIDR %r for subnet %s of network %s: %s') % (
                cidr, id_, network_id, e))
    self.cidr = parsed_cidr

    try:
        parsed_gateway = netaddr.IPAddress(gateway_ip)
    except (TypeError, netaddr.AddrFormatError) as e:
        parsed_gateway = None
        LOG.info(_LI('Bad gateway_ip on subnet %s: %r (%s)'),
                 id_, gateway_ip, e)
    self.gateway_ip = parsed_gateway
def _should_process(self, message):
    """Decide whether a message should be processed.

    :param message: message exposing ``resource.id`` and
                    ``resource.tenant_id``
    :returns: ``False`` when the message must be skipped; otherwise the
              message to process, which for non-wildcard resources is
              the one returned by ``self._populate_resource_id``
    """
    cluster_in_debug, why = self.db_api.global_debug()
    if cluster_in_debug:
        LOG.info('Skipping incoming event, cluster in global debug '
                 'mode. (reason: %s)', why)
        return False

    # Wildcard messages skip the per-resource and per-tenant checks.
    if message.resource.id in commands.WILDCARDS:
        return message

    message = self._populate_resource_id(message)
    if not message.resource.id:
        LOG.info(_LI('Ignoring message with no resource found.'))
        return False

    tenant_id = message.resource.tenant_id
    tenant_in_debug, why = self.db_api.tenant_in_debug(tenant_id)
    if tenant_in_debug:
        LOG.info(
            'Ignoring message intended for tenant %s in debug mode '
            '(reason: %s): %s',
            message.resource.tenant_id, why, message,
        )
        return False

    resource_in_debug, why = self.db_api.resource_in_debug(
        message.resource.id)
    if resource_in_debug:
        LOG.info(
            'Ignoring message intended for resource %s in '
            'debug mode (reason: %s): %s',
            message.resource.id, why, message,
        )
        return False

    return message
def run(self, ip_address, port=cfg.CONF.rug_api_port):
    """Bind the rug-api listening socket and serve requests on it.

    Makes up to five bind attempts. After an attempt that fails because
    the address is not available, it sleeps 1, 2, ... seconds before
    trying again. Once bound, ``RugAPI`` is served on the socket using
    ``self.pool``.

    :param ip_address: address to listen on (bound as ``AF_INET6``)
    :param port: port to listen on
    :raises socket.error: for a bind failure other than
                          ``EADDRNOTAVAIL``
    :raises RuntimeError: if every attempt fails with ``EADDRNOTAVAIL``
    """
    # Function-scope import so this block does not rely on a
    # module-level import it cannot see.
    import errno

    app = RugAPI()

    for i in range(5):
        LOG.info(
            _LI('Starting the rug-api on %s/%s'),
            ip_address, port,
        )
        try:
            sock = eventlet.listen(
                (ip_address, port),
                family=socket.AF_INET6,
                backlog=128
            )
        except socket.error as err:
            # Compare against the symbolic constant: the literal 99 is
            # only EADDRNOTAVAIL on Linux.
            if err.errno != errno.EADDRNOTAVAIL:
                raise
            LOG.warning(_LW('Could not create rug-api socket: %s'), err)
            LOG.warning(_LW('Sleeping %s before trying again'), i + 1)
            eventlet.sleep(i + 1)
        else:
            break
    else:
        # Every attempt hit EADDRNOTAVAIL.
        raise RuntimeError(
            _('Could not establish rug-api socket on %s/%s') %
            (ip_address, port))

    eventlet.wsgi.server(
        sock,
        app,
        custom_pool=self.pool,
        log=loggers.WritableLogger(LOG))
def run(self, ip_address, port=RUG_META_PORT):
    """Bind the metadata proxy socket and serve requests on it.

    Makes up to five bind attempts. After an attempt that fails because
    the address is not available, it sleeps 1, 2, ... seconds before
    trying again. Once bound, ``MetadataProxyHandler`` is served on the
    socket using ``self.pool``.

    :param ip_address: address to listen on (bound as ``AF_INET6``)
    :param port: port to listen on
    :raises socket.error: for a bind failure other than
                          ``EADDRNOTAVAIL``
    :raises RuntimeError: if every attempt fails with ``EADDRNOTAVAIL``
    """
    # Function-scope import so this block does not rely on a
    # module-level import it cannot see.
    import errno

    app = MetadataProxyHandler()

    for i in range(5):
        LOG.info(_LI('Starting the metadata proxy on %s/%s'),
                 ip_address, port)
        try:
            sock = eventlet.listen(
                (ip_address, port),
                family=socket.AF_INET6,
                backlog=128
            )
        except socket.error as err:
            # Compare against the symbolic constant: the literal 99 is
            # only EADDRNOTAVAIL on Linux.
            if err.errno != errno.EADDRNOTAVAIL:
                raise
            LOG.warning(_LW('Could not create metadata proxy socket: %s'),
                        err)
            LOG.warning(_LW('Sleeping %s before trying again'), i + 1)
            eventlet.sleep(i + 1)
        else:
            break
    else:
        # Every attempt hit EADDRNOTAVAIL.
        raise RuntimeError(
            _('Could not establish metadata proxy socket on %s/%s') %
            (ip_address, port))

    eventlet.wsgi.server(
        sock,
        app,
        custom_pool=self.pool,
        log=loggers.WritableLogger(LOG))
def _pipe_watcher(self):
    """Exit with status 1 once reading from ``self.readpipe`` returns."""
    # The read blocks until the write end is closed, which happens when
    # the parent dies unexpectedly.
    self.readpipe.read()

    LOG.info(_LI('Parent process has died unexpectedly, exiting'))

    sys.exit(1)
def boot(self, worker_context, router_image_uuid):
    """Start booting an instance for this router.

    On success the state becomes ``BOOTING`` and ``self.instance_info``
    holds the value returned by ``nova_client.boot_instance``. Failures
    raised while preparing or requesting the boot are logged and leave
    the state at ``DOWN``.

    :param worker_context: object exposing ``neutron`` and
                           ``nova_client``
    :param router_image_uuid: image identifier passed to
                              ``boot_instance``
    :returns: None
    """
    self._ensure_cache(worker_context)
    if self.state == GONE:
        self.log.info(_LI("Not booting deleted router"))
        return

    self.log.info(_LI("Booting router"))
    self.state = DOWN
    self._boot_counter.start()

    def make_vrrp_ports():
        # Callback handed to boot_instance: creates the management port
        # and one VRRP port per network of the router's ports.
        mgt_port = worker_context.neutron.create_management_port(
            self.router_obj.id
        )

        # FIXME(mark): ideally this should be ordered and de-duped
        instance_ports = [
            worker_context.neutron.create_vrrp_port(self.router_obj.id, n)
            for n in (p.network_id for p in self.router_obj.ports)
        ]

        return mgt_port, instance_ports

    try:
        # TODO(mark): make this pluggable
        self._ensure_provider_ports(self.router_obj, worker_context)

        # TODO(mark): make this handle errors more gracefully on cb fail
        # TODO(mark): checkout from a pool - boot on demand for now
        instance_info = worker_context.nova_client.boot_instance(
            self.instance_info,
            self.router_obj.id,
            router_image_uuid,
            make_vrrp_ports
        )
        if not instance_info:
            self.log.info(_LI("Previous router is deleting"))
            # Reset the VM manager, causing the state machine to start
            # again with a new VM.
            self.reset_boot_counter()
            self.instance_info = None
            return
    except Exception:
        # Only ordinary errors are treated as a failed boot; a bare
        # ``except`` would also swallow SystemExit and
        # KeyboardInterrupt.
        self.log.exception(_LE("Router failed to start boot"))
        # TODO(mark): attempt clean-up of failed ports
        return
    else:
        # We have successfully started a (re)boot attempt so
        # record the timestamp so we can report how long it takes.
        self.state = BOOTING
        self.instance_info = instance_info
def report_status(self, show_config=True):
    """Log a summary of the worker's current status.

    Reports the work queue size, the number of tenant resource managers,
    the liveness and last recorded status of each thread, and the
    tenants and resources currently in debug mode.

    :param show_config: when true, also log the configuration option
                        values at INFO level
    :returns: None
    """
    if show_config:
        cfg.CONF.log_opt_values(LOG, INFO)
    LOG.info(_LI(
        'Number of state machines in work queue: %d'),
        self.work_queue.qsize()
    )
    LOG.info(_LI(
        'Number of tenant resource managers managed: %d'),
        len(self.tenant_managers)
    )
    for thread in self.threads:
        # is_alive() replaces isAlive(), which was removed from
        # threading.Thread in Python 3.9.
        LOG.info(_LI(
            'Thread %s is %s. Last seen: %s'),
            thread.name,
            'alive' if thread.is_alive() else 'DEAD',
            self._thread_status.get(thread.name, 'UNKNOWN'),
        )

    debug_tenants = self.db_api.tenants_in_debug()
    if debug_tenants:
        for t_uuid, reason in debug_tenants:
            LOG.info(_LI('Debugging tenant: %s (reason: %s)'),
                     t_uuid, reason)
    else:
        LOG.info(_LI('No tenants in debug mode'))

    debug_resources = self.db_api.resources_in_debug()
    if debug_resources:
        for resource_id, reason in debug_resources:
            LOG.info(_LI('Debugging resource: %s (reason: %s)'),
                     resource_id, reason)
    else:
        LOG.info(_LI('No resources in debug mode'))
def _start_child(self, wrap):
    """Fork one worker process for ``wrap`` and register its pid.

    In the child, ``self._child_process(wrap.service)`` is run, the
    service is stopped, and the process leaves through ``os._exit`` with
    the resulting status. In the parent, the new pid is added to
    ``wrap.children`` and mapped to ``wrap`` in ``self.children``.

    :param wrap: wrapper exposing ``forktimes``, ``workers``,
                 ``service`` and ``children``
    :returns: pid of the forked child (reached in the parent only)
    """
    if len(wrap.forktimes) > wrap.workers:
        # Limit ourselves to one process a second (over the period of
        # number of workers * 1 second). This will allow workers to
        # start up quickly but ensure we don't fork off children that
        # die instantly too quickly.
        if time.time() - wrap.forktimes[0] < wrap.workers:
            LOG.info(_LI('Forking too fast, sleeping'))
            time.sleep(1)

        # Discard the oldest recorded fork time.
        wrap.forktimes.pop(0)

    wrap.forktimes.append(time.time())

    pid = os.fork()
    if pid == 0:
        # NOTE(johannes): All exceptions are caught to ensure this
        # doesn't fallback into the loop spawning children. It would
        # be bad for a child to spawn more children.
        status = 0
        try:
            self._child_process(wrap.service)
        except SignalExit as exc:
            signame = {
                signal.SIGTERM: 'SIGTERM',
                signal.SIGINT: 'SIGINT'
            }[exc.signo]
            LOG.info(_LI('Caught %s, exiting'), signame)
            status = exc.code
        except SystemExit as exc:
            status = exc.code
        except BaseException:
            LOG.exception(_LE('Unhandled exception'))
            status = 2
        finally:
            wrap.service.stop()

        # os._exit() ends the child immediately, so it never reaches the
        # parent-only bookkeeping below.
        os._exit(status)

    LOG.info(_LI('Started child %d'), pid)

    wrap.children.add(pid)
    self.children[pid] = wrap

    return pid
def _start_child(self, wrap):
    """Fork a worker process for ``wrap`` and track it in the parent.

    The child runs ``self._child_process(wrap.service)``, stops the
    service, and terminates via ``os._exit`` with a status derived from
    how that call ended. The parent records the pid on ``wrap.children``
    and in ``self.children``.

    :param wrap: wrapper exposing ``forktimes``, ``workers``,
                 ``service`` and ``children``
    :returns: pid of the forked child (reached in the parent only)
    """
    if len(wrap.forktimes) > wrap.workers:
        # Limit ourselves to one process a second (over the period of
        # number of workers * 1 second). This will allow workers to
        # start up quickly but ensure we don't fork off children that
        # die instantly too quickly.
        if time.time() - wrap.forktimes[0] < wrap.workers:
            LOG.info(_LI('Forking too fast, sleeping'))
            time.sleep(1)

        # Discard the oldest recorded fork time.
        wrap.forktimes.pop(0)

    wrap.forktimes.append(time.time())

    pid = os.fork()
    if pid == 0:
        # NOTE(johannes): All exceptions are caught to ensure this
        # doesn't fallback into the loop spawning children. It would
        # be bad for a child to spawn more children.
        status = 0
        try:
            self._child_process(wrap.service)
        except SignalExit as exc:
            signame = {signal.SIGTERM: 'SIGTERM',
                       signal.SIGINT: 'SIGINT'}[exc.signo]
            LOG.info(_LI('Caught %s, exiting'), signame)
            status = exc.code
        except SystemExit as exc:
            status = exc.code
        except BaseException:
            LOG.exception(_LE('Unhandled exception'))
            status = 2
        finally:
            wrap.service.stop()

        # os._exit() ends the child immediately, so it never reaches the
        # parent-only bookkeeping below.
        os._exit(status)

    LOG.info(_LI('Started child %d'), pid)

    wrap.children.add(pid)
    self.children[pid] = wrap

    return pid
def execute(self, action, worker_context):
    """Boot the router instance unless it is stuck in a boot loop.

    :param action: value handed back to the caller unchanged
    :param worker_context: passed through to the instance manager
    :returns: ``action``
    """
    max_attempts = self.params.reboot_error_threshold

    # Check for a loop where the router keeps failing to boot or
    # accept the configuration.
    if self.instance.attempts < max_attempts:
        self.instance.boot(worker_context, self.router_image_uuid)
        self.log.debug("CreateInstance attempt %s/%s",
                       self.instance.attempts,
                       self.params.reboot_error_threshold)
        return action

    self.log.info(_LI("Dropping out of boot loop after %s trials"),
                  self.instance.attempts)
    self.instance.set_error(worker_context)
    return action
def check_boot(self, worker_context):
    """Report whether the router has booted and accepted its config.

    Refreshes the state silently. When the router is ``UP`` or
    ``CONFIGURED``, one configuration attempt is made; if the router is
    still not ``CONFIGURED`` afterwards, the boot timeout is checked.

    :param worker_context: passed to ``update_state`` and ``configure``
    :returns: ``True`` only when the state ends up ``CONFIGURED``
    """
    current = self.update_state(worker_context, silent=True)
    if current not in (UP, CONFIGURED):
        self.log.debug("Router is %s", self.state.upper())
        return False

    self.log.info(_LI("Router has booted, attempting initial config"))
    self.configure(worker_context, BOOTING, attempts=1)
    if self.state != CONFIGURED:
        self._check_boot_timeout()
    return self.state == CONFIGURED
def update_router_status(self, router_id, status):
    """Tell neutron a router's status, ignoring any failure.

    :param router_id: id of the router to update
    :param status: status value to report
    :returns: None
    """
    try:
        self.api_client.update_router_status(router_id, status)
    except Exception as err:
        # We don't want to die just because we can't tell neutron
        # what the status of the router should be. Log the error
        # but otherwise ignore it.
        LOG.info(
            _LI('ignoring failure to update status for router %s to %s: %s'),
            router_id, status, err,
        )
def get_network_subnets(self, network_id):
    """Return the ``Subnet`` objects for a network.

    Entries that ``Subnet.from_dict`` cannot convert are logged and left
    out of the result.

    :param network_id: id of the network whose subnets are listed
    :returns: list of ``Subnet`` objects, possibly empty
    """
    listing = self.api_client.list_subnets(network_id=network_id)

    converted = []
    for raw in listing['subnets']:
        try:
            subnet = Subnet.from_dict(raw)
        except Exception as err:
            LOG.info(_LI('ignoring subnet %s (%s) on network %s: %s'),
                     raw.get('id'), raw.get('cidr'),
                     network_id, err)
        else:
            converted.append(subnet)
    return converted
def stop(self, worker_context):
    """Destroy the router's instance and wait for Nova to drop it.

    After requesting the destroy, polls every ``cfg.CONF.retry_delay``
    seconds, for up to ``cfg.CONF.boot_timeout`` seconds, until Nova no
    longer returns the instance. The state is then set to ``DOWN``
    unless it is already ``GONE``.

    :param worker_context: object exposing ``nova_client``
    :returns: None
    """
    self._ensure_cache(worker_context)
    if self.state == GONE:
        self.log.info(_LI("Destroying router neutron has deleted"))
    else:
        self.log.info(_LI("Destroying router"))

    # With no instance info there is nothing to destroy, and the polling
    # loop below would fail on ``instance_info.id_``.
    if not self.instance_info:
        self.log.debug("No backing instance, nothing to destroy")
        return

    # Bind the client before the try block so the polling loop can never
    # find the name unbound.
    nova_client = worker_context.nova_client
    try:
        nova_client.destroy_instance(self.instance_info)
    except Exception:
        # The failure is only logged; the polling loop below still runs.
        self.log.exception(_LE("Error deleting router instance"))

    start = time.time()
    while time.time() - start < cfg.CONF.boot_timeout:
        if not nova_client.get_instance_by_id(self.instance_info.id_):
            if self.state != GONE:
                self.state = DOWN
            return
        self.log.debug("Router has not finished stopping")
        time.sleep(cfg.CONF.retry_delay)
    self.log.error(_LE("Router failed to stop within %d secs"),
                   cfg.CONF.boot_timeout)
def send_message(self, message):
    """Queue an incoming message's CRUD for this state machine.

    Called when the worker puts a message in the state machine queue.

    :param message: message exposing ``crud`` and, for REBUILD
                    messages, a ``body`` mapping
    :returns: ``True`` if the CRUD was appended to the queue, ``False``
              if the message was ignored
    """
    log = self.driver.log

    if self.deleted:
        # Ignore any more incoming messages
        log.debug(
            'deleted state machine, ignoring incoming message %s',
            message)
        return False

    # NOTE(dhellmann): This check is largely redundant with the
    # one in CalcAction.transition() but it may allow us to avoid
    # adding poll events to the queue at all, and therefore cut
    # down on the number of times a worker thread wakes up to
    # process something on a router that isn't going to actually
    # do any work.
    polling_errored_resource = (
        message.crud == POLL and
        self.instance.state == states.ERROR
    )
    if polling_errored_resource:
        log.info(
            _LI('Resource status is ERROR, ignoring POLL message: %s'),
            message,
        )
        return False

    if message.crud == REBUILD:
        custom_image = message.body.get('image_uuid')
        if custom_image:
            log.info(
                _LI('Resource is being REBUILT with custom image %s'),
                custom_image)
            self.image_uuid = custom_image
        else:
            # No image requested: fall back to the driver's image.
            self.image_uuid = self.driver.image_uuid

    self._queue.append(message.crud)
    depth = len(self._queue)
    # Escalate to a warning once the queue grows past the threshold.
    emit = (log.warning if depth > self._queue_warning_threshold
            else log.debug)
    emit(_LW('incoming message brings queue length to %s'), depth)
    return True
def update_router_status(self, router_id, status):
    """Tell neutron a router's status, ignoring any failure.

    :param router_id: id of the router to update
    :param status: status value to report
    :returns: None
    """
    try:
        self.api_client.update_router_status(router_id, status)
    except Exception as e:
        # We don't want to die just because we can't tell neutron
        # what the status of the router should be. Log the error
        # but otherwise ignore it.
        LOG.info(
            _LI('ignoring failure to update status for %s to %s: %s'),
            # Log the router id; the bare name ``id`` is the builtin
            # function, not the router being updated.
            router_id, status, e,
        )
def execute(self, action, worker_context):
    """Boot the resource's instance unless it is stuck in a boot loop.

    :param action: value handed back to the caller unchanged
    :param worker_context: passed through to the instance manager
    :returns: ``action``
    """
    max_attempts = self.params.reboot_error_threshold

    # Check for a loop where the resource keeps failing to boot or
    # accept the configuration.
    if self.instance.attempts < max_attempts:
        self.instance.boot(worker_context)
        self.params.driver.log.debug(
            'CreateInstance attempt %s/%s',
            self.instance.attempts,
            self.params.reboot_error_threshold)
        return action

    self.params.driver.log.info(
        _LI('Dropping out of boot loop after '
            ' %s trials'),
        self.instance.attempts)
    self.instance.set_error(worker_context)
    return action
def execute(self, action, worker_context):
    """Request a boot of the instance, or flag an error after too many tries.

    :param action: value handed back to the caller unchanged
    :param worker_context: passed through to the instance manager
    :returns: ``action``
    """
    log = self.params.driver.log

    # Check for a loop where the resource keeps failing to boot or
    # accept the configuration.
    in_boot_loop = (
        self.instance.attempts >= self.params.reboot_error_threshold
    )
    if in_boot_loop:
        log.info(_LI('Dropping out of boot loop after '
                     ' %s trials'),
                 self.instance.attempts)
        self.instance.set_error(worker_context)
    else:
        self.instance.boot(worker_context)
        log.debug('CreateInstance attempt %s/%s',
                  self.instance.attempts,
                  self.params.reboot_error_threshold)
    return action
def wait(self):
    """Loop waiting on children to die and respawning as necessary

    While ``self.running`` is true, each reaped child's wrapper is
    topped back up to ``wrap.workers`` children. Once the loop ends,
    every remaining child is sent SIGTERM and reaped.

    :raises OSError: if signalling a child fails for a reason other
                     than ``ESRCH``
    """

    LOG.debug('Full set of CONF:')
    CONF.log_opt_values(LOG, std_logging.DEBUG)

    while self.running:
        wrap = self._wait_child()
        if not wrap:
            # Yield to other threads if no children have exited
            # Sleep for a short time to avoid excessive CPU usage
            # (see bug #1095346)
            eventlet.greenthread.sleep(.01)
            continue

        # Respawn until this wrapper has its full number of workers.
        while self.running and len(wrap.children) < wrap.workers:
            self._start_child(wrap)

    if self.sigcaught:
        signame = {
            signal.SIGTERM: 'SIGTERM',
            signal.SIGINT: 'SIGINT'
        }[self.sigcaught]
        LOG.info(_LI('Caught %s, stopping children'), signame)

    for pid in self.children:
        try:
            os.kill(pid, signal.SIGTERM)
        except OSError as exc:
            # ESRCH is ignored; any other failure is re-raised.
            if exc.errno != errno.ESRCH:
                raise

    # Wait for children to die
    if self.children:
        LOG.info(_LI('Waiting on %d children to exit'), len(self.children))
        while self.children:
            self._wait_child()
def wait(self):
    """Loop waiting on children to die and respawning as necessary

    While ``self.running`` is true, each reaped child's wrapper is
    topped back up to ``wrap.workers`` children. Once the loop ends,
    every remaining child is sent SIGTERM and reaped.

    :raises OSError: if signalling a child fails for a reason other
                     than ``ESRCH``
    """

    LOG.debug('Full set of CONF:')
    CONF.log_opt_values(LOG, std_logging.DEBUG)

    while self.running:
        wrap = self._wait_child()
        if not wrap:
            # Yield to other threads if no children have exited
            # Sleep for a short time to avoid excessive CPU usage
            # (see bug #1095346)
            eventlet.greenthread.sleep(.01)
            continue

        # Respawn until this wrapper has its full number of workers.
        while self.running and len(wrap.children) < wrap.workers:
            self._start_child(wrap)

    if self.sigcaught:
        signame = {signal.SIGTERM: 'SIGTERM',
                   signal.SIGINT: 'SIGINT'}[self.sigcaught]
        LOG.info(_LI('Caught %s, stopping children'), signame)

    for pid in self.children:
        try:
            os.kill(pid, signal.SIGTERM)
        except OSError as exc:
            # ESRCH is ignored; any other failure is re-raised.
            if exc.errno != errno.ESRCH:
                raise

    # Wait for children to die
    if self.children:
        LOG.info(
            _LI('Waiting on %d children to exit'), len(self.children))
        while self.children:
            self._wait_child()
def stop(self):
    """Shutdown all workers cleanly.

    Every worker's queue first receives ``None`` as a stop message.
    Then each queue is closed and its worker joined, in order.
    """
    LOG.info('shutting down scheduler')

    # Send a poison pill to all of the workers
    for entry in self.workers:
        LOG.debug('sending stop message to %s', entry['worker'].name)
        entry['queue'].put(None)

    # Wait for the workers to finish and be ready to exit.
    for entry in self.workers:
        worker_proc = entry['worker']
        LOG.debug('waiting for queue for %s', worker_proc.name)
        entry['queue'].close()
        LOG.debug('waiting for worker %s', worker_proc.name)
        worker_proc.join()

    LOG.info(_LI('scheduler shutdown'))
def run(self, ip_address, port=cfg.CONF.rug_api_port):
    """Bind the rug-api listening socket and serve requests on it.

    Makes up to five bind attempts. After an attempt that fails because
    the address is not available, it sleeps 1, 2, ... seconds before
    trying again. Once bound, ``RugAPI`` is served on the socket using
    ``self.pool``.

    :param ip_address: address to listen on (bound as ``AF_INET6``)
    :param port: port to listen on
    :raises socket.error: for a bind failure other than
                          ``EADDRNOTAVAIL``
    :raises RuntimeError: if every attempt fails with ``EADDRNOTAVAIL``
    """
    # Function-scope import so this block does not rely on a
    # module-level import it cannot see.
    import errno

    app = RugAPI()

    for i in range(5):
        LOG.info(_LI("Starting the rug-api on %s/%s"), ip_address, port)
        try:
            sock = eventlet.listen(
                (ip_address, port),
                family=socket.AF_INET6,
                backlog=128
            )
        except socket.error as err:
            # Compare against the symbolic constant: the literal 99 is
            # only EADDRNOTAVAIL on Linux.
            if err.errno != errno.EADDRNOTAVAIL:
                raise
            LOG.warning(_LW("Could not create rug-api socket: %s"), err)
            LOG.warning(_LW("Sleeping %s before trying again"), i + 1)
            eventlet.sleep(i + 1)
        else:
            break
    else:
        # Every attempt hit EADDRNOTAVAIL.
        raise RuntimeError(
            _("Could not establish rug-api socket on %s/%s") %
            (ip_address, port))

    eventlet.wsgi.server(
        sock,
        app,
        custom_pool=self.pool,
        log=loggers.WritableLogger(LOG))
def ignore_signals():
    """Ignore signals that might interrupt processing.

    Since the RUG doesn't want to be asynchronously interrupted, SIGHUP,
    SIGUSR1, SIGUSR2 and SIGALRM are each logged and then set to the
    ``SIG_IGN`` action.

    :param: None
    :returns: None
    """
    log = logging.getLogger(__name__)
    ignored = (signal.SIGHUP, signal.SIGUSR1, signal.SIGUSR2,
               signal.SIGALRM)
    for signo in ignored:
        log.info(_LI('ignoring signal %s'), signo)
        signal.signal(signo, signal.SIG_IGN)
def run(self, ip_address, port=RUG_META_PORT):
    """Bind the metadata proxy socket and serve requests on it.

    Makes up to five bind attempts. After an attempt that fails because
    the address is not available, it sleeps 1, 2, ... seconds before
    trying again. Once bound, ``MetadataProxyHandler`` is served on the
    socket using ``self.pool``.

    :param ip_address: address to listen on (bound as ``AF_INET6``)
    :param port: port to listen on
    :raises socket.error: for a bind failure other than
                          ``EADDRNOTAVAIL``
    :raises RuntimeError: if every attempt fails with ``EADDRNOTAVAIL``
    """
    # Function-scope import so this block does not rely on a
    # module-level import it cannot see.
    import errno

    app = MetadataProxyHandler()

    for i in range(5):
        LOG.info(_LI("Starting the metadata proxy on %s/%s"),
                 ip_address, port)
        try:
            sock = eventlet.listen(
                (ip_address, port),
                family=socket.AF_INET6,
                backlog=128
            )
        except socket.error as err:
            # Compare against the symbolic constant: the literal 99 is
            # only EADDRNOTAVAIL on Linux.
            if err.errno != errno.EADDRNOTAVAIL:
                raise
            LOG.warning(_LW("Could not create metadata proxy socket: %s"),
                        err)
            LOG.warning(_LW("Sleeping %s before trying again"), i + 1)
            eventlet.sleep(i + 1)
        else:
            break
    else:
        # Every attempt hit EADDRNOTAVAIL.
        raise RuntimeError(
            _("Could not establish metadata proxy socket on %s/%s") %
            (ip_address, port))

    eventlet.wsgi.server(
        sock,
        app,
        custom_pool=self.pool,
        log=loggers.WritableLogger(LOG))
def shuffle_notifications(notification_queue, sched):
    """Copy messages from the notification queue into the scheduler.

    Runs until a ``(None, message)`` item is read from the queue or a
    ``KeyboardInterrupt`` is raised. Other ordinary exceptions raised
    while handling an item are logged and the loop carries on.

    :param notification_queue: object whose ``get()`` returns
                               ``(target, message)`` pairs
    :param sched: object whose ``handle_message(target, message)``
                  receives each pair
    :returns: None
    """
    while True:
        try:
            target, message = notification_queue.get()
            if target is None:
                # A None target is the signal to stop.
                break
            sched.handle_message(target, message)
        except IOError:
            # FIXME(rods): if a signal arrive during an IO operation
            # an IOError is raised. We catch the exceptions in
            # meantime waiting for a better solution.
            pass
        except KeyboardInterrupt:
            LOG.info(_LI('got Ctrl-C'))
            break
        except Exception:
            # Keep the loop alive on ordinary errors only; a bare
            # ``except`` would also trap SystemExit inside this endless
            # loop and prevent the process from exiting.
            LOG.exception(_LE('unhandled exception processing message'))
def boot(self, worker_context):
    """Boots the instance with driver pre/post boot hooks.

    On success the state becomes ``states.BOOTING`` and
    ``self.instance_info`` holds the value returned by
    ``nova_client.boot_instance``. The driver's post-boot hook only runs
    after a successful boot request.

    :param worker_context: object exposing ``nova_client``; also passed
                           to the driver hooks
    :returns: None
    """
    self._ensure_cache(worker_context)

    # Pass the name as a logging argument instead of pre-formatting it.
    self.log.info('Booting %s', self.driver.RESOURCE_NAME)
    self.state = states.DOWN
    self._boot_counter.start()

    # driver preboot hook
    self.driver.pre_boot(worker_context)

    # try to boot the instance
    try:
        instance_info = worker_context.nova_client.boot_instance(
            self.instance_info,
            self.driver.name,
            self.driver.image_uuid,
            self.driver.flavor,
            self.driver.make_ports(worker_context)
        )
        if not instance_info:
            self.log.info(_LI('Previous instance is still deleting'))
            # Reset the boot counter, causing the state machine to start
            # again with a new Instance.
            self.reset_boot_counter()
            self.instance_info = None
            return
    except Exception:
        # Only ordinary errors are treated as a failed boot; a bare
        # ``except`` would also swallow SystemExit and
        # KeyboardInterrupt.
        self.log.exception(_LE('Instance failed to start boot'))
        return
    else:
        # We have successfully started a (re)boot attempt so
        # record the timestamp so we can report how long it takes.
        self.state = states.BOOTING
        self.instance_info = instance_info

    # driver post boot hook
    self.driver.post_boot(worker_context)
def _check_boot_timeout(self): time_since_boot = self.instance_info.time_since_boot if time_since_boot: if time_since_boot.seconds < cfg.CONF.boot_timeout: # Do not reset the state if we have an error # condition already. The state will be reset when # the router starts responding again, or when the # error is cleared from a forced rebuild. if self.state != ERROR: self.state = BOOTING else: # If the instance was created more than `boot_timeout` seconds # ago, log an error and set the state set to DOWN self.log.info(_LI("Router is DOWN. Created over %d secs ago."), cfg.CONF.boot_timeout) # Do not reset the state if we have an error condition # already. The state will be reset when the router starts # responding again, or when the error is cleared from a # forced rebuild. if self.state != ERROR: self.state = DOWN
def get_default_v4_gateway(client, router, networks):
    """Find the IPv4 default gateway for the router.

    :param client: not used by this function
    :param router: object exposing ``id`` and
                   ``external_port.mac_address``
    :param networks: iterable of dicts with ``network_type``,
                     ``network_id``, ``interface`` and ``subnets`` keys
    :returns: the ``gateway_ip`` of the first IPv4 subnet on an external
              network that contains one of that network's IPv4
              interface addresses, or ``''`` when there is none
    """
    LOG.debug('networks = %r', networks)
    LOG.debug('external interface = %s', router.external_port.mac_address)

    # Now find the subnet that our external IP is on, and return its
    # gateway.
    for net in networks:
        if net['network_type'] != EXTERNAL_NET:
            continue

        # IPv4 addresses configured on this network's interface, with
        # any prefix length stripped.
        candidates = []
        for raw in net['interface']['addresses']:
            parsed = netaddr.IPAddress(raw.partition('/')[0])
            if parsed.version == 4:
                candidates.append(parsed)

        for sub in net['subnets']:
            cidr = netaddr.IPNetwork(sub['cidr'])
            if cidr.version != 4:
                continue
            LOG.debug(
                '%s: checking if subnet %s should have the default route',
                router.id, sub['cidr'])
            for candidate in candidates:
                if candidate not in cidr:
                    continue
                LOG.debug(
                    '%s: found gateway %s for subnet %s on network %s',
                    router.id,
                    sub['gateway_ip'],
                    sub['cidr'],
                    net['network_id'],
                )
                return sub['gateway_ip']

    # Sometimes we are asked to build a configuration for the server
    # when the external interface is still marked as "down". We can
    # report that case, but we don't treat it as an error here because
    # we'll be asked to do it again when the interface comes up.
    LOG.info(_LI('%s: no default gateway was found'), router.id)
    return ''
def wait(self):
    """Run the parent launcher's ``wait`` and report how it ended.

    SIGTERM and SIGINT are routed to ``self._handle_signal`` first.
    Whatever happens, ``rpc.cleanup()`` (when ``rpc`` is truthy) and
    ``self.stop()`` are called before returning.

    :returns: the ``code`` of a caught ``SignalExit`` or ``SystemExit``,
              otherwise ``None``
    """
    for signo in (signal.SIGTERM, signal.SIGINT):
        signal.signal(signo, self._handle_signal)

    LOG.debug('Full set of CONF:')
    CONF.log_opt_values(LOG, std_logging.DEBUG)

    exit_status = None
    try:
        super(ServiceLauncher, self).wait()
    except SignalExit as exc:
        signal_names = {signal.SIGTERM: 'SIGTERM',
                        signal.SIGINT: 'SIGINT'}
        LOG.info(_LI('Caught %s, exiting'), signal_names[exc.signo])
        exit_status = exc.code
    except SystemExit as exc:
        exit_status = exc.code
    finally:
        if rpc:
            rpc.cleanup()
        self.stop()
    return exit_status
def wait(self):
    """Wait on the parent launcher and return the resulting exit status.

    Installs ``self._handle_signal`` for SIGTERM and SIGINT, logs the
    configuration at DEBUG level, then delegates to the parent class.
    ``rpc.cleanup()`` (when ``rpc`` is truthy) and ``self.stop()`` always
    run before returning.

    :returns: the ``code`` of a caught ``SignalExit`` or ``SystemExit``,
              otherwise ``None``
    """
    handler_signals = (signal.SIGTERM, signal.SIGINT)
    for signo in handler_signals:
        signal.signal(signo, self._handle_signal)

    LOG.debug('Full set of CONF:')
    CONF.log_opt_values(LOG, std_logging.DEBUG)

    result = None
    try:
        super(ServiceLauncher, self).wait()
    except SignalExit as exc:
        names_by_signal = {
            signal.SIGTERM: 'SIGTERM',
            signal.SIGINT: 'SIGINT',
        }
        caught = names_by_signal[exc.signo]
        LOG.info(_LI('Caught %s, exiting'), caught)
        result = exc.code
    except SystemExit as exc:
        result = exc.code
    finally:
        if rpc:
            rpc.cleanup()
        self.stop()
    return result
def main(argv=sys.argv[1:]):
    """Main Entry point into the akanda-rug

    This is the main entry point into the akanda-rug. On invocation of
    this method, logging and local network connectivity setup is
    performed. This information is obtained through the 'ak-config'
    file, passed as argument to this method. Worker threads are spawned
    for handling various tasks that are associated with processing as
    well as responding to different Neutron events prior to starting a
    notification dispatch loop.

    :param argv: list of Command line arguments

    :returns: None

    :raises: None

    """
    # TODO(rama) Error Handling to be added as part of the docstring
    # description

    # Change the process and thread name so the logs are cleaner.
    p = multiprocessing.current_process()
    p.name = 'pmain'
    t = threading.current_thread()
    t.name = 'tmain'
    ak_cfg.parse_config(argv)
    log.setup(cfg.CONF, 'akanda-rug')
    cfg.CONF.log_opt_values(LOG, logging.INFO)

    neutron = neutron_api.Neutron(cfg.CONF)

    # TODO(mark): develop better way restore after machine reboot
    # neutron.purge_management_interface()

    # bring the mgt tap interface up
    neutron.ensure_local_service_port()

    # bring the external port
    if cfg.CONF.plug_external_port:
        neutron.ensure_local_external_port()

    # Set up the queue to move messages between the eventlet-based
    # listening process and the scheduler.
    notification_queue = multiprocessing.Queue()

    # Ignore signals that might interrupt processing.
    daemon.ignore_signals()

    # If we see a SIGINT, stop processing.
    def _stop_processing(*args):
        # A (None, None) item is what makes shuffle_notifications()
        # leave its loop.
        notification_queue.put((None, None))
    signal.signal(signal.SIGINT, _stop_processing)

    # Listen for notifications.
    notification_proc = multiprocessing.Process(
        target=notifications.listen,
        kwargs={'notification_queue': notification_queue},
        name='notification-listener',
    )
    notification_proc.start()

    # Address without its prefix length; the metadata proxy and the
    # rug-api are both started with it.
    mgt_ip_address = neutron_api.get_local_service_ip(cfg.CONF).split('/')[0]
    metadata_proc = multiprocessing.Process(
        target=metadata.serve,
        args=(mgt_ip_address, ),
        name='metadata-proxy'
    )
    metadata_proc.start()

    from akanda.rug.api import rug as rug_api
    rug_api_proc = multiprocessing.Process(
        target=rug_api.serve,
        args=(mgt_ip_address, ),
        name='rug-api'
    )
    rug_api_proc.start()

    # Set up the notifications publisher
    Publisher = (notifications.Publisher if cfg.CONF.ceilometer.enabled
                 else notifications.NoopPublisher)
    publisher = Publisher(
        topic=cfg.CONF.ceilometer.topic,
    )

    # Set up a factory to make Workers that know how many threads to
    # run.
    worker_factory = functools.partial(
        worker.Worker,
        notifier=publisher
    )

    # Set up the scheduler that knows how to manage the routers and
    # dispatch messages.
    sched = scheduler.Scheduler(
        worker_factory=worker_factory,
    )

    # Prepopulate the workers with existing routers on startup
    populate.pre_populate_workers(sched)

    # Set up the periodic health check
    health.start_inspector(cfg.CONF.health_check_period, sched)

    # Block the main process, copying messages from the notification
    # listener to the scheduler
    try:
        shuffle_notifications(notification_queue, sched)
    finally:
        LOG.info(_LI('Stopping scheduler.'))
        sched.stop()
        LOG.info(_LI('Stopping notification publisher.'))
        publisher.stop()

        # Terminate the subprocesses
        for subproc in [notification_proc, metadata_proc, rug_api_proc]:
            LOG.info(_LI('Stopping %s.'), subproc.name)
            subproc.terminate()
def update_state(self, worker_context, silent=False):
    """Updates state of the instance and, by extension, its logical resource

    The driver is asked for its state first; a GONE answer wins
    immediately. Otherwise the instance is probed with
    ``driver.is_alive()`` up to ``cfg.CONF.max_retries`` times, sleeping
    ``cfg.CONF.retry_delay`` between failed attempts. If every attempt
    fails, the boot timeout is checked and Nova is consulted to decide
    whether the instance should be marked DOWN.

    :param worker_context: context providing the ``nova_client`` used to
                           look the instance up
    :param silent: when true, failed alive checks are not logged
    :returns: state
    """
    self._ensure_cache(worker_context)

    if self.driver.get_state(worker_context) == states.GONE:
        self.log.debug('%s driver reported its state is GONE',
                       self.driver.RESOURCE_NAME)
        self.state = states.GONE
        return self.state

    if self.instance_info is None:
        self.log.info(_LI('no backing instance, marking as down'))
        self.state = states.DOWN
        return self.state

    for i in xrange(cfg.CONF.max_retries):
        if self.driver.is_alive(self.instance_info.management_address):
            # Do not downgrade an already-configured instance to UP.
            if self.state != states.CONFIGURED:
                self.state = states.UP
            break
        if not silent:
            # Report a 1-based attempt number so the final attempt reads
            # "N of N" instead of "N-1 of N".
            self.log.debug('Alive check failed. Attempt %d of %d',
                           i + 1,
                           cfg.CONF.max_retries)
        time.sleep(cfg.CONF.retry_delay)
    else:
        # Only reached when no alive check succeeded (loop never broke).
        old_state = self.state
        self._check_boot_timeout()

        # If the instance isn't responding, make sure Nova knows about it
        instance = worker_context.nova_client.get_instance_for_obj(self.id)
        if instance is None and self.state != states.ERROR:
            self.log.info('No instance was found; rebooting')
            self.state = states.DOWN
            self.instance_info = None

        # update_state() is called from Alive() to check the
        # status of the router. If we can't talk to the API at
        # that point, the router should be considered missing and
        # we should reboot it, so mark it states.DOWN if we think it was
        # configured before.
        if old_state == states.CONFIGURED and self.state != states.ERROR:
            self.log.debug('Instance not alive, marking it as DOWN')
            self.state = states.DOWN

    # After the instance is all the way up, record how long it took
    # to boot and accept a configuration.
    #
    # instance_info may have been cleared just above when Nova no longer
    # knows the instance, so guard against None before dereferencing it.
    if (self.instance_info is not None and
            self.instance_info.booting and
            self.state == states.CONFIGURED):
        # If we didn't boot the instance (because we were restarted
        # while it remained running, for example), we won't have a
        # duration to log.
        self.instance_info.confirm_up()
        if self.instance_info.boot_duration:
            self.log.info('%s booted in %s seconds after %s attempts',
                          self.driver.RESOURCE_NAME,
                          self.instance_info.boot_duration.total_seconds(),
                          self._boot_counter.count)
        # Always reset the boot counter, even if we didn't boot
        # the server ourself, so we don't accidentally think we
        # have an erroring router.
        self._boot_counter.reset()
    return self.state
def _dispatch_command(self, target, message):
    """Execute the administrative command carried in ``message.body``.

    ``message.body`` is a dict whose ``'command'`` key selects the
    action; the remaining keys (``resource_id``/``router_id``,
    ``tenant_id``, ``reason``, ``enabled``) are read as needed by the
    selected branch. Unknown commands are logged and ignored.

    :param target: not used by this method
    :param message: event whose ``body`` holds the instructions
    :returns: None
    :raises KeyError: if the body has no ``'command'`` key, or a tenant
                      command has no ``'tenant_id'`` key
    """
    instructions = message.body
    command = instructions['command']

    if command == commands.WORKERS_DEBUG:
        self.report_status()

    # NOTE(adam_g): Drop 'router-debug' compat in M.
    elif (command == commands.RESOURCE_DEBUG or
          command == commands.ROUTER_DEBUG):

        resource_id = (instructions.get('resource_id') or
                       instructions.get('router_id'))
        if not resource_id:
            LOG.warning(_LW(
                'Ignoring instruction to debug resource with no id'))
            return
        reason = instructions.get('reason')
        if resource_id in commands.WILDCARDS:
            LOG.warning(_LW(
                'Ignoring instruction to debug all resources with %r'),
                resource_id)
        else:
            LOG.info(_LI('Placing router %s in debug mode (reason: %s)'),
                     resource_id, reason)
            self.db_api.enable_resource_debug(resource_id, reason)

    elif (command == commands.RESOURCE_MANAGE or
          command == commands.ROUTER_MANAGE):
        resource_id = (instructions.get('resource_id') or
                       instructions.get('router_id'))
        if not resource_id:
            LOG.warning(_LW(
                'Ignoring instruction to manage resource with no id'))
            return
        try:
            self.db_api.disable_resource_debug(resource_id)
            LOG.info(_LI('Resuming management of resource %s'),
                     resource_id)
        except KeyError:
            pass
        try:
            self._resource_locks[resource_id].release()
            LOG.info(_LI('Unlocked resource %s'), resource_id)
        except KeyError:
            pass
        except threading.ThreadError:
            # Already unlocked, that's OK.
            pass

    elif command in EVENT_COMMANDS:
        resource_id = instructions.get('resource_id')
        sm = self._find_state_machine_by_resource_id(resource_id)
        if not sm:
            LOG.debug(
                'Will not process command, no managed state machine '
                'found for resource %s', resource_id)
            return
        new_res = event.Resource(
            id=resource_id,
            driver=sm.driver.RESOURCE_NAME,
            tenant_id=sm.tenant_id)
        new_msg = event.Event(
            resource=new_res,
            crud=EVENT_COMMANDS[command],
            body=instructions,
        )
        # Use handle_message() to ensure we acquire the lock
        LOG.info(_LI('sending %s instruction to %s'),
                 command, new_res)
        self.handle_message(new_msg.resource.tenant_id, new_msg)
        LOG.info(_LI('forced %s for %s complete'),
                 command, new_res)

    # NOTE(adam_g): This is here to support the deprecated old format of
    #               sending commands to specific routers and can be
    #               removed once the CLI component is dropped in M.
    elif command in DEPRECATED_ROUTER_COMMANDS:
        # Log through the module logger rather than printing to stdout.
        LOG.debug('Handling deprecated router command %s', command)
        new_rsc = event.Resource(
            driver=drivers.router.Router.RESOURCE_NAME,
            id=message.body.get('router_id'),
            tenant_id=message.body.get('tenant_id'),
        )
        new_msg = event.Event(
            resource=new_rsc,
            crud=DEPRECATED_ROUTER_COMMANDS[command],
            body=instructions,
        )
        # Use handle_message() to ensure we acquire the lock
        LOG.info(_LI('sending %s instruction to %s'),
                 command, new_rsc)
        self.handle_message(new_msg.resource.tenant_id, new_msg)
        LOG.info(_LI('forced %s for %s complete'),
                 command, new_rsc)

    elif command == commands.TENANT_DEBUG:
        tenant_id = instructions['tenant_id']
        reason = instructions.get('reason')
        if tenant_id in commands.WILDCARDS:
            LOG.warning(_LW(
                'Ignoring instruction to debug all tenants with %r'),
                tenant_id)
        else:
            LOG.info(_LI('Placing tenant %s in debug mode (reason: %s)'),
                     tenant_id, reason)
            self.db_api.enable_tenant_debug(tenant_id, reason)

    elif command == commands.TENANT_MANAGE:
        tenant_id = instructions['tenant_id']
        try:
            self.db_api.disable_tenant_debug(tenant_id)
            LOG.info(_LI('Resuming management of tenant %s'), tenant_id)
        except KeyError:
            pass

    elif command == commands.GLOBAL_DEBUG:
        enable = instructions.get('enabled')
        reason = instructions.get('reason')
        if enable == 1:
            LOG.info('Enabling global debug mode (reason: %s)', reason)
            self.db_api.enable_global_debug(reason)
        elif enable == 0:
            LOG.info('Disabling global debug mode')
            self.db_api.disable_global_debug()
        else:
            LOG.warning('Unrecognized global debug command: %s',
                        instructions)

    elif command == commands.CONFIG_RELOAD:
        try:
            cfg.CONF()
        except Exception:
            LOG.exception(_LE('Could not reload configuration'))
        else:
            cfg.CONF.log_opt_values(LOG, INFO)

    else:
        LOG.warning(_LW('Unrecognized command: %s'), instructions)
def _thread_target(self):
    """This method runs in each worker thread.

    Repeatedly pulls a state machine off ``self.work_queue`` and calls
    its ``update()`` with a thread-private ``WorkerContext``. The loop
    ends when ``self._keep_going`` becomes false or a ``None`` item is
    received. Progress is recorded in ``self._thread_status`` under the
    current thread's name.

    :returns: the ``WorkerContext`` created for this thread
    """
    my_id = threading.current_thread().name
    LOG.debug('starting thread')
    # Use a separate context from the one we use when receiving
    # messages and talking to the tenant router manager because we
    # are in a different thread and the clients are not
    # thread-safe.
    context = WorkerContext()
    while self._keep_going:
        try:
            # Try to get a state machine from the work queue. If
            # there's nothing to do, we will block for a while.
            self._thread_status[my_id] = 'waiting for task'
            sm = self.work_queue.get(timeout=10)
        except Queue.Empty:
            continue
        if sm is None:
            LOG.info(_LI('received stop message'))
            break

        # Make sure we didn't already have some updates under way
        # for a router we've been told to ignore for debug mode.
        should_ignore, reason = \
            self.db_api.resource_in_debug(sm.resource_id)
        if should_ignore:
            LOG.debug('Skipping update of resource %s in debug mode. '
                      '(reason: %s)', sm.resource_id, reason)
            continue
        # FIXME(dhellmann): Need to look at the router to see if
        # it belongs to a tenant which is in debug mode, but we
        # don't have that data in the sm, yet.
        LOG.debug('performing work on %s for tenant %s',
                  sm.resource_id, sm.tenant_id)
        try:
            self._thread_status[my_id] = 'updating %s' % sm.resource_id
            sm.update(context)
        except Exception:
            # Catch Exception rather than everything so that
            # SystemExit/KeyboardInterrupt are not swallowed; the
            # finally clause below still runs in either case.
            LOG.exception(_LE('could not complete update for %s'),
                          sm.resource_id)
        finally:
            self._thread_status[my_id] = (
                'finalizing task for %s' % sm.resource_id
            )
            self.work_queue.task_done()
            with self.lock:
                # Release the lock that prevents us from adding
                # the state machine back into the queue. If we
                # find more work, we will re-acquire it. If we do
                # not find more work, we hold the primary work
                # queue lock so the main thread cannot put the
                # state machine back into the queue until we
                # release that lock.
                self._release_resource_lock(sm)
                # The state machine has indicated that it is done
                # by returning. If there is more work for it to
                # do, reschedule it by placing it at the end of
                # the queue.
                if sm.has_more_work():
                    LOG.debug('%s has more work, returning to work queue',
                              sm.resource_id)
                    self._add_resource_to_work_queue(sm)
                else:
                    LOG.debug('%s has no more work', sm.resource_id)
    # Return the context object so tests can look at it
    self._thread_status[my_id] = 'exiting'
    return context
def _dispatch_command(self, target, message):
    """Carry out the administrative command found in ``message.body``.

    The body's ``'command'`` entry picks one handler below; each handler
    finishes by returning, so at most one of them runs. A command that
    matches none of them is logged as unrecognized.

    :param target: not used by this method
    :param message: event whose ``body`` dict holds the instructions
    :returns: None
    """
    body = message.body
    cmd = body['command']

    if cmd == commands.WORKERS_DEBUG:
        self.report_status()
        return

    if cmd == commands.ROUTER_DEBUG:
        rid = body['router_id']
        why = body.get('reason')
        if rid in commands.WILDCARDS:
            # Refuse to put every router in debug mode at once.
            LOG.warning(_LW(
                'Ignoring instruction to debug all routers with %r'),
                rid)
            return
        LOG.info(_LI('Placing router %s in debug mode (reason: %s)'),
                 rid, why)
        self.db_api.enable_router_debug(rid, why)
        return

    if cmd == commands.ROUTER_MANAGE:
        rid = body['router_id']
        try:
            self.db_api.disable_router_debug(rid)
            LOG.info(_LI('Resuming management of router %s'), rid)
        except KeyError:
            pass
        try:
            self._router_locks[rid].release()
            LOG.info(_LI('Unlocked router %s'), rid)
        except KeyError:
            pass
        except threading.ThreadError:
            # Already unlocked, that's OK.
            pass
        return

    if cmd in self._EVENT_COMMANDS:
        forwarded = event.Event(
            tenant_id=message.tenant_id,
            router_id=message.router_id,
            crud=self._EVENT_COMMANDS[cmd],
            body=body,
        )
        # Go through handle_message() so the lock gets acquired.
        LOG.info(_LI('sending %s instruction to %s'),
                 cmd, message.tenant_id)
        self.handle_message(forwarded.tenant_id, forwarded)
        LOG.info(_LI('forced %s for %s complete'),
                 cmd, message.tenant_id)
        return

    if cmd == commands.TENANT_DEBUG:
        tid = body['tenant_id']
        why = body.get('reason')
        if tid in commands.WILDCARDS:
            # Refuse to put every tenant in debug mode at once.
            LOG.warning(_LW(
                'Ignoring instruction to debug all tenants with %r'),
                tid)
            return
        LOG.info(_LI('Placing tenant %s in debug mode (reason: %s)'),
                 tid, why)
        self.db_api.enable_tenant_debug(tid, why)
        return

    if cmd == commands.TENANT_MANAGE:
        tid = body['tenant_id']
        try:
            self.db_api.disable_tenant_debug(tid)
            LOG.info(_LI('Resuming management of tenant %s'), tid)
        except KeyError:
            pass
        return

    if cmd == commands.GLOBAL_DEBUG:
        flag = body.get('enabled')
        why = body.get('reason')
        if flag == 1:
            LOG.info('Enabling global debug mode (reason: %s)', why)
            self.db_api.enable_global_debug(why)
        elif flag == 0:
            LOG.info('Disabling global debug mode')
            self.db_api.disable_global_debug()
        else:
            LOG.warning('Unrecognized global debug command: %s',
                        body)
        return

    if cmd == commands.CONFIG_RELOAD:
        try:
            cfg.CONF()
        except Exception:
            LOG.exception(_LE('Could not reload configuration'))
            return
        # Only dump the option values when the reload succeeded.
        cfg.CONF.log_opt_values(LOG, INFO)
        return

    LOG.warning(_LW('Unrecognized command: %s'), body)
def _ensure_local_port(self, network_id, subnet_id,
                       network_type, ip_address):
    """Find or create this host's local port and plug its interface.

    A port is looked up by device owner, a host-derived device id, the
    name ``AKANDA:RUG:<NETWORK_TYPE>`` and the network id. When none is
    returned, one is created and then updated to carry
    ``DEVICE_OWNER_RUG``. The interface driver plugs the device if it
    does not exist yet and is then asked to configure ``ip_address``
    on it.

    :param network_id: id of the network the port lives on
    :param subnet_id: subnet used for the new port's fixed IP
    :param network_type: label upper-cased into the port name
    :param ip_address: address string; the part before ``/`` is used as
                       the fixed IP, the full value is given to
                       ``init_l3``
    :returns: the ``Port`` that was found or created
    """
    iface_driver = importutils.import_object(
        self.conf.interface_driver, self.conf)

    # Device id derived from the hostname via a name-based UUID.
    host_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, socket.gethostname()))
    port_name = 'AKANDA:RUG:%s' % network_type.upper()

    found = self.api_client.list_ports(
        device_owner=DEVICE_OWNER_RUG,
        device_id=host_uuid,
        name=port_name,
        network_id=network_id,
    )['ports']

    if not found:
        LOG.info(_LI('creating a new local %s port'), network_type)
        fixed_ip = {
            'ip_address': ip_address.split('/')[0],
            'subnet_id': subnet_id,
        }
        request = {
            'admin_state_up': True,
            'network_id': network_id,
            # lying here for IP
            'device_owner': DEVICE_OWNER_ROUTER_INT,
            'name': port_name,
            'device_id': host_uuid,
            'fixed_ips': [fixed_ip],
            'binding:host_id': socket.gethostname(),
        }
        created = self.api_client.create_port({'port': request})
        port = Port.from_dict(created['port'])

        # remove lie that enabled us pick IP on slaac subnet
        owner_fix = {'port': {'device_owner': DEVICE_OWNER_RUG}}
        self.api_client.update_port(port.id, owner_fix)
        port.device_owner = DEVICE_OWNER_RUG

        LOG.info(_LI('new local %s port: %r'), network_type, port)
    else:
        port = Port.from_dict(found[0])
        LOG.info(_LI('already have local %s port, using %r'),
                 network_type, port)

    # create the tap interface if it doesn't already exist
    already_plugged = ip_lib.device_exists(
        iface_driver.get_device_name(port))
    if not already_plugged:
        iface_driver.plug(
            port.network_id,
            port.id,
            iface_driver.get_device_name(port),
            port.mac_address)

        # add sleep to ensure that port is setup before use
        time.sleep(1)

    iface_driver.init_l3(iface_driver.get_device_name(port), [ip_address])
    return port
def launch_service(self, service, workers=1):
    """Wrap ``service`` and start children until enough are running.

    ``self._start_child()`` is called repeatedly for as long as
    ``self.running`` is true and the wrapper tracks fewer children than
    its ``workers`` count.

    :param service: service object handed to ``ServiceWrapper``
    :param workers: number of children requested
    :returns: None
    """
    wrapper = ServiceWrapper(service, workers)
    LOG.info(_LI('Starting %d workers'), wrapper.workers)

    while True:
        if not self.running:
            break
        if len(wrapper.children) >= wrapper.workers:
            break
        self._start_child(wrapper)