def stop(self, worker_context):
    """Attempts to destroy the instance with configured timeout.

    :param worker_context:
    :returns:
    """
    self._ensure_cache(worker_context)
    self.log.info(_LI('Destroying instance'))

    if not self.instance_info:
        self.log.info(_LI('Instance already destroyed.'))
        return

    try:
        worker_context.nova_client.destroy_instance(self.instance_info)
    except Exception:
        self.log.exception(_LE('Error deleting router instance'))

    start = time.time()
    while time.time() - start < cfg.CONF.boot_timeout:
        if not worker_context.nova_client.get_instance_by_id(
                self.instance_info.id_):
            if self.state != states.GONE:
                self.state = states.DOWN
            return self.state
        self.log.debug('Router has not finished stopping')
        time.sleep(cfg.CONF.retry_delay)
    self.log.error(_LE('Router failed to stop within %d secs'),
                   cfg.CONF.boot_timeout)
def start(self, interval, initial_delay=None):
    self._running = True
    done = event.Event()

    def _inner():
        if initial_delay:
            greenthread.sleep(initial_delay)

        try:
            while self._running:
                start = timeutils.utcnow()
                self.f(*self.args, **self.kw)
                end = timeutils.utcnow()
                if not self._running:
                    break
                delay = interval - timeutils.delta_seconds(start, end)
                if delay <= 0:
                    LOG.warning(_LW('task run outlasted interval by %s sec'),
                                -delay)
                greenthread.sleep(delay if delay > 0 else 0)
        except LoopingCallDone as e:
            self.stop()
            done.send(e.retvalue)
        except Exception:
            LOG.exception(_LE('in looping call'))
            done.send_exception(*sys.exc_info())
            return

    # Spawn the loop in a green thread and hand back the event so
    # callers can wait() for the loop to finish.
    self.done = done
    greenthread.spawn_n(_inner)
    return self.done
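# Usage sketch for the start() method above, assuming it belongs to a
# LoopingCall-style wrapper class (the name LoopingCall below is an
# assumption, as is the finished() predicate). Raising LoopingCallDone
# from the wrapped callable stops the loop; its retvalue is delivered
# through the event returned by start().
def _poll():
    if finished():
        raise LoopingCallDone(retvalue='ok')

timer = LoopingCall(_poll)
done = timer.start(interval=5, initial_delay=2)
result = done.wait()  # blocks until the loop finishes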
def run_periodic_tasks(self, context, raise_on_error=False):
    """Tasks to be run at a periodic interval."""
    for task_name, task in self._periodic_tasks:
        full_task_name = '.'.join([self.__class__.__name__, task_name])

        ticks_to_skip = self._ticks_to_skip[task_name]
        if ticks_to_skip > 0:
            LOG.debug("Skipping %(full_task_name)s, %(ticks_to_skip)s"
                      " ticks left until next run",
                      dict(full_task_name=full_task_name,
                           ticks_to_skip=ticks_to_skip))
            self._ticks_to_skip[task_name] -= 1
            continue

        self._ticks_to_skip[task_name] = task._ticks_between_runs
        LOG.debug("Running periodic task %(full_task_name)s",
                  dict(full_task_name=full_task_name))

        try:
            task(self, context)
        except Exception:
            if raise_on_error:
                raise
            LOG.exception(_LE("Error during %(full_task_name)s:"),
                          dict(full_task_name=full_task_name))
def get_bridge_for_iface(root_helper, iface):
    args = ["ovs-vsctl", "--timeout=2", "iface-to-br", iface]
    try:
        return utils.execute(args, root_helper=root_helper).strip()
    except Exception:
        LOG.exception(_LE("Interface %s not found."), iface)
        return None
def get_bridges(root_helper):
    args = ["ovs-vsctl", "--timeout=2", "list-br"]
    try:
        return utils.execute(args, root_helper=root_helper).strip().split("\n")
    except Exception:
        LOG.exception(_LE("Unable to retrieve bridges."))
        return []
def __call__(self, req):
    try:
        if req.method != 'PUT':
            return webob.exc.HTTPMethodNotAllowed()

        args = filter(None, req.path.split('/'))
        if not args:
            return webob.exc.HTTPNotFound()

        command, _, _ = self.ctl.command_manager.find_command(args)
        if command.interactive:
            return webob.exc.HTTPNotImplemented()

        return str(self.ctl.run(['--debug'] + args))
    except SystemExit:
        # cliff invokes -h (help) on argparse failure
        # (which in turn results in a sys.exit call)
        return webob.exc.HTTPBadRequest()
    except ValueError:
        return webob.exc.HTTPNotFound()
    except Exception:
        LOG.exception(_LE("Unexpected error."))
        msg = _('An unknown error has occurred. '
                'Please try your request again.')
        return webob.exc.HTTPInternalServerError(explanation=unicode(msg))
def update(self, worker_context):
    "Called when the router config should be changed"
    while self._queue:
        while True:
            if self.deleted:
                self.log.debug(
                    'skipping update because the router is being deleted')
                return

            try:
                self.log.debug('%s.execute(%s) instance.state=%s',
                               self.state, self.action,
                               self.instance.state)
                self.action = self.state.execute(self.action, worker_context)
                self.log.debug('%s.execute -> %s instance.state=%s',
                               self.state, self.action,
                               self.instance.state)
            except Exception:
                self.log.exception(_LE('%s.execute() failed for action: %s'),
                                   self.state, self.action)

            old_state = self.state
            self.state = self.state.transition(self.action, worker_context)
            self.log.debug('%s.transition(%s) -> %s instance.state=%s',
                           old_state, self.action, self.state,
                           self.instance.state)

            # Yield control each time we stop to figure out what
            # to do next.
            if isinstance(self.state, CalcAction):
                return  # yield

            # We have reached the exit state, so the router has
            # been deleted somehow.
            if isinstance(self.state, Exit):
                self._do_delete()
                return
def unplug(self, device_name, bridge=None, namespace=None, prefix=None):
    """Unplug the interface."""
    device = ip_lib.IPDevice(device_name, self.root_helper, namespace)
    try:
        device.link.delete()
        LOG.debug("Unplugged interface '%s'", device_name)
    except RuntimeError:
        LOG.exception(_LE("Failed unplugging interface '%s'"), device_name)
def shutdown(self):
    LOG.info('shutting down')
    for resource_id, sm in self.state_machines.items():
        try:
            sm.service_shutdown()
        except Exception:
            LOG.exception(_LE('Failed to shutdown state machine for %s'),
                          resource_id)
def run_ofctl(self, cmd, args):
    full_args = ["ovs-ofctl", cmd, self.br_name] + args
    try:
        return utils.execute(full_args, root_helper=self.root_helper)
    except Exception as e:
        LOG.error(_LE("Unable to execute %(cmd)s. Exception: %(exception)s"),
                  {'cmd': full_args, 'exception': e})
def run_vsctl(self, args):
    full_args = ["ovs-vsctl", "--timeout=2"] + args
    try:
        return utils.execute(full_args, root_helper=self.root_helper)
    except Exception as e:
        LOG.error(_LE("Unable to execute %(cmd)s. Exception: %(exception)s"),
                  {'cmd': full_args, 'exception': e})
def notify(context, publisher_id, event_type, priority, payload):
    """Sends a notification using the specified driver

    :param publisher_id: the source worker_type.host of the message
    :param event_type: the literal type of event (ex. Instance Creation)
    :param priority: patterned after the enumeration of Python logging
                     levels in the set (DEBUG, WARN, INFO, ERROR, CRITICAL)
    :param payload: A python dictionary of attributes

    Outgoing message format includes the above parameters, and appends the
    following:

    message_id
      a UUID representing the id for this notification

    timestamp
      the GMT timestamp the notification was sent at

    The composite message will be constructed as a dictionary of the above
    attributes, which will then be sent via the transport mechanism defined
    by the driver.

    Message example::

        {'message_id': str(uuid.uuid4()),
         'publisher_id': 'compute.host1',
         'timestamp': timeutils.utcnow(),
         'priority': 'WARN',
         'event_type': 'compute.create_instance',
         'payload': {'instance_id': 12, ... }}

    """
    if priority not in log_levels:
        raise BadPriorityException(
            _('%s not in valid priorities') % priority)

    # Ensure everything is JSON serializable.
    payload = jsonutils.to_primitive(payload, convert_instances=True)

    msg = dict(message_id=str(uuid.uuid4()),
               publisher_id=publisher_id,
               event_type=event_type,
               priority=priority,
               payload=payload,
               timestamp=str(timeutils.utcnow()))

    for driver in _get_drivers():
        try:
            driver.notify(context, msg)
        except Exception as e:
            LOG.exception(_LE("Problem '%(e)s' attempting to "
                              "send to notification system. "
                              "Payload=%(payload)s"),
                          dict(e=e, payload=payload))
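# A minimal call sketch for notify() above. The priority must be one of
# the module's log_levels; passing None as the context assumes the
# configured drivers tolerate it, and the payload keys are illustrative.
notify(None,
       publisher_id='compute.host1',
       event_type='compute.create_instance',
       priority='INFO',
       payload={'instance_id': 12})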
def stop(self, worker_context):
    self._ensure_cache(worker_context)
    if self.state == GONE:
        self.log.info(_LI("Destroying router neutron has deleted"))
    else:
        self.log.info(_LI("Destroying router"))

    try:
        nova_client = worker_context.nova_client
        nova_client.destroy_instance(self.instance_info)
    except Exception:
        self.log.exception(_LE("Error deleting router instance"))

    start = time.time()
    while time.time() - start < cfg.CONF.boot_timeout:
        if not nova_client.get_instance_by_id(self.instance_info.id_):
            if self.state != GONE:
                self.state = DOWN
            return
        self.log.debug("Router has not finished stopping")
        time.sleep(cfg.CONF.retry_delay)
    self.log.error(_LE("Router failed to stop within %d secs"),
                   cfg.CONF.boot_timeout)
def get_xapi_iface_id(self, xs_vif_uuid):
    args = ["xe", "vif-param-get", "param-name=other-config",
            "param-key=nicira-iface-id", "uuid=%s" % xs_vif_uuid]
    try:
        return utils.execute(args, root_helper=self.root_helper).strip()
    except Exception as e:
        LOG.error(_LE("Unable to execute %(cmd)s. Exception: %(exception)s"),
                  {'cmd': args, 'exception': e})
def __call__(self, req):
    try:
        LOG.debug("Request: %s", req)

        instance_id = self._get_instance_id(req)
        if instance_id:
            return self._proxy_request(instance_id, req)
        else:
            return webob.exc.HTTPNotFound()
    except Exception:
        LOG.exception(_LE("Unexpected error."))
        msg = ('An unknown error has occurred. '
               'Please try your request again.')
        return webob.exc.HTTPInternalServerError(explanation=unicode(msg))
def add_driver(notification_driver):
    """Add a notification driver at runtime."""
    # Make sure the driver list is initialized.
    _get_drivers()
    if isinstance(notification_driver, basestring):
        # Load and add
        try:
            driver = importutils.import_module(notification_driver)
            _drivers[notification_driver] = driver
        except ImportError:
            LOG.exception(_LE("Failed to load notifier %s. "
                              "These notifications will not be sent."),
                          notification_driver)
    else:
        # Driver is already loaded; just add the object.
        _drivers[notification_driver] = notification_driver
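# Usage sketch for add_driver() above: a driver may be registered by
# dotted module path (imported lazily via importutils) or as an
# already-imported module object exposing notify(context, msg). The
# module path below is illustrative, not a real notifier.
add_driver('myproject.notifier.log_notifier')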
def boot(self, worker_context, router_image_uuid):
    self._ensure_cache(worker_context)
    if self.state == GONE:
        self.log.info(_LI('Not booting deleted router'))
        return

    self.log.info(_LI('Booting router'))
    self.state = DOWN
    self._boot_counter.start()

    def make_vrrp_ports():
        mgt_port = worker_context.neutron.create_management_port(
            self.router_obj.id)

        # FIXME(mark): ideally this should be ordered and de-duped
        instance_ports = [
            worker_context.neutron.create_vrrp_port(self.router_obj.id, n)
            for n in (p.network_id for p in self.router_obj.ports)
        ]

        return mgt_port, instance_ports

    try:
        # TODO(mark): make this pluggable
        self._ensure_provider_ports(self.router_obj, worker_context)

        # TODO(mark): make this handle errors more gracefully on cb fail
        # TODO(mark): checkout from a pool - boot on demand for now
        instance_info = worker_context.nova_client.boot_instance(
            self.instance_info,
            self.router_obj.id,
            router_image_uuid,
            make_vrrp_ports
        )
        if not instance_info:
            self.log.info(_LI('Previous router is deleting'))
            # Reset the VM manager, causing the state machine to start
            # again with a new VM.
            self.reset_boot_counter()
            self.instance_info = None
            return
    except Exception:
        self.log.exception(_LE('Router failed to start boot'))
        # TODO(mark): attempt clean-up of failed ports
        return
    else:
        # We have successfully started a (re)boot attempt so
        # record the timestamp so we can report how long it takes.
        self.state = BOOTING
        self.instance_info = instance_info
def _start_child(self, wrap):
    if len(wrap.forktimes) > wrap.workers:
        # Limit ourselves to one process a second (over the period of
        # number of workers * 1 second). This will allow workers to
        # start up quickly but ensure we don't fork off children that
        # die instantly too quickly.
        if time.time() - wrap.forktimes[0] < wrap.workers:
            LOG.info(_LI('Forking too fast, sleeping'))
            time.sleep(1)

        wrap.forktimes.pop(0)

    wrap.forktimes.append(time.time())

    pid = os.fork()
    if pid == 0:
        # NOTE(johannes): All exceptions are caught to ensure this
        # doesn't fallback into the loop spawning children. It would
        # be bad for a child to spawn more children.
        status = 0
        try:
            self._child_process(wrap.service)
        except SignalExit as exc:
            signame = {signal.SIGTERM: 'SIGTERM',
                       signal.SIGINT: 'SIGINT'}[exc.signo]
            LOG.info(_LI('Caught %s, exiting'), signame)
            status = exc.code
        except SystemExit as exc:
            status = exc.code
        except BaseException:
            LOG.exception(_LE('Unhandled exception'))
            status = 2
        finally:
            wrap.service.stop()

        os._exit(status)

    LOG.info(_LI('Started child %d'), pid)

    wrap.children.add(pid)
    self.children[pid] = wrap

    return pid
def _send(self, ready):
    """Deliver notification messages from the in-process queue
    to the appropriate topic via the AMQP service.
    """
    # set up the notifier driver ahead of time
    self.get_notifier()
    # Tell the start() method that we have set up the AMQP
    # communication stuff and are ready to do some work.
    ready.set()
    while True:
        msg = self._q.get()
        if msg is None:
            break
        LOG.debug('sending notification %r', msg)
        try:
            self.send(event_type=msg['event_type'],
                      message=msg['payload'])
        except Exception:
            LOG.exception(_LE('could not publish notification'))
def unplug(self, device_name, bridge=None, namespace=None, prefix=None):
    """Unplug the interface."""
    if not bridge:
        bridge = self.conf.ovs_integration_bridge

    tap_name = self._get_tap_name(device_name, prefix)
    self.check_bridge_exists(bridge)
    ovs = ovs_lib.OVSBridge(bridge, self.root_helper)

    try:
        ovs.delete_port(tap_name)
        if self.conf.ovs_use_veth:
            device = ip_lib.IPDevice(device_name,
                                     self.root_helper,
                                     namespace)
            device.link.delete()
            LOG.debug("Unplugged interface '%s'", device_name)
    except RuntimeError:
        LOG.exception(_LE("Failed unplugging interface '%s'"), device_name)
def update(self, worker_context):
    "Called when the router config should be changed"
    while self._queue:
        while True:
            if self.deleted:
                self.driver.log.debug(
                    'skipping update because the router is being deleted')
                return

            try:
                self.driver.log.debug('%s.execute(%s) instance.state=%s',
                                      self.state, self.action,
                                      self.instance.state)
                self.action = self.state.execute(
                    self.action,
                    worker_context,
                )
                self.driver.log.debug('%s.execute -> %s instance.state=%s',
                                      self.state, self.action,
                                      self.instance.state)
            except Exception:
                self.driver.log.exception(
                    _LE('%s.execute() failed for action: %s'),
                    self.state, self.action)

            old_state = self.state
            self.state = self.state.transition(
                self.action,
                worker_context,
            )
            self.driver.log.debug(
                '%s.transition(%s) -> %s instance.state=%s',
                old_state, self.action, self.state, self.instance.state)

            # Yield control each time we stop to figure out what
            # to do next.
            if isinstance(self.state, CalcAction):
                return  # yield

            # We have reached the exit state, so the router has
            # been deleted somehow.
            if isinstance(self.state, Exit):
                self._do_delete()
                return
def shuffle_notifications(notification_queue, sched):
    """Copy messages from the notification queue into the scheduler.
    """
    while True:
        try:
            target, message = notification_queue.get()
            if target is None:
                break
            sched.handle_message(target, message)
        except IOError:
            # FIXME(rods): if a signal arrives during an IO operation
            # an IOError is raised. We catch the exception in the
            # meantime, waiting for a better solution.
            pass
        except KeyboardInterrupt:
            LOG.info(_LI('got Ctrl-C'))
            break
        except Exception:
            LOG.exception(_LE('unhandled exception processing message'))
def boot(self, worker_context): """Boots the instance with driver pre/post boot hooks. :returns: None """ self._ensure_cache(worker_context) self.log.info('Booting %s' % self.driver.RESOURCE_NAME) self.state = states.DOWN self._boot_counter.start() # driver preboot hook self.driver.pre_boot(worker_context) # try to boot the instance try: instance_info = worker_context.nova_client.boot_instance( self.instance_info, self.driver.name, self.driver.image_uuid, self.driver.flavor, self.driver.make_ports(worker_context) ) if not instance_info: self.log.info(_LI('Previous instance is still deleting')) # Reset the boot counter, causing the state machine to start # again with a new Instance. self.reset_boot_counter() self.instance_info = None return except: self.log.exception(_LE('Instance failed to start boot')) return else: # We have successfully started a (re)boot attempt so # record the timestamp so we can report how long it takes. self.state = states.BOOTING self.instance_info = instance_info # driver post boot hook self.driver.post_boot(worker_context)
@contextlib.contextmanager
def save_and_reraise_exception():
    """Save current exception, run some code and then re-raise.

    In some cases the exception context can be cleared, resulting in None
    being attempted to be re-raised after an exception handler is run. This
    can happen when eventlet switches greenthreads or when running an
    exception handler, code raises and catches an exception. In both
    cases the exception context will be cleared.

    To work around this, we save the exception state, run handler code, and
    then re-raise the original exception. If another exception occurs, the
    saved exception is logged and the new exception is re-raised.
    """
    type_, value, tb = sys.exc_info()
    try:
        yield
    except Exception:
        logging.error(_LE('Original exception being dropped: %s'),
                      traceback.format_exception(type_, value, tb))
        raise
    raise type_, value, tb
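# Usage sketch for save_and_reraise_exception() above: inside an except
# block, run cleanup code and then re-raise the original exception.
# do_work() and cleanup() are illustrative placeholders.
try:
    do_work()
except Exception:
    with save_and_reraise_exception():
        cleanup()  # if this raises, the original exception is logged
                   # and the new one propagates instead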
def _worker(inq, worker_factory):
    """Scheduler's worker process main function.
    """
    daemon.ignore_signals()
    LOG.debug('starting worker process')
    worker = worker_factory()
    while True:
        try:
            data = inq.get()
        except IOError:
            # NOTE(dhellmann): Likely caused by a signal arriving
            # during processing, especially SIGCHLD.
            data = None
        if data is None:
            target, message = None, None
        else:
            target, message = data
        try:
            worker.handle_message(target, message)
        except Exception:
            LOG.exception(_LE('Error processing data %s'), unicode(data))
        if data is None:
            break
    LOG.debug('exiting')
def configure(self, worker_context,
              failure_state=states.RESTART, attempts=None):
    """Pushes config to instance

    :param worker_context:
    :param failure_state:
    :param attempts:
    :returns:
    """
    self.log.debug('Begin instance config')
    self.state = states.UP
    attempts = attempts or cfg.CONF.max_retries

    self._ensure_cache(worker_context)
    if self.driver.get_state(worker_context) == states.GONE:
        return

    interfaces = self.driver.get_interfaces(
        self.instance_info.management_address)

    if not self._verify_interfaces(self.driver.ports, interfaces):
        # FIXME: Need a states.REPLUG state when we support hot-plugging
        # interfaces.
        self.log.debug("Interfaces aren't plugged as expected.")
        self.state = states.REPLUG
        return

    # TODO(mark): We're in the first phase of VRRP, so we need to
    # map the interface to the network ID.
    # Eventually we'll send VRRP data and real interface data
    port_mac_to_net = {
        p.mac_address: p.network_id
        for p in self.instance_info.ports
    }
    # Add in the management port
    mgt_port = self.instance_info.management_port
    port_mac_to_net[mgt_port.mac_address] = mgt_port.network_id

    # this is a network to logical interface id
    iface_map = {
        port_mac_to_net[i['lladdr']]: i['ifname']
        for i in interfaces
        if i['lladdr'] in port_mac_to_net
    }

    # sending all the standard config over to the driver for final updates
    config = self.driver.build_config(
        worker_context,
        mgt_port,
        iface_map
    )
    self.log.debug('preparing to update config to %r', config)

    for i in xrange(attempts):
        try:
            self.driver.update_config(
                self.instance_info.management_address,
                config)
        except Exception:
            if i == attempts - 1:
                # Only log the traceback if we encounter it many times.
                self.log.exception(_LE('failed to update config'))
            else:
                self.log.debug('failed to update config, attempt %d', i)
            time.sleep(cfg.CONF.retry_delay)
        else:
            self.state = states.CONFIGURED
            self.log.info('Instance config updated')
            return
    else:
        self.state = failure_state
def _dispatch_command(self, target, message):
    instructions = message.body
    if instructions['command'] == commands.WORKERS_DEBUG:
        self.report_status()

    # NOTE(adam_g): Drop 'router-debug' compat in M.
    elif (instructions['command'] == commands.RESOURCE_DEBUG or
          instructions['command'] == commands.ROUTER_DEBUG):

        resource_id = (instructions.get('resource_id') or
                       instructions.get('router_id'))
        if not resource_id:
            LOG.warning(_LW(
                'Ignoring instruction to debug resource with no id'))
            return
        reason = instructions.get('reason')
        if resource_id in commands.WILDCARDS:
            LOG.warning(_LW(
                'Ignoring instruction to debug all resources with %r'),
                resource_id)
        else:
            LOG.info(_LI('Placing router %s in debug mode (reason: %s)'),
                     resource_id, reason)
            self.db_api.enable_resource_debug(resource_id, reason)

    elif (instructions['command'] == commands.RESOURCE_MANAGE or
          instructions['command'] == commands.ROUTER_MANAGE):
        resource_id = (instructions.get('resource_id') or
                       instructions.get('router_id'))
        if not resource_id:
            LOG.warning(_LW(
                'Ignoring instruction to manage resource with no id'))
            return
        try:
            self.db_api.disable_resource_debug(resource_id)
            LOG.info(_LI('Resuming management of resource %s'), resource_id)
        except KeyError:
            pass
        try:
            self._resource_locks[resource_id].release()
            LOG.info(_LI('Unlocked resource %s'), resource_id)
        except KeyError:
            pass
        except threading.ThreadError:
            # Already unlocked, that's OK.
            pass

    elif instructions['command'] in EVENT_COMMANDS:
        resource_id = instructions.get('resource_id')
        sm = self._find_state_machine_by_resource_id(resource_id)
        if not sm:
            LOG.debug(
                'Will not process command, no managed state machine '
                'found for resource %s', resource_id)
            return
        new_res = event.Resource(
            id=resource_id,
            driver=sm.driver.RESOURCE_NAME,
            tenant_id=sm.tenant_id)
        new_msg = event.Event(
            resource=new_res,
            crud=EVENT_COMMANDS[instructions['command']],
            body=instructions,
        )
        # Use handle_message() to ensure we acquire the lock
        LOG.info(_LI('sending %s instruction to %s'),
                 instructions['command'], new_res)
        self.handle_message(new_msg.resource.tenant_id, new_msg)
        LOG.info(_LI('forced %s for %s complete'),
                 instructions['command'], new_res)

    # NOTE(adam_g): This is here to support the deprecated old format of
    #               sending commands to specific routers and can be
    #               removed once the CLI component is dropped in M.
    elif instructions['command'] in DEPRECATED_ROUTER_COMMANDS:
        new_rsc = event.Resource(
            driver=drivers.router.Router.RESOURCE_NAME,
            id=message.body.get('router_id'),
            tenant_id=message.body.get('tenant_id'),
        )
        new_msg = event.Event(
            resource=new_rsc,
            crud=DEPRECATED_ROUTER_COMMANDS[instructions['command']],
            body=instructions,
        )
        # Use handle_message() to ensure we acquire the lock
        LOG.info(_LI('sending %s instruction to %s'),
                 instructions['command'], new_rsc)
        self.handle_message(new_msg.resource.tenant_id, new_msg)
        LOG.info(_LI('forced %s for %s complete'),
                 instructions['command'], new_rsc)

    elif instructions['command'] == commands.TENANT_DEBUG:
        tenant_id = instructions['tenant_id']
        reason = instructions.get('reason')
        if tenant_id in commands.WILDCARDS:
            LOG.warning(_LW(
                'Ignoring instruction to debug all tenants with %r'),
                tenant_id)
        else:
            LOG.info(_LI('Placing tenant %s in debug mode (reason: %s)'),
                     tenant_id, reason)
            self.db_api.enable_tenant_debug(tenant_id, reason)

    elif instructions['command'] == commands.TENANT_MANAGE:
        tenant_id = instructions['tenant_id']
        try:
            self.db_api.disable_tenant_debug(tenant_id)
            LOG.info(_LI('Resuming management of tenant %s'), tenant_id)
        except KeyError:
            pass

    elif instructions['command'] == commands.GLOBAL_DEBUG:
        enable = instructions.get('enabled')
        reason = instructions.get('reason')
        if enable == 1:
            LOG.info('Enabling global debug mode (reason: %s)', reason)
            self.db_api.enable_global_debug(reason)
        elif enable == 0:
            LOG.info('Disabling global debug mode')
            self.db_api.disable_global_debug()
        else:
            LOG.warning('Unrecognized global debug command: %s',
                        instructions)

    elif instructions['command'] == commands.CONFIG_RELOAD:
        try:
            cfg.CONF()
        except Exception:
            LOG.exception(_LE('Could not reload configuration'))
        else:
            cfg.CONF.log_opt_values(LOG, INFO)

    else:
        LOG.warning(_LW('Unrecognized command: %s'), instructions)
def _thread_target(self):
    """This method runs in each worker thread.
    """
    my_id = threading.current_thread().name
    LOG.debug('starting thread')
    # Use a separate context from the one we use when receiving
    # messages and talking to the tenant router manager because we
    # are in a different thread and the clients are not
    # thread-safe.
    context = WorkerContext()
    while self._keep_going:
        try:
            # Try to get a state machine from the work queue. If
            # there's nothing to do, we will block for a while.
            self._thread_status[my_id] = 'waiting for task'
            sm = self.work_queue.get(timeout=10)
        except Queue.Empty:
            continue
        if sm is None:
            LOG.info(_LI('received stop message'))
            break

        # Make sure we didn't already have some updates under way
        # for a router we've been told to ignore for debug mode.
        should_ignore, reason = self.db_api.resource_in_debug(
            sm.resource_id)
        if should_ignore:
            LOG.debug('Skipping update of resource %s in debug mode. '
                      '(reason: %s)', sm.resource_id, reason)
            continue

        # FIXME(dhellmann): Need to look at the router to see if
        # it belongs to a tenant which is in debug mode, but we
        # don't have that data in the sm, yet.
        LOG.debug('performing work on %s for tenant %s',
                  sm.resource_id, sm.tenant_id)
        try:
            self._thread_status[my_id] = 'updating %s' % sm.resource_id
            sm.update(context)
        except Exception:
            LOG.exception(_LE('could not complete update for %s'),
                          sm.resource_id)
        finally:
            self._thread_status[my_id] = (
                'finalizing task for %s' % sm.resource_id
            )
            self.work_queue.task_done()
            with self.lock:
                # Release the lock that prevents us from adding
                # the state machine back into the queue. If we
                # find more work, we will re-acquire it. If we do
                # not find more work, we hold the primary work
                # queue lock so the main thread cannot put the
                # state machine back into the queue until we
                # release that lock.
                self._release_resource_lock(sm)
                # The state machine has indicated that it is done
                # by returning. If there is more work for it to
                # do, reschedule it by placing it at the end of
                # the queue.
                if sm.has_more_work():
                    LOG.debug('%s has more work, returning to work queue',
                              sm.resource_id)
                    self._add_resource_to_work_queue(sm)
                else:
                    LOG.debug('%s has no more work', sm.resource_id)
    # Return the context object so tests can look at it
    self._thread_status[my_id] = 'exiting'
    return context
def get_state_machines(self, message, worker_context):
    """Return the state machines and the queue for sending it messages for
    the logical resource being addressed by the message.
    """
    if (not message.resource or
            (message.resource and not message.resource.id)):
        LOG.error(_LE('Cannot get state machine for message with '
                      'no message.resource'))
        raise InvalidIncomingMessage()

    state_machines = []

    # Send to all of our resources.
    if message.resource.id == '*':
        LOG.debug('routing to all state machines')
        state_machines = self.state_machines.values()

    # Ignore messages to deleted resources.
    elif self.state_machines.has_been_deleted(message.resource.id):
        LOG.debug('dropping message for deleted resource')
        return []

    # Send to resources that have an ERROR status
    elif message.resource.id == 'error':
        state_machines = [
            sm for sm in self.state_machines.values()
            if sm.has_error()
        ]
        LOG.debug('routing to %d errored state machines',
                  len(state_machines))

    # Create a new state machine for this router.
    elif message.resource.id not in self.state_machines:
        LOG.debug('creating state machine for %s', message.resource.id)

        # load the driver
        if not message.resource.driver:
            LOG.error(_LE('cannot create state machine without specifying '
                          'a driver.'))
            return []

        driver_obj = drivers.get(message.resource.driver)(
            worker_context, message.resource.id)

        if not driver_obj:
            # this means the driver didn't load for some reason..
            # this might not be needed at all.
            LOG.debug('for some reason loading the driver failed')
            return []

        def deleter():
            self._delete_resource(message.resource.id)

        new_state_machine = state.Automaton(
            driver=driver_obj,
            resource_id=message.resource.id,
            tenant_id=self.tenant_id,
            delete_callback=deleter,
            bandwidth_callback=self._report_bandwidth,
            worker_context=worker_context,
            queue_warning_threshold=self._queue_warning_threshold,
            reboot_error_threshold=self._reboot_error_threshold,
        )
        self.state_machines[message.resource.id] = new_state_machine
        state_machines = [new_state_machine]

    # Send directly to an existing router.
    elif message.resource.id:
        state_machines = [self.state_machines[message.resource.id]]

    # Filter out any deleted state machines.
    return [
        machine for machine in state_machines
        if (not machine.deleted and
            not self.state_machines.has_been_deleted(machine.resource_id))
    ]
def replug(self, worker_context):
    self.log.debug('Attempting to replug...')
    self._ensure_provider_ports(self.router_obj, worker_context)

    interfaces = router_api.get_interfaces(
        self.instance_info.management_address,
        cfg.CONF.akanda_mgt_service_port
    )
    actual_macs = set((iface['lladdr'] for iface in interfaces))
    instance_macs = set(p.mac_address for p in self.instance_info.ports)
    instance_macs.add(self.instance_info.management_port.mac_address)

    if instance_macs != actual_macs:
        # our cached copy of the ports is wrong; reboot and clean up
        self.log.warning(
            _LW('Instance macs (%s) do not match actual macs (%s). '
                'Instance cache appears out-of-sync'),
            instance_macs, actual_macs
        )
        self.state = RESTART
        return

    instance_ports = {p.network_id: p for p in self.instance_info.ports}
    instance_networks = set(instance_ports.keys())

    logical_networks = set(p.network_id for p in self.router_obj.ports)

    if logical_networks != instance_networks:
        instance = worker_context.nova_client.get_instance_by_id(
            self.instance_info.id_
        )

        # For each port that doesn't have a mac address on the instance...
        for network_id in logical_networks - instance_networks:
            port = worker_context.neutron.create_vrrp_port(
                self.router_obj.id,
                network_id
            )
            self.log.debug(
                'Net %s is missing from the router, plugging: %s',
                network_id, port.id
            )
            try:
                instance.interface_attach(port.id, None, None)
            except Exception:
                self.log.exception(_LE('Interface attach failed'))
                self.state = RESTART
                return
            self.instance_info.ports.append(port)

        for network_id in instance_networks - logical_networks:
            port = instance_ports[network_id]
            self.log.debug(
                'Net %s is detached from the router, unplugging: %s',
                network_id, port.id
            )
            try:
                instance.interface_detach(port.id)
            except Exception:
                self.log.exception(_LE('Interface detach failed'))
                self.state = RESTART
                return
            self.instance_info.ports.remove(port)

    # The action of attaching/detaching interfaces in Nova happens via the
    # message bus and is *not* blocking. We need to wait a few seconds to
    # see if the list of tap devices on the appliance actually changed. If
    # not, assume the hotplug failed, and reboot the Instance.
    replug_seconds = cfg.CONF.hotplug_timeout
    while replug_seconds > 0:
        self.log.debug(
            'Waiting for interface attachments to take effect...')
        interfaces = router_api.get_interfaces(
            self.instance_info.management_address,
            cfg.CONF.akanda_mgt_service_port
        )
        if self._verify_interfaces(self.router_obj, interfaces):
            # replugging was successful
            # TODO(mark) update port states
            return
        time.sleep(1)
        replug_seconds -= 1

    self.log.debug("Interfaces aren't plugged as expected, rebooting.")
    self.state = RESTART
def _dispatch_command(self, target, message):
    instructions = message.body
    if instructions['command'] == commands.WORKERS_DEBUG:
        self.report_status()

    elif instructions['command'] == commands.ROUTER_DEBUG:
        router_id = instructions['router_id']
        reason = instructions.get('reason')
        if router_id in commands.WILDCARDS:
            LOG.warning(_LW(
                'Ignoring instruction to debug all routers with %r'),
                router_id)
        else:
            LOG.info(_LI('Placing router %s in debug mode (reason: %s)'),
                     router_id, reason)
            self.db_api.enable_router_debug(router_id, reason)

    elif instructions['command'] == commands.ROUTER_MANAGE:
        router_id = instructions['router_id']
        try:
            self.db_api.disable_router_debug(router_id)
            LOG.info(_LI('Resuming management of router %s'), router_id)
        except KeyError:
            pass
        try:
            self._router_locks[router_id].release()
            LOG.info(_LI('Unlocked router %s'), router_id)
        except KeyError:
            pass
        except threading.ThreadError:
            # Already unlocked, that's OK.
            pass

    elif instructions['command'] in self._EVENT_COMMANDS:
        new_msg = event.Event(
            tenant_id=message.tenant_id,
            router_id=message.router_id,
            crud=self._EVENT_COMMANDS[instructions['command']],
            body=instructions,
        )
        # Use handle_message() to ensure we acquire the lock
        LOG.info(_LI('sending %s instruction to %s'),
                 instructions['command'], message.tenant_id)
        self.handle_message(new_msg.tenant_id, new_msg)
        LOG.info(_LI('forced %s for %s complete'),
                 instructions['command'], message.tenant_id)

    elif instructions['command'] == commands.TENANT_DEBUG:
        tenant_id = instructions['tenant_id']
        reason = instructions.get('reason')
        if tenant_id in commands.WILDCARDS:
            LOG.warning(_LW(
                'Ignoring instruction to debug all tenants with %r'),
                tenant_id)
        else:
            LOG.info(_LI('Placing tenant %s in debug mode (reason: %s)'),
                     tenant_id, reason)
            self.db_api.enable_tenant_debug(tenant_id, reason)

    elif instructions['command'] == commands.TENANT_MANAGE:
        tenant_id = instructions['tenant_id']
        try:
            self.db_api.disable_tenant_debug(tenant_id)
            LOG.info(_LI('Resuming management of tenant %s'), tenant_id)
        except KeyError:
            pass

    elif instructions['command'] == commands.GLOBAL_DEBUG:
        enable = instructions.get('enabled')
        reason = instructions.get('reason')
        if enable == 1:
            LOG.info('Enabling global debug mode (reason: %s)', reason)
            self.db_api.enable_global_debug(reason)
        elif enable == 0:
            LOG.info('Disabling global debug mode')
            self.db_api.disable_global_debug()
        else:
            LOG.warning('Unrecognized global debug command: %s',
                        instructions)

    elif instructions['command'] == commands.CONFIG_RELOAD:
        try:
            cfg.CONF()
        except Exception:
            LOG.exception(_LE('Could not reload configuration'))
        else:
            cfg.CONF.log_opt_values(LOG, INFO)

    else:
        LOG.warning(_LW('Unrecognized command: %s'), instructions)
def get_state_machines(self, message, worker_context):
    """Return the state machines and the queue for sending it messages for
    the router being addressed by the message.
    """
    router_id = message.router_id
    if not router_id:
        LOG.error(_LE('Cannot get state machine for message with '
                      'no router_id'))
        raise InvalidIncomingMessage()

    # Ignore messages to deleted routers.
    if self.state_machines.has_been_deleted(router_id):
        LOG.debug('dropping message for deleted router')
        return []

    state_machines = []

    # Send to all of our routers.
    if router_id == '*':
        LOG.debug('routing to all state machines')
        state_machines = self.state_machines.values()

    # Send to routers that have an ERROR status
    elif router_id == 'error':
        state_machines = [
            sm for sm in self.state_machines.values()
            if sm.has_error()
        ]
        LOG.debug('routing to %d errored state machines',
                  len(state_machines))

    # Create a new state machine for this router.
    elif router_id not in self.state_machines:
        LOG.debug('creating state machine for %s', router_id)

        def deleter():
            self._delete_router(router_id)

        sm = state.Automaton(
            router_id=router_id,
            tenant_id=self.tenant_id,
            delete_callback=deleter,
            bandwidth_callback=self._report_bandwidth,
            worker_context=worker_context,
            queue_warning_threshold=self._queue_warning_threshold,
            reboot_error_threshold=self._reboot_error_threshold,
        )
        self.state_machines[router_id] = sm
        state_machines = [sm]

    # Send directly to an existing router.
    elif router_id:
        sm = self.state_machines[router_id]
        state_machines = [sm]

    # Filter out any deleted state machines.
    return [
        machine for machine in state_machines
        if (not machine.deleted and
            not self.state_machines.has_been_deleted(machine.router_id))
    ]