def run(self):
    """Dequeue work items forever and hand each one to a worker process.

    The number of concurrently running workers is kept below half of the
    CPU count reported by libvirt's getCPUMap(). Any exception raised
    inside a loop iteration is passed to util.ignore_exception() and the
    loop carries on.
    """
    LOG.info('Starting Queues')
    active = []

    libvirt = util.get_libvirt()
    conn = libvirt.open(None)
    present_cpus, _, _ = conn.getCPUMap()

    while True:
        try:
            # Reap workers which have finished. We walk a snapshot of the
            # list because entries are removed as we go.
            for proc in list(active):
                if proc.is_alive():
                    continue
                proc.join(1)
                active.remove(proc)

            # Only ask for more work while there is spare worker capacity.
            workitem = None
            if len(active) < present_cpus / 2:
                jobname, workitem = db.dequeue(config.NODE_NAME)

            if not workitem:
                time.sleep(0.2)
                continue

            worker = multiprocessing.Process(
                target=handle,
                args=(jobname, workitem),
                name='%s-worker' % daemon.process_name('queues'))
            worker.start()
            active.append(worker)

        except Exception as e:
            util.ignore_exception(daemon.process_name('queues'), e)
def run(self):
    """Loop until the exit event is set, running cluster-wide cleanup.

    Each pass waits on self._await_election(). If this process is elected
    and has not been asked to exit, the lock is refreshed, the cleanup is
    run with the time of the previous cleanup, and the lock is refreshed
    again. Every pass ends with a wait of up to 60 seconds on the exit
    event.
    """
    LOG.info('Starting')

    previous_run = 0
    while not self.exit.is_set():
        setproctitle.setproctitle(daemon.process_name('cluster') + ' idle')
        self._await_election()

        # Re-check the exit event, as it may have been set while we were
        # awaiting the election.
        should_clean = self.is_elected and not self.exit.is_set()
        if should_clean:
            setproctitle.setproctitle(
                daemon.process_name('cluster') + ' active')

            self.lock.refresh()
            self._cluster_wide_cleanup(previous_run)
            previous_run = time.time()
            self.lock.refresh()

        self.exit.wait(60)
def run(self):
    """Keep one console log observer process per 'created' local instance.

    Once a second this loop:

    * reaps observers which have exited and records a 'crashed' event;
    * starts an observer for each instance on this node which is in the
      'created' state and does not already have one;
    * kills and reaps observers whose instance is no longer returned by
      db.get_instances() for this node.
    """
    logutil.info(None, 'Starting')
    observers = {}

    while True:
        # Cleanup terminated observers. Iterate over a copy of the keys as
        # entries are deleted inside the loop.
        for instance_uuid in list(observers):
            if not observers[instance_uuid].is_alive():
                # Reap process
                observers[instance_uuid].join(1)
                logutil.info([virt.ThinInstance(instance_uuid)],
                             'Trigger observer has terminated')
                db.add_event('instance', instance_uuid,
                             'trigger monitor', 'crashed', None, None)
                del observers[instance_uuid]

        # Start missing observers. Anything left in extra_instances after
        # this loop is an observer for an instance the database no longer
        # reports for this node.
        extra_instances = list(observers)

        for inst in db.get_instances(
                only_node=config.parsed.get('NODE_NAME')):
            if inst['uuid'] in extra_instances:
                extra_instances.remove(inst['uuid'])

            if inst['state'] != 'created':
                continue

            if inst['uuid'] not in observers:
                console_path = os.path.join(
                    config.parsed.get('STORAGE_PATH'), 'instances',
                    inst['uuid'], 'console.log')
                p = multiprocessing.Process(
                    target=observe, args=(console_path, inst['uuid']),
                    name='%s-%s' % (daemon.process_name('triggers'),
                                    inst['uuid']))
                p.start()

                observers[inst['uuid']] = p
                logutil.info([virt.ThinInstance(inst['uuid'])],
                             'Started trigger observer')
                db.add_event('instance', inst['uuid'],
                             'trigger monitor', 'started', None, None)

        # Cleanup extra observers
        for instance_uuid in extra_instances:
            p = observers[instance_uuid]
            try:
                os.kill(p.pid, signal.SIGKILL)
            except OSError:
                # The observer has already exited, there is nothing to kill.
                pass

            # Reap the process, otherwise the killed child lingers as a
            # zombie until this daemon itself exits.
            p.join(1)

            del observers[instance_uuid]
            logutil.info([virt.ThinInstance(instance_uuid)],
                         'Finished trigger observer')
            db.add_event('instance', instance_uuid,
                         'trigger monitor', 'finished', None, None)

        time.sleep(1)
def observe(path, instance_uuid):
    """Follow a console log and record an event for each trigger match.

    Waits for path to exist, seeks to its end, and then polls it once a
    second forever. Each non-empty line seen (including a trailing partial
    line, since a login prompt is not newline terminated) is matched
    against the trigger regexps, and every match is logged and recorded
    with db.add_event(). This function never returns.

    path: the console log file to follow.
    instance_uuid: the instance the events are recorded against.
    """
    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('triggers'), instance_uuid))
    regexps = {'login prompt': ['^.* login: .*', re.compile('.* login: .*')]}

    while not os.path.exists(path):
        time.sleep(1)
    fd = os.open(path, os.O_RDONLY | os.O_NONBLOCK)

    logutil.info([virt.ThinInstance(instance_uuid)],
                 'Monitoring %s for triggers' % path)
    db.add_event('instance', instance_uuid, 'trigger monitor',
                 'detected console log', None, None)
    os.lseek(fd, 0, os.SEEK_END)

    buffer = ''
    while True:
        # Console output is not guaranteed to be valid UTF-8, and a read can
        # end in the middle of a multi-byte character. Drop undecodable
        # bytes instead of letting UnicodeDecodeError kill the observer.
        d = os.read(fd, 1024).decode('utf-8', errors='ignore')
        if d:
            buffer += d
            lines = buffer.split('\n')
            # The final element is the (possibly empty) incomplete line,
            # carry it over to the next read.
            buffer = lines[-1]

            for line in lines:
                if line:
                    for trigger in regexps:
                        m = regexps[trigger][1].match(line)
                        if m:
                            logutil.info(
                                [virt.ThinInstance(instance_uuid)],
                                'Trigger %s matched' % trigger)
                            db.add_event('instance', instance_uuid,
                                         'trigger', None, None, trigger)

        time.sleep(1)
def run(self):
    """Run the configured API command line via util.execute().

    The API_COMMAND_LINE template is filled in with the configured port
    and timeout plus this daemon's process name, and is executed with the
    current process environment.
    """
    LOG.info('Starting')

    template = config.get('API_COMMAND_LINE')
    values = {
        'port': config.get('API_PORT'),
        'timeout': config.get('API_TIMEOUT'),
        'name': daemon.process_name('api')
    }
    command = template % values

    util.execute(None, command, env_variables=os.environ)
def run(self):
    """Run the configured API command line via util_process.execute().

    The worker count substituted into the command line is four times the
    CPU count reported by libvirt's getCPUMap(). /var/run/sf is created
    if missing before the command is executed. Exit codes 0, 1 and -15
    are passed to execute() as check_exit_code.
    """
    LOG.info('Starting')

    libvirt = util_libvirt.get_libvirt()
    hypervisor = libvirt.open('qemu:///system')
    cpu_count, _, _ = hypervisor.getCPUMap()

    os.makedirs('/var/run/sf', exist_ok=True)

    template = config.API_COMMAND_LINE
    values = {
        'port': config.API_PORT,
        'timeout': config.API_TIMEOUT,
        'name': daemon.process_name('api'),
        'workers': cpu_count * 4
    }
    command = template % values

    util_process.execute(
        None, command, env_variables=os.environ,
        check_exit_code=[0, 1, -15])
def observe(path, instance_uuid):
    """Follow a console log and record an event for each trigger match.

    Blocks until path exists, then reads it forever starting at most 4KB
    before its current end. Every non-empty line seen (including the
    trailing partial line) is matched against the trigger regexps, and
    each match is logged and recorded with db.add_event(). This function
    never returns.

    path: the console log file to follow.
    instance_uuid: the instance the events are recorded against.
    """
    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('triggers'), instance_uuid))
    regexps = {'login prompt': ['^.* login: .*', re.compile('.* login: .*')]}

    while not os.path.exists(path):
        time.sleep(1)
    fd = os.open(path, os.O_RDONLY | os.O_NONBLOCK)

    log_ctx = LOG.withInstance(instance_uuid)
    log_ctx.withField('path', path).info('Monitoring path for triggers')
    db.add_event('instance', instance_uuid, 'trigger monitor',
                 'detected console log', None, None)

    # Sometimes the trigger process is slow to start, so rewind 4KB to ensure
    # that the last few log lines are not missed. (4KB since Cloud-Init can be
    # noisy after the login prompt.)
    start_offset = max(0, os.fstat(fd).st_size - 4096)
    os.lseek(fd, start_offset, os.SEEK_SET)

    pending = ''
    while True:
        chunk = os.read(fd, 1024).decode('utf-8', errors='ignore')
        if not chunk:
            # Only pause if there was no data to read
            time.sleep(1)
            continue

        pending += chunk
        candidates = pending.split('\n')
        # Whatever follows the final newline is carried over to the next
        # read (and is still matched below).
        pending = candidates[-1]

        for candidate in candidates:
            if not candidate:
                continue

            for trigger, (_, pattern) in regexps.items():
                if pattern.match(candidate):
                    log_ctx.withField(
                        'trigger', trigger,
                    ).info('Trigger matched')
                    db.add_event('instance', instance_uuid,
                                 'trigger', None, None, trigger)
def main():
    """Start this node's daemons and supervise them forever.

    On startup the configuration is logged, stale locks are cleared and
    queues restarted, the 'resources' daemon is started, and (on the
    network node) the floating network and physical bridge are set up.
    The remaining daemons in DAEMON_IMPLEMENTATIONS are then started and
    instances restored. After that, every ten seconds, exited children
    are reaped, missing daemons are restarted and the node checks in with
    db.see_this_node().
    """
    # Imported locally because the file's top-level imports are not visible
    # from this block.
    import sys

    global DAEMON_IMPLEMENTATIONS
    global DAEMON_PIDS

    setproctitle.setproctitle(daemon.process_name('main'))

    # Log configuration on startup
    for key, value in config.dict().items():
        LOG.info('Configuration item %s = %s' % (key, value))

    daemon.set_log_level(LOG, 'main')

    # Check in early and often, also reset processing queue items
    db.clear_stale_locks()
    db.see_this_node()
    db.restart_queues()

    def _start_daemon(d):
        """Fork a child which runs daemon d, and record its pid."""
        pid = os.fork()
        if pid == 0:
            # We are the child. Never fall through into the parent's code
            # path below: exit once the daemon's run() returns or fails.
            try:
                DAEMON_IMPLEMENTATIONS[d].Monitor(d).run()
                sys.exit(0)
            except Exception as e:
                util.ignore_exception('daemon creation', e)
                sys.exit(1)

        DAEMON_PIDS[pid] = d
        LOG.withField('pid', pid).info('Started %s' % d)

    # Resource usage publisher, we need this early because scheduling decisions
    # might happen quite early on.
    _start_daemon('resources')

    # If I am the network node, I need some setup
    if util.is_network_node():
        # Bootstrap the floating network in the Networks table
        floating_network = db.get_network('floating')
        if not floating_network:
            db.create_floating_network(config.get('FLOATING_NETWORK'))
            floating_network = net.from_db('floating')

        subst = {
            'physical_bridge': util.get_safe_interface_name(
                'phy-br-%s' % config.get('NODE_EGRESS_NIC')),
            'physical_nic': config.get('NODE_EGRESS_NIC')
        }

        if not util.check_for_interface(subst['physical_bridge']):
            # NOTE(mikal): Adding the physical interface to the physical bridge
            # is considered outside the scope of the orchestration software as
            # it will cause the node to lose network connectivity. So instead
            # all we do is create a bridge if it doesn't exist and the wire
            # everything up to it. We can do egress NAT in that state, even if
            # floating IPs don't work.
            with util.RecordedOperation('create physical bridge', None):
                # No locking as read only
                ipm = db.get_ipmanager('floating')
                subst['master_float'] = ipm.get_address_at_index(1)
                subst['netmask'] = ipm.netmask

                util.create_interface(subst['physical_bridge'], 'bridge', '')
                util.execute(None,
                             'ip link set %(physical_bridge)s up' % subst)
                util.execute(
                    None,
                    'ip addr add %(master_float)s/%(netmask)s '
                    'dev %(physical_bridge)s' % subst)

                util.execute(
                    None,
                    'iptables -A FORWARD -o %(physical_nic)s '
                    '-i %(physical_bridge)s -j ACCEPT' % subst)
                util.execute(
                    None,
                    'iptables -A FORWARD -i %(physical_nic)s '
                    '-o %(physical_bridge)s -j ACCEPT' % subst)
                util.execute(
                    None,
                    'iptables -t nat -A POSTROUTING '
                    '-o %(physical_nic)s -j MASQUERADE' % subst)

    def _audit_daemons():
        """Start any daemon which is not running or whose pid has vanished."""
        running_daemons = list(DAEMON_PIDS.values())

        for d in DAEMON_IMPLEMENTATIONS:
            if d not in running_daemons:
                _start_daemon(d)

        # Iterate over a snapshot: _start_daemon() adds to DAEMON_PIDS, and
        # mutating a dict while iterating it raises RuntimeError.
        for pid in list(DAEMON_PIDS):
            if not psutil.pid_exists(pid):
                # Forget the vanished pid, otherwise this daemon would be
                # started again on every single audit.
                name = DAEMON_PIDS.pop(pid)
                LOG.warning('%s pid is missing, restarting' % name)
                _start_daemon(name)

    _audit_daemons()
    restore_instances()

    while True:
        time.sleep(10)

        try:
            wpid, _ = os.waitpid(-1, os.WNOHANG)
            while wpid != 0:
                LOG.warning('%s died (pid %d)'
                            % (DAEMON_PIDS.get(wpid, 'unknown'), wpid))
                # The pid may not be one of ours, so do not insist that it
                # is present.
                DAEMON_PIDS.pop(wpid, None)
                wpid, _ = os.waitpid(-1, os.WNOHANG)
        except ChildProcessError:
            # We get this if there are no child processes
            pass

        _audit_daemons()
        db.see_this_node()
def handle(jobname, workitem):
    """Execute each task in a dequeued work item, then resolve the item.

    Tasks run in order. A preflight task which asks for a redirect
    re-enqueues the whole work item on the other node and stops
    processing. Failures of delete and error tasks are logged and ignored.
    Any other exception stops processing and, if an instance uuid is
    known, enqueues an instance error. In every case the work item is
    resolved with db.resolve() before returning.

    jobname: the name of the work item, used for logging and db.resolve().
    workitem: a dict whose 'tasks' entry is a list of QueueTask objects.
    """
    log = LOG.withField('workitem', jobname)
    log.info('Processing workitem')

    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            if not isinstance(task, QueueTask):
                raise exceptions.UnknownTaskException(
                    'Task was not decoded: %s' % task)

            if isinstance(task, (InstanceTask, FetchImageTask)):
                instance_uuid = task.instance_uuid()

            if instance_uuid:
                log_i = log.withInstance(instance_uuid)
            else:
                log_i = log

            log_i.withField('task_name', task.name()).info('Starting task')

            # TODO(andy) Should network events also come through here eventually?
            # Then this can be generalised to record events on networks/instances

            # TODO(andy) This event should be recorded when it is recorded as
            # dequeued in the DB. Currently it's reporting action on the item
            # and calling it 'dequeue'.

            if instance_uuid:
                # TODO(andy) move to QueueTask
                db.add_event('instance', instance_uuid,
                             task.pretty_task_name(), 'dequeued', None,
                             'Work item %s' % jobname)

            if isinstance(task, FetchImageTask):
                image_fetch(task.url(), instance_uuid)

            elif isinstance(task, PreflightInstanceTask):
                redirect_to = instance_preflight(instance_uuid, task.network())
                if redirect_to:
                    log_i.info('Redirecting instance start to %s'
                               % redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            elif isinstance(task, StartInstanceTask):
                instance_start(instance_uuid, task.network())
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.NODE_NAME, {})

            elif isinstance(task, DeleteInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'deleted')
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            elif isinstance(task, ErrorInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'error')
                    if task.error_msg():
                        db.update_instance_error_message(
                            instance_uuid, task.error_msg())
                    db.enqueue('%s-metrics' % config.NODE_NAME, {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            else:
                log_i.withField('task', task).error('Unhandled task - dropped')

            log_i.info('Task complete')

    except exceptions.ImageFetchTaskFailedException as e:
        # Usually caused by external issue and not an application error
        log.info('Fetch Image Error: %s', e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    except Exception as e:
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    finally:
        # Runs on every exit path, including the early return on redirect.
        db.resolve(config.NODE_NAME, jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        log.info('Completed workitem')
def handle(jobname, workitem):
    """Execute each task in a dequeued work item, then resolve the item.

    Tasks run in order. Instance tasks load their instance from the
    database and fail if it is missing. A preflight task which asks for a
    redirect re-enqueues the whole work item on the other node and stops
    processing. Image fetch, image resize, libvirt, instance and all other
    exceptions stop processing and, if an instance is known, enqueue that
    instance for deletion due to error. In every case the work item is
    resolved with etcd.resolve() before returning.

    jobname: the name of the work item, used for logging and resolving.
    workitem: a dict whose 'tasks' entry is a list of QueueTask objects.
    """
    libvirt = util_libvirt.get_libvirt()

    log = LOG.with_field('workitem', jobname)
    log.info('Processing workitem')

    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('queues'), jobname))

    inst = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            if not isinstance(task, QueueTask):
                raise exceptions.UnknownTaskException(
                    'Task was not decoded: %s' % task)

            if isinstance(task, InstanceTask):
                inst = instance.Instance.from_db(task.instance_uuid())
                if not inst:
                    raise exceptions.InstanceNotInDBException(
                        task.instance_uuid())

            # These tasks carry an instance uuid as well, but a missing
            # instance is not treated as an error for them.
            if isinstance(task, (FetchImageTask, SnapshotTask)):
                inst = instance.Instance.from_db(task.instance_uuid())

            if inst:
                log_i = log.with_instance(inst)
            else:
                log_i = log

            log_i.with_field('task_name', task.name()).info('Starting task')

            # TODO(andy) Should network events also come through here eventually?
            # Then this can be generalised to record events on networks/instances

            # TODO(andy) This event should be recorded when it is recorded as
            # dequeued in the DB. Currently it's reporting action on the item
            # and calling it 'dequeue'.

            if inst:
                # TODO(andy) move to QueueTask
                db.add_event('instance', inst.uuid, task.pretty_task_name(),
                             'dequeued', None, 'Work item %s' % jobname)

            if isinstance(task, FetchImageTask):
                image_fetch(task.url(), inst)

            elif isinstance(task, PreflightInstanceTask):
                if (inst.state.value == dbo.STATE_DELETED or
                        inst.state.value.endswith('-error')):
                    log_i.warning(
                        'You cannot preflight an instance in state %s, skipping task'
                        % inst.state.value)
                    continue

                redirect_to = instance_preflight(inst, task.network())
                if redirect_to:
                    log_i.info('Redirecting instance start to %s'
                               % redirect_to)
                    etcd.enqueue(redirect_to, workitem)
                    return

            elif isinstance(task, StartInstanceTask):
                if (inst.state.value == dbo.STATE_DELETED or
                        inst.state.value.endswith('-error')):
                    log_i.warning(
                        'You cannot start an instance in state %s, skipping task'
                        % inst.state.value)
                    continue

                instance_start(inst, task.network())
                etcd.enqueue('%s-metrics' % config.NODE_NAME, {})

            elif isinstance(task, DeleteInstanceTask):
                try:
                    instance_delete(inst)
                    etcd.enqueue('%s-metrics' % config.NODE_NAME, {})
                except Exception as e:
                    util_general.ignore_exception(
                        'instance %s delete task' % inst, e)

            elif isinstance(task, FloatNetworkInterfaceTask):
                # Just punt it to the network node now that the interface is ready
                etcd.enqueue('networknode', task)

            elif isinstance(task, SnapshotTask):
                snapshot(inst, task.disk(),
                         task.artifact_uuid(), task.blob_uuid())

            elif isinstance(task, DeleteNetworkWhenClean):
                # Check if any interfaces remain on network
                task_network = net.Network.from_db(task.network_uuid())
                ifaces = networkinterface.interfaces_for_network(task_network)
                cur_interfaces = {i.uuid: i for i in ifaces}
                if cur_interfaces:
                    LOG.with_network(task_network).error(
                        'During DeleteNetworkWhenClean new interfaces have '
                        'connected to network: %s', cur_interfaces)

                # Only check those present at delete task initiation time.
                remain_interfaces = list(
                    set(task.wait_interfaces()) & set(cur_interfaces))
                if remain_interfaces:
                    # Queue task on a node with a remaining instance. This
                    # deliberately does not rebind inst: the error handlers
                    # and the finally clause below act on inst, and must
                    # not be pointed at an instance which merely shares the
                    # network being deleted.
                    first_iface = cur_interfaces[remain_interfaces[0]]
                    iface_inst = instance.Instance.from_db(
                        first_iface.instance_uuid)
                    etcd.enqueue(iface_inst.placement['node'],
                                 {
                                     'tasks': [
                                         DeleteNetworkWhenClean(
                                             task.network_uuid(),
                                             remain_interfaces)
                                     ]
                                 },
                                 delay=60)

                else:
                    # All original instances deleted, safe to delete network
                    etcd.enqueue('networknode',
                                 DestroyNetworkTask(task.network_uuid()))

            elif isinstance(task, HypervisorDestroyNetworkTask):
                n = net.Network.from_db(task.network_uuid())
                n.delete_on_hypervisor()

            elif isinstance(task, FetchBlobTask):
                metrics = etcd.get('metrics', config.NODE_NAME, None)
                if metrics:
                    metrics = metrics.get('metrics', {})
                else:
                    metrics = {}

                b = blob.Blob.from_db(task.blob_uuid())
                if not b:
                    log.with_fields({
                        'blob': task.blob_uuid()
                    }).info('Cannot replicate blob, not found')

                elif (int(metrics.get('disk_free_blobs', 0)) - int(b.size) <
                        config.MINIMUM_FREE_DISK):
                    log.with_fields({
                        'blob': task.blob_uuid()
                    }).info('Cannot replicate blob, insufficient space')

                else:
                    log.with_object(b).info('Replicating blob')
                    size = b.ensure_local([])
                    log.with_object(b).with_fields({
                        'transferred': size,
                        'expected': b.size
                    }).info('Replicating blob complete')

            else:
                log_i.with_field('task', task).error('Unhandled task - dropped')

            log_i.info('Task complete')

    except exceptions.ImageFetchTaskFailedException as e:
        # Usually caused by external issue and not an application error
        log.info('Fetch Image Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Image fetch failed: %s' % e)

    except exceptions.ImagesCannotShrinkException as e:
        log.info('Fetch Resize Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Image resize failed: %s' % e)

    except libvirt.libvirtError as e:
        log.info('Libvirt Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Instance task failed: %s' % e)

    except exceptions.InstanceException as e:
        log.info('Instance Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Instance task failed: %s' % e)

    except Exception as e:
        # Logging ignored exception - this should be investigated
        util_general.ignore_exception('queue worker', e)
        if inst:
            inst.enqueue_delete_due_error('Failed queue task: %s' % e)

    finally:
        # Runs on every exit path, including the early return on redirect.
        etcd.resolve(config.NODE_NAME, jobname)
        if inst:
            inst.add_event('tasks complete', 'dequeued',
                           msg='Work item %s' % jobname)
        log.info('Completed workitem')
def run(self):
    """Keep one console log observer process per created local instance.

    Until the exit event is set, once a second this loop:

    * reaps observers which have exited and records a 'crashed' event;
    * starts an observer for each instance matching this node and the
      created state which does not already have one;
    * kills and reaps observers whose instance no longer matches.

    Once the exit event is set, all remaining observers are killed and
    reaped.
    """
    LOG.info('Starting')
    observers = {}

    def _kill_and_reap(proc):
        """SIGKILL an observer process and reap it, tolerating early exit."""
        try:
            os.kill(proc.pid, signal.SIGKILL)
        except OSError:
            # The observer has already exited, there is nothing to kill.
            pass

        # Always reap, even when the kill failed, so that no zombie is left
        # behind.
        proc.join(1)

    while not self.exit.is_set():
        # Cleanup terminated observers. Iterate over a copy of the keys as
        # entries are deleted inside the loop.
        for instance_uuid in list(observers):
            if not observers[instance_uuid].is_alive():
                # Reap process
                observers[instance_uuid].join(1)
                LOG.with_instance(instance_uuid).info(
                    'Trigger observer has terminated')
                db.add_event(
                    'instance', instance_uuid, 'trigger monitor',
                    'crashed', None, None)
                del observers[instance_uuid]

        # Audit desired observers
        extra_instances = list(observers)
        missing_instances = []

        with etcd.ThreadLocalReadOnlyCache():
            for inst in instance.Instances([
                    instance.this_node_filter,
                    partial(baseobject.state_filter,
                            [instance.Instance.STATE_CREATED])]):
                if inst.uuid in extra_instances:
                    extra_instances.remove(inst.uuid)

                if inst.uuid not in observers:
                    missing_instances.append(inst.uuid)

        # Start missing observers
        for instance_uuid in missing_instances:
            console_path = os.path.join(
                config.STORAGE_PATH, 'instances', instance_uuid,
                'console.log')
            p = multiprocessing.Process(
                target=observe, args=(console_path, instance_uuid),
                name='%s-%s' % (daemon.process_name('triggers'),
                                instance_uuid))
            p.start()

            observers[instance_uuid] = p
            LOG.with_instance(instance_uuid).info(
                'Started trigger observer')
            db.add_event(
                'instance', instance_uuid, 'trigger monitor',
                'started', None, None)

        # Cleanup extra observers
        for instance_uuid in extra_instances:
            _kill_and_reap(observers[instance_uuid])

            del observers[instance_uuid]
            LOG.with_instance(instance_uuid).info(
                'Finished trigger observer')
            db.add_event(
                'instance', instance_uuid, 'trigger monitor',
                'finished', None, None)

        self.exit.wait(1)

    # No longer running, clean up all trigger daemons. An observer which
    # has already exited must not stop the rest from being cleaned up.
    for proc in observers.values():
        _kill_and_reap(proc)
def observe(path, instance_uuid):
    """Follow a console log and record an event for each trigger match.

    Blocks until path exists, then reads it starting at most 4KB before
    its current end. Every non-empty line seen (including the trailing
    partial line) is matched against the trigger regexps, and each match
    is logged and recorded with db.add_event().

    Returns when the file disappears or shrinks; the caller is expected to
    start a new observer in that case.

    path: the console log file to follow.
    instance_uuid: the instance the events are recorded against.
    """
    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('triggers'), instance_uuid))
    regexps = {
        'login prompt': re.compile('.* login: .*'),
        'user-data script start': re.compile(
            '.*Starting.*Execute cloud user/final scripts.*'),
        'user-data script end': re.compile(
            '.*Finished.*Execute cloud user/final scripts.*'),
        'cloud-init complete': re.compile(
            '.*Reached target.*Cloud-init target.*')
    }

    while not os.path.exists(path):
        time.sleep(1)
    fd = os.open(path, os.O_RDONLY | os.O_NONBLOCK)

    try:
        log_ctx = LOG.with_instance(instance_uuid)
        log_ctx.with_field('path', path).info('Monitoring path for triggers')
        db.add_event('instance', instance_uuid, 'trigger monitor',
                     'detected console log', None, None)

        # Sometimes the trigger process is slow to start, so rewind 4KB to
        # ensure that the last few log lines are not missed. (4KB since
        # Cloud-Init can be noisy after the login prompt.)
        os.lseek(fd, max(0, os.fstat(fd).st_size - 4096), os.SEEK_SET)

        # Record how long the file is, because we need to detect truncations
        # and re-open.
        try:
            previous_size = os.stat(path).st_size
        except FileNotFoundError:
            # The file was removed between the open and the stat.
            return

        buffer = ''
        while True:
            # Detect file removals and truncations, and die if we see one.
            # We will be restarted by the monitor process. The stat is
            # attempted directly instead of checking for existence first,
            # as the file can vanish between the check and the stat.
            try:
                size = os.stat(path).st_size
            except FileNotFoundError:
                return
            if size < previous_size:
                return
            previous_size = size

            # Read data, os.read() is non-blocking by the way.
            d = os.read(fd, 1024).decode('utf-8', errors='ignore')
            if d:
                buffer += d
                lines = buffer.split('\n')
                # The final element is the (possibly empty) incomplete line,
                # carry it over to the next read.
                buffer = lines[-1]

                for line in lines:
                    if line:
                        for trigger in regexps:
                            m = regexps[trigger].match(line)
                            if m:
                                log_ctx.with_field(
                                    'trigger', trigger,
                                ).info('Trigger matched')
                                db.add_event('instance', instance_uuid,
                                             'trigger', None, None, trigger)
            else:
                # Only pause if there was no data to read
                time.sleep(0.2)

    finally:
        # Do not leak the descriptor on any of the return paths above.
        os.close(fd)
def handle(jobname, workitem):
    """Execute each task in a dequeued work item, then resolve the item.

    Each task is a dict with a 'type' and, for instance tasks, an
    'instance_uuid'. Tasks run in order. An instance task without an
    instance uuid, or a preflight which asks for a redirect, stops
    processing. Any exception stops processing, is logged, and (if an
    instance uuid is known) enqueues deletion of that instance into the
    'error' state. In every case the work item is resolved with
    db.resolve() before returning.

    jobname: the name of the work item, used for logging and db.resolve().
    workitem: a dict whose 'tasks' entry is a list of task dicts.
    """
    j = JobName(jobname)
    logutil.info([j], 'Processing workitem')
    setproctitle.setproctitle(
        '%s-%s' % (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            ro = [j]
            instance_uuid = task.get('instance_uuid')
            if instance_uuid:
                i = virt.from_db(instance_uuid)
                ro.append(i)

            # A task without a type is treated as 'unknown' throughout,
            # instead of failing with AttributeError on the first string
            # operation.
            task_type = task.get('type', 'unknown')

            if task_type.startswith('instance_') and not instance_uuid:
                logutil.error(ro, 'Instance task lacks instance uuid')
                return

            if instance_uuid:
                db.add_event('instance', instance_uuid,
                             task_type.replace('_', ' '), 'dequeued', None,
                             'Work item %s' % jobname)

            logutil.info(
                ro, 'Executing task %s: %s' % (task_type, task))

            if task_type == 'image_fetch':
                image_fetch(task.get('url'), instance_uuid)

            if task_type == 'instance_preflight':
                redirect_to = instance_preflight(instance_uuid,
                                                 task.get('network'))
                if redirect_to:
                    logutil.info(
                        ro, 'Redirecting instance start to %s' % redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            if task_type == 'instance_start':
                instance_start(instance_uuid, task.get('network'))
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'), {})

            if task_type == 'instance_delete':
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(
                        instance_uuid, task.get('next_state', 'unknown'))
                    if task.get('next_state_message'):
                        db.update_instance_error_message(
                            instance_uuid, task.get('next_state_message'))
                    db.enqueue(
                        '%s-metrics' % config.parsed.get('NODE_NAME'), {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

    except Exception as e:
        # Always log the failure. Previously an exception for a work item
        # with no instance uuid was dropped without any trace.
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                       instance_uuid, 'error',
                                       'failed queue task: %s' % e)

    finally:
        # Runs on every exit path, including the early returns above.
        db.resolve(config.parsed.get('NODE_NAME'), jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        logutil.info([j], 'Completed workitem')
def main():
    """Start this node's daemons and supervise them until the node stops.

    On startup the node is marked as created, the configuration is
    logged, stale locks are cleared and queues restarted, the 'resources'
    daemon is started, and (on the network node) the floating network and
    egress bridge are set up. The remaining daemons in
    DAEMON_IMPLEMENTATIONS are then started and instances restored.

    After that, every five seconds, exited children are reaped. While the
    node is not stopping or stopped, missing daemons are restarted and the
    node is observed. Once the node is stopping, SIGTERM is sent once to
    every daemon; when none remain the node is marked stopped and this
    function returns.
    """
    global DAEMON_IMPLEMENTATIONS
    global DAEMON_PIDS

    LOG.info('Starting...')
    setproctitle.setproctitle(
        daemon.process_name('main') + '-v%s' % util_general.get_version())

    # If you ran this, it means we're not shutting down any more
    n = Node.new(config.NODE_NAME, config.NODE_MESH_IP)
    n.state = Node.STATE_CREATED

    # Log configuration on startup
    for key, value in config.dict().items():
        LOG.info('Configuration item %s = %s' % (key, value))

    daemon.set_log_level(LOG, 'main')

    # Check in early and often, also reset processing queue items.
    etcd.clear_stale_locks()
    Node.observe_this_node()
    etcd.restart_queues()

    def _start_daemon(d):
        """Fork a child which runs daemon d, and record its pid."""
        pid = os.fork()
        if pid == 0:
            # We are the child. Always exit here so that the child never
            # falls through into the parent's code path below.
            try:
                DAEMON_IMPLEMENTATIONS[d].Monitor(d).run()
                sys.exit(0)
            except Exception as e:
                util_general.ignore_exception('daemon creation', e)
                sys.exit(1)

        DAEMON_PIDS[pid] = d
        LOG.with_field('pid', pid).info('Started %s' % d)

    # Resource usage publisher, we need this early because scheduling decisions
    # might happen quite early on.
    _start_daemon('resources')

    # If I am the network node, I need some setup
    if config.NODE_IS_NETWORK_NODE:
        # Bootstrap the floating network in the Networks table
        floating_network = net.Network.from_db('floating')
        if not floating_network:
            floating_network = net.Network.create_floating_network(
                config.FLOATING_NETWORK)

        subst = {
            'egress_bridge': util_network.get_safe_interface_name(
                'egr-br-%s' % config.NODE_EGRESS_NIC),
            'egress_nic': config.NODE_EGRESS_NIC
        }

        if not util_network.check_for_interface(subst['egress_bridge']):
            # NOTE(mikal): Adding the physical interface to the physical bridge
            # is considered outside the scope of the orchestration software as
            # it will cause the node to lose network connectivity. So instead
            # all we do is create a bridge if it doesn't exist and the wire
            # everything up to it. We can do egress NAT in that state, even if
            # floating IPs don't work.
            with util_general.RecordedOperation('create physical bridge', None):
                # No locking as read only
                ipm = IPManager.from_db('floating')
                subst['master_float'] = ipm.get_address_at_index(1)
                subst['netmask'] = ipm.netmask

                # We need to copy the MTU of the interface we are bridging to
                # or weird networking things happen.
                mtu = util_network.get_interface_mtu(config.NODE_EGRESS_NIC)

                util_network.create_interface(
                    subst['egress_bridge'], 'bridge', '', mtu=mtu)

                util_process.execute(
                    None, 'ip link set %(egress_bridge)s up' % subst)
                util_process.execute(
                    None,
                    'ip addr add %(master_float)s/%(netmask)s '
                    'dev %(egress_bridge)s' % subst)

                util_process.execute(
                    None,
                    'iptables -A FORWARD -o %(egress_nic)s '
                    '-i %(egress_bridge)s -j ACCEPT' % subst)
                util_process.execute(
                    None,
                    'iptables -A FORWARD -i %(egress_nic)s '
                    '-o %(egress_bridge)s -j ACCEPT' % subst)
                util_process.execute(
                    None,
                    'iptables -t nat -A POSTROUTING '
                    '-o %(egress_nic)s -j MASQUERADE' % subst)

    def _audit_daemons():
        """Start any daemon which is not running or whose pid has vanished."""
        running_daemons = list(DAEMON_PIDS.values())

        for d in DAEMON_IMPLEMENTATIONS:
            if d not in running_daemons:
                _start_daemon(d)

        # Iterate over a snapshot, as _start_daemon() adds to DAEMON_PIDS.
        for pid in list(DAEMON_PIDS):
            if not psutil.pid_exists(pid):
                # Forget the vanished pid, otherwise this daemon would be
                # started again on every single audit.
                name = DAEMON_PIDS.pop(pid)
                LOG.warning('%s pid is missing, restarting' % name)
                _start_daemon(name)

    _audit_daemons()
    restore_instances()

    # True until SIGTERM has been sent to the daemons during shutdown
    running = True
    while True:
        time.sleep(5)

        try:
            wpid, _ = os.waitpid(-1, os.WNOHANG)
            while wpid != 0:
                LOG.warning('%s exited (pid %d)'
                            % (DAEMON_PIDS.get(wpid, 'unknown'), wpid))
                DAEMON_PIDS.pop(wpid, None)
                wpid, _ = os.waitpid(-1, os.WNOHANG)

        except ChildProcessError:
            # We get this if there are no child processes
            pass

        n = Node.from_db(config.NODE_NAME)
        if n.state.value not in [Node.STATE_STOPPING, Node.STATE_STOPPED]:
            _audit_daemons()
            Node.observe_this_node()

        elif not DAEMON_PIDS:
            # Shutting down and every daemon has exited, we are done.
            n.state = Node.STATE_STOPPED
            return

        else:
            # Shutting down with daemons still alive. Ask them to stop, but
            # only once; later passes just wait for them to exit.
            if running:
                for pid in DAEMON_PIDS:
                    try:
                        os.kill(pid, signal.SIGTERM)
                        LOG.info('Sent SIGTERM to %s (pid %s)'
                                 % (DAEMON_PIDS.get(pid, 'unknown'), pid))
                    except OSError as e:
                        LOG.warning('Failed to send SIGTERM to %s: %s'
                                    % (pid, e))

                running = False