def reboot(job):
    import time
    import redis
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    token = get_jwt_token(job.service.aysrepo)
    job.context['token'] = token
    service = job.service
    service._recurring_tasks['monitor'].stop()

    try:
        start = time.time()
        # Make sure any running monitor action finishes before we reboot
        while time.time() < start + 60:
            if not j.core.jobcontroller.db.jobs.list(
                    actor='node.zero-os', action='monitor', state='running', service=service.name):
                break
            time.sleep(1)
        else:
            raise j.exceptions.RuntimeError(
                'Failed to reboot node. Waiting for monitoring action for too long')

        force_reboot = service.model.data.forceReboot
        vms = service.consumers.get('vm') or []
        for vm in vms:
            if vm.model.data.status != 'halted':
                if not force_reboot:
                    raise j.exceptions.RuntimeError(
                        'Failed to reboot node. Force reboot is not enabled and some vms are not halted')
                else:
                    vm.executeAction('shutdown', context=job.context)

        service.model.data.status = 'rebooting'
        job.logger.info('reboot node {}'.format(service))
        node = Node.from_ays(service, job.context['token'])
        node.client.raw('core.reboot', {})
    finally:
        start = time.time()
        while time.time() < start + 10:
            try:
                node = Node.from_ays(service, token, timeout=5)
                node.client.testConnectionAttempts = 0
                node.client.ping()
            except (RuntimeError, ConnectionError, redis.TimeoutError, TimeoutError):
                break
            time.sleep(1)
        else:
            job.logger.info("Could not wait within 10 seconds for node to reboot")
        service._recurring_tasks['monitor'].start()
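
# The reboot action above leans twice on Python's `while ... else`: the `else`
# body runs only when the loop exhausts its deadline without hitting `break`.
# A minimal, standalone sketch of that deadline-polling pattern (the helper
# name and its arguments are illustrative, not part of the orchestrator):
import time

def wait_until(predicate, timeout=10, interval=1):
    """Poll `predicate` until it returns True or `timeout` seconds elapse."""
    start = time.time()
    while time.time() < start + timeout:
        if predicate():
            break
        time.sleep(interval)
    else:
        # deadline expired without a break
        return False
    return True

print(wait_until(lambda: False, timeout=3))  # -> False after ~3 seconds
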
def create_from_template_container(job, parent):
    """
    Create the container that hosts the 0-disk tools for this vdisk
    if it does not exist yet, and return the Container SAL object.
    """
    from zeroos.orchestrator.configuration import get_configuration
    from zeroos.orchestrator.sal.Container import Container
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    job.context['token'] = get_jwt_token(job.service.aysrepo)
    container_name = 'vdisk_{}_{}'.format(job.service.name, parent.name)
    node = Node.from_ays(parent, job.context['token'])
    config = get_configuration(job.service.aysrepo)
    flist = config.get('0-disk-flist', 'https://hub.gig.tech/gig-official-apps/0-disk-master.flist')

    print("Creating container for flist: %s" % flist)
    container = Container(name=container_name, flist=flist, host_network=True, node=node)
    container.start()
    return container
def monitor(job):
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    service = job.service
    if service.model.actionsState['install'] == 'ok':
        pservice = service.parent
        token = get_jwt_token(job.service.aysrepo)
        node = Node.from_ays(pservice, token)
        try:
            pool = node.storagepools.get(service.name)
            if not pool.mountpoint:
                job.context['token'] = token
                install(job)

            devices, status = pool.get_devices_and_status()
            service.model.data.init('devices', len(devices))
            for i, device in enumerate(devices):
                service.model.data.devices[i] = device

            service.model.data.status = status
            service.saveAll()
        except ValueError:
            job.logger.error("pool %s doesn't exist, can't monitor pool", service.name)
def install(job):
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    job.context['token'] = get_jwt_token(job.service.aysrepo)

    # at each boot recreate the complete state in the system
    service = job.service
    node = Node.from_ays(service, get_jwt_token(job.service.aysrepo))
    get_version(job)
    job.logger.info('mount storage pool for fuse cache')
    poolname = '{}_fscache'.format(service.name)
    node.ensure_persistance(poolname)

    # Set host name
    node.client.system('hostname %s' % service.model.data.hostname).get()
    node.client.bash('echo %s > /etc/hostname' % service.model.data.hostname).get()

    job.logger.info('configure networks')
    for network in service.producers.get('network', []):
        network.executeAction('configure', args={'node_name': service.name})

    stats_collector_service = get_stats_collector(service)
    statsdb_service = get_statsdb(service)
    if stats_collector_service and statsdb_service and statsdb_service.model.data.status == 'running':
        stats_collector_service.executeAction('install', context=job.context)

    node.client.bash('modprobe ipmi_si && modprobe ipmi_devintf').get()
def nic_shutdown(job, message):
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    service = job.service
    node = Node.from_ays(service, get_jwt_token(service.aysrepo))
    interface = message['name']

    if interface.startswith('cont'):
        container_id = interface.split('-')[0].replace('cont', '')
        for container in node.containers.list():
            if str(container.id) == container_id:
                container_service = service.aysrepo.serviceGet(role='container', instance=container.name)
                container_service.model.data.status = 'networkKilled'
                container_service.saveAll()
                return
    else:
        vms = node.client.kvm.list()
        for vm in vms:
            if interface in vm['ifctargets']:
                vm_service = service.aysrepo.serviceGet(role='vm', instance=vm['name'])
                vm_service.model.data.status = 'networkKilled'
                vm_service.saveAll()
                return

    job.logger.info('Failed to find vm/container interface matching %s' % interface)
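
# nic_shutdown assumes container-owned interfaces are named 'cont<container-id>-<index>';
# the id is recovered with plain string operations. Illustrative value only:
interface = 'cont42-0'
container_id = interface.split('-')[0].replace('cont', '')
print(container_id)  # -> '42'
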
def install(job):
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    # at each boot recreate the complete state in the system
    service = job.service
    node = Node.from_ays(service, get_jwt_token(job.service.aysrepo))
    job.logger.info("mount storage pool for fuse cache")
    poolname = "{}_fscache".format(service.name)
    node.ensure_persistance(poolname)

    # Set host name
    node.client.system("hostname %s" % service.model.data.hostname).get()
    node.client.bash("echo %s > /etc/hostname" % service.model.data.hostname).get()

    job.logger.info("configure networks")
    for network in service.producers.get('network', []):
        job = network.getJob('configure', args={'node_name': service.name})
        j.tools.async.wrappers.sync(job.execute())

    stats_collector_service = get_stats_collector(service)
    statsdb_service = get_statsdb(service)
    if stats_collector_service and statsdb_service and statsdb_service.model.data.status == 'running':
        j.tools.async.wrappers.sync(stats_collector_service.executeAction('install', context=job.context))
def get_filesystem(job):
    from zeroos.orchestrator.sal.Node import Node

    nodeservice = job.service.parent.parent.parent
    poolname = job.service.parent.parent.name
    fsname = str(job.service.parent.model.data.name)
    node = Node.from_ays(nodeservice, job.context['token'])
    pool = node.storagepools.get(poolname)
    return pool.get(fsname)
def get_pool(job):
    from zeroos.orchestrator.configuration import get_jwt_token
    from zeroos.orchestrator.sal.Node import Node

    job.context['token'] = get_jwt_token(job.service.aysrepo)
    nodeservice = job.service.parent.parent
    poolname = job.service.parent.name
    node = Node.from_ays(nodeservice, job.context['token'])
    return node.storagepools.get(poolname)
def _nbd_url(job, container, nbdserver, vdisk):
    from zeroos.orchestrator.sal.Node import Node

    container_root = container.info['container']['root']
    node = Node.from_ays(nbdserver.parent.parent, password=job.context['token'])._client
    node.filesystem.mkdir("/var/run/nbd-servers/")
    endpoint = nbdserver.model.data.socketPath.lstrip('/')
    socket_path = j.sal.fs.joinPaths(container_root, endpoint)
    link = j.sal.fs.joinPaths("/var/run/nbd-servers/", endpoint)
    node.system("ln -s %s /var/run/nbd-servers/" % socket_path)
    return 'nbd+unix:///{id}?socket={socket}'.format(id=vdisk, socket=link)
def drop(job):
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    service = job.service
    job.context['token'] = get_jwt_token(job.service.aysrepo)
    node = Node.from_ays(service.parent, job.context['token'])
    if not node.client.nft.rule_exists(service.model.data.port):
        return
    node.client.nft.drop_port(service.model.data.port)
    service.model.data.status = "dropped"
    service.saveAll()
def install(job):
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    job.context['token'] = get_jwt_token(job.service.aysrepo)
    service = job.service
    pservice = service.parent

    node = Node.from_ays(pservice, job.context['token'])
    devices = [d.device for d in service.model.data.devices]
    name = service.name
    dataProfile = str(service.model.data.dataProfile)
    metadataProfile = str(service.model.data.metadataProfile)
    mountpoint = str(service.model.data.mountpoint) or None
    created = False

    try:
        pool = node.storagepools.get(name)
    except ValueError:
        # pool does not exist, let's create it
        pool = node.storagepools.create(name, devices, metadataProfile, dataProfile, overwrite=True)
        created = True

    # mount device
    # if the pool is already mounted and the user asked for a specific mountpoint, remount to the correct location
    if pool.mountpoint and mountpoint:
        if pool.mountpoint != mountpoint:
            pool.umount()
            pool.mount(mountpoint)
    # if the pool is already mounted and no specific mountpoint was asked, do nothing
    if pool.mountpoint and not mountpoint:
        pass
    # if the pool is not mounted and no mountpoint was specified, use the automatic mount
    elif not pool.mountpoint and not mountpoint:
        pool.mount()

    # let's check if devices need to be added or removed and the profiles still match
    if pool.fsinfo['data']['profile'].lower() != dataProfile:
        raise RuntimeError("Data profile of storagepool {} does not match".format(name))
    if pool.fsinfo['metadata']['profile'].lower() != metadataProfile:
        raise RuntimeError("Metadata profile of storagepool {} does not match".format(name))

    if not created:
        updateDevices(service, pool, devices)

    pool.ays.create(service.aysrepo)
def delete(job):
    from zeroos.orchestrator.sal.Node import Node

    service = job.service
    # Get node client
    node = Node.from_ays(service.parent, job.context['token'])

    if service.model.data.status == 'error':
        if service.name not in node.client.bridge.list():
            return

    node.client.bridge.delete(service.name)
    service.model.data.status = 'down'
def delete(job):
    from zeroos.orchestrator.sal.Node import Node

    service = job.service
    pservice = service.parent
    node = Node.from_ays(pservice, job.context['token'])
    name = service.name

    try:
        pool = node.storagepools.get(name)
        pool.delete()
    except ValueError:
        # pool does not exist, nothing to do
        pass
def get_version(job):
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    service = job.service
    if service.model.data.status != 'running':
        version = ''
    else:
        node = Node.from_ays(service, get_jwt_token(job.service.aysrepo))
        pong = node.client.ping()
        version = pong.split('Version: ')[1] if pong else ''

    service.model.data.version = version
    service.saveAll()
    return version
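
# get_version parses the version out of the ping reply. Assuming a reply shaped
# like 'PONG Version: <version info>' (the value below is made up):
pong = 'PONG Version: master @Revision: 0123456789abcdef'
version = pong.split('Version: ')[1] if pong else ''
print(version)  # -> 'master @Revision: 0123456789abcdef'
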
def monitor(job):
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token, get_configuration
    import redis

    service = job.service
    config = get_configuration(service.aysrepo)
    token = get_jwt_token(job.service.aysrepo)

    if service.model.actionsState['install'] != 'ok':
        return

    try:
        node = Node.from_ays(service, token, timeout=15)
        node.client.testConnectionAttempts = 0
        state = node.client.ping()
    except RuntimeError:
        state = False
    except redis.ConnectionError:
        state = False

    if state:
        service.model.data.status = 'running'
        configured = isConfigured(node, service.name)
        if not configured:
            job = service.getJob('install', args={})
            j.tools.async.wrappers.sync(job.execute())

        job.context['token'] = token
        stats_collector_service = get_stats_collector(service)
        statsdb_service = get_statsdb(service)

        # Check if statsdb is installed on this node and start it if needed
        if (statsdb_service and str(statsdb_service.parent) == str(job.service)
                and statsdb_service.model.data.status != 'running'):
            j.tools.async.wrappers.sync(statsdb_service.executeAction('start', context=job.context))

        # Check if there is a running statsdb and if so make sure stats_collector for this node is started
        if (stats_collector_service
                and stats_collector_service.model.data.status != 'running'
                and statsdb_service.model.data.status == 'running'):
            j.tools.async.wrappers.sync(stats_collector_service.executeAction('start', context=job.context))
    else:
        service.model.data.status = 'halted'

    flist = config.get('healthcheck-flist', 'https://hub.gig.tech/deboeckj/js9container.flist')
    with node.healthcheck.with_container(flist) as cont:
        update_healthcheck(service, node.healthcheck.run(cont, 'openfiledescriptors'))
    service.saveAll()
def init(job):
    import random
    from zeroos.orchestrator.sal.Node import Node

    service = job.service
    nodes = set()
    for node_service in service.producers['node']:
        nodes.add(Node.from_ays(node_service, job.context['token']))
    nodes = list(nodes)
    nodes = random.sample(nodes, service.model.data.size)

    etcd_actor = service.aysrepo.actorGet("etcd")
    container_actor = service.aysrepo.actorGet("container")

    etcd_args = {}
    peers = []
    for node in nodes:
        baseports, tcpservices = get_baseports(job, node, baseport=2379, nrports=2)
        containername = '{}_{}_{}_{}'.format(service.name, 'etcd', node.name, baseports[1])

        # create container
        args = {
            'node': node.name,
            'flist': 'https://hub.gig.tech/gig-official-apps/etcd-release-3.2.flist',
            'hostNetworking': True,
        }
        container_actor.serviceCreate(instance=containername, args=args)

        server_bind = '{}:{}'.format(node.storageAddr, baseports[1])
        client_bind = '{}:{}'.format(node.storageAddr, baseports[0])
        etcd_args[node.name] = {
            "serverBind": server_bind,
            "clientBind": client_bind,
            "container": containername,
            "tcps": tcpservices,
        }
        peers.append("{}_{}_{}=http://{}".format(service.name, node.name, baseports[1], server_bind))

    for k, v in etcd_args.items():
        tcps = v.pop("tcps")
        etcdname = "{}_{}_{}".format(service.name, k, tcps[1].model.data.port)
        v["peers"] = peers
        etcd_service = etcd_actor.serviceCreate(instance=etcdname, args=v)
        etcd_service.consume(tcps[0])
        etcd_service.consume(tcps[1])
        service.consume(etcd_service)
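
# Each etcd peer entry built in init() has the form
# '<cluster>_<node>_<serverport>=http://<storageAddr>:<serverport>'.
# Sketch with made-up values (not taken from a real deployment):
service_name, node_name, storage_addr = 'etcd_cluster', 'node1', '192.168.0.10'
baseports = [2379, 2380]  # [client port, server port]
server_bind = '{}:{}'.format(storage_addr, baseports[1])
peer = "{}_{}_{}=http://{}".format(service_name, node_name, baseports[1], server_bind)
print(peer)  # -> 'etcd_cluster_node1_2380=http://192.168.0.10:2380'
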
def get_baseports(job, node_service, node_sal, baseport, nrports, name=None):
    """
    Look for nrports free ports on node_service, starting from baseport.
    It returns 2 lists:
    - list of selected ports, [int]
    - list of tcp ays services, [Service]
    """
    service = job.service
    if node_sal is None:
        from zeroos.orchestrator.sal.Node import Node
        node_sal = Node.from_ays(node_service, job.context['token'])

    parent_str = "%s!%s" % (node_service.model.role, node_service.name)
    tcps = service.aysrepo.servicesFind(role='tcp', parent=parent_str)
    usedports = set()
    for tcp in tcps:
        usedports.add(tcp.model.data.port)

    freeports = []
    tcpactor = service.aysrepo.actorGet("tcp")
    tcpservices = []
    while True:
        if baseport not in usedports:
            port = node_sal.freeports(baseport=baseport, nrports=1)
            if not port:
                for ts in tcpservices:
                    ts.delete()
                return None
            baseport = port[0]

            args = {
                'node': node_service.name,
                'port': baseport,
            }
            tcp = 'tcp_{}_{}'.format(node_service.name, baseport)
            if name:
                tcp = '{}_{}_{}'.format(name, node_service.name, baseport)
            ts = tcpactor.serviceCreate(instance=tcp, args=args)

            # Check for race condition
            tcps = service.aysrepo.servicesFind(role='tcp', parent=parent_str)
            if len(tcps) > 1:
                ts.delete()
            else:
                tcpservices.append(ts)
                freeports.append(baseport)
                if len(freeports) >= nrports:
                    return freeports, tcpservices

        baseport += 1
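
# Stripped of the AYS bookkeeping (tcp services, the race-condition check and
# asking the node which ports are actually free), the selection loop in
# get_baseports reduces to this sketch (the function name is illustrative):
def pick_free_ports(usedports, baseport, nrports):
    freeports = []
    while len(freeports) < nrports:
        if baseport not in usedports:
            freeports.append(baseport)
        baseport += 1
    return freeports

print(pick_free_ports({2379, 2380}, baseport=2379, nrports=2))  # -> [2381, 2382]
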
def delete(job):
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    job.context['token'] = get_jwt_token(job.service.aysrepo)
    service = job.service
    pservice = service.parent
    node = Node.from_ays(pservice, job.context['token'])
    name = service.name

    try:
        pool = node.storagepools.get(name)
        pool.delete(zero=True)
    except ValueError:
        # pool does not exist, nothing to do
        pass
def monitor(job):
    from zeroos.orchestrator.configuration import get_jwt_token
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.sal.Container import Container

    service = job.service
    if service.model.actionsState['install'] != 'ok' or service.parent.model.data.status != 'running':
        return

    token = get_jwt_token(job.service.aysrepo)
    node = Node.from_ays(service.parent, token, timeout=5)
    if not node.is_configured():
        return

    container = Container.from_ays(job.service, token, logger=service.logger)
    running = container.is_running()
    if not running and service.model.data.status == 'running' and container.node.is_configured(service.parent.name):
        ovs_name = '{}_ovs'.format(container.node.name)
        if ovs_name != service.name:
            ovs_service = service.aysrepo.serviceGet(role='container', instance=ovs_name)
            ovs_container = Container.from_ays(ovs_service, token)
            if not ovs_container.is_running():
                job.logger.warning(
                    "Can't attempt to restart container {}, container {} is not running".format(
                        service.name, ovs_name))
        try:
            job.logger.warning("container {} not running, trying to restart".format(service.name))
            service.model.dbobj.state = 'error'
            container.start()

            if container.is_running():
                service.model.dbobj.state = 'ok'
        except:
            job.logger.error("can't restart container {} not running".format(service.name))
            service.model.dbobj.state = 'error'
    elif running and service.model.data.status == 'halted':
        try:
            job.logger.warning("container {} running, trying to stop".format(service.name))
            service.model.dbobj.state = 'error'
            container.stop()
            running, _ = container.is_running()

            if not running:
                service.model.dbobj.state = 'ok'
        except:
            job.logger.error("can't stop container {} is running".format(service.name))
            service.model.dbobj.state = 'error'
def monitor(job):
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    service = job.service
    if service.parent.model.data.status != 'running':
        return

    # Get node client
    token = get_jwt_token(job.service.aysrepo)
    node = Node.from_ays(service.parent, token)
    if service.model.data.status != 'up' or service.name in node.client.bridge.list():
        return

    job.context['token'] = token
    install(job)
def _nbd_url(job, container, nbdserver, vdisk):
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    job.context['token'] = get_jwt_token(job.service.aysrepo)
    container_root = container.info['container']['root']
    node = Node.from_ays(nbdserver.parent.parent, password=job.context['token']).client
    node.filesystem.mkdir("/var/run/nbd-servers/")
    endpoint = nbdserver.model.data.socketPath.lstrip('/')
    socket_path = j.sal.fs.joinPaths(container_root, endpoint)
    link = j.sal.fs.joinPaths("/var/run/nbd-servers/", endpoint)
    result = node.system("ln -sf %s /var/run/nbd-servers/" % socket_path).get()
    if result.state.upper() == "ERROR":
        raise RuntimeError(result.stderr)
    return 'nbd+unix:///{id}?socket={socket}'.format(id=vdisk, socket=link)
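
# For reference, the URL returned by _nbd_url has the following shape; the vdisk
# id and socket path below are hypothetical examples, not orchestrator values:
vdisk = 'vdisk1'
link = '/var/run/nbd-servers/nbd-socket/unix.socket'
print('nbd+unix:///{id}?socket={socket}'.format(id=vdisk, socket=link))
# -> nbd+unix:///vdisk1?socket=/var/run/nbd-servers/nbd-socket/unix.socket
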
def configure(job):
    """
    For packet.net we just rename the public interface to the storage (backplane)
    interface, so the rest of the network config can deal with it.
    This method is called from the node.zero-os install action.
    """
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    nodeservice = job.service.aysrepo.serviceGet(role='node', instance=job.model.args['node_name'])
    node = Node.from_ays(nodeservice, get_jwt_token(job.service.aysrepo))
    node.client.bash("""
pubint=$(ip route | grep default | awk '{print $5}')
ip link set dev $pubint down
ip link set dev $pubint name backplane
ip link set dev backplane up
udhcpc -i backplane -s /usr/share/udhcp/simple.script -q
""").get()
def reboot(job):
    from zeroos.orchestrator.sal.Node import Node

    service = job.service

    # Check if statsdb is installed on this node and stop it
    statsdb_service = get_statsdb(service)
    if statsdb_service and str(statsdb_service.parent) == str(job.service):
        j.tools.async.wrappers.sync(statsdb_service.executeAction('stop', context=job.context))

    # Check if stats_collector is installed on this node and stop it
    stats_collector_service = get_stats_collector(service)
    if stats_collector_service and stats_collector_service.model.data.status == 'running':
        j.tools.async.wrappers.sync(stats_collector_service.executeAction('stop', context=job.context))

    job.logger.info("reboot node {}".format(service))
    node = Node.from_ays(service, job.context['token'])
    node.client.raw('core.reboot', {})
def init(job):
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token
    import re

    service = job.service
    job.context['token'] = get_jwt_token(service.aysrepo)

    for nic in service.model.data.nics:
        if nic.hwaddr:
            pattern = re.compile(r'^(?:[0-9a-fA-F]{2}:){5}[0-9a-fA-F]{2}$')
            if not pattern.match(nic.hwaddr):
                raise j.exceptions.Input('Hwaddr: string is not a valid mac address.')
        if nic.type == 'vlan':
            break
    else:
        return

    node = Node.from_ays(service.parent, job.context['token'])
    ovs_container = node.client.container.find('ovs')
    if not ovs_container:
        raise j.exceptions.Input('OVS container needed to run this blueprint')
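
# The same MAC-address pattern used in init() above, shown on illustrative inputs:
import re

pattern = re.compile(r'^(?:[0-9a-fA-F]{2}:){5}[0-9a-fA-F]{2}$')
print(bool(pattern.match('52:54:00:12:34:56')))  # -> True
print(bool(pattern.match('52:54:00:12:34')))     # -> False (only five octets)
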
def processChange(job):
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token_from_job

    service = job.service
    if service.model.actionsState['install'] in ['new', 'scheduled']:
        return

    args = job.model.args
    category = args.pop('changeCategory')
    if category == "dataschema":
        pservice = service.parent
        node = Node.from_ays(pservice, get_jwt_token_from_job(job))
        try:
            pool = node.storagepools.get(service.name)
            devices = [d['device'] for d in args['devices']]
            updateDevices(service, pool, devices)
            pool.ays.create(service.aysrepo)
        except ValueError:
            job.logger.error("pool %s doesn't exist, can't update devices", service.name)
def get_templatecluster(job):
    from urllib.parse import urlparse
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    job.context['token'] = get_jwt_token(job.service.aysrepo)
    service = job.service

    template = urlparse(service.model.data.templateVdisk)
    if template.scheme == 'ardb' and template.netloc:
        return template.netloc

    node_srv = [node for node in service.aysrepo.servicesFind(role="node")
                if node.model.data.status != "halted"]
    if len(node_srv):
        node_srv = node_srv[0]
    else:
        raise RuntimeError("No running nodes found")

    node = Node.from_ays(node_srv, password=job.context['token'])
    conf = node.client.config.get()
    return urlparse(conf['globals']['storage']).netloc
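
# get_templatecluster only needs the scheme and netloc of the template vdisk URL.
# For an explicit ardb URL the short-circuit branch is taken (address made up):
from urllib.parse import urlparse

template = urlparse('ardb://192.168.0.10:16379')
print(template.scheme)  # -> 'ardb'
print(template.netloc)  # -> '192.168.0.10:16379'
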
def cleanupzerodisk(job):
    from zeroos.orchestrator.sal.Node import Node

    service = job.service
    node = Node.from_ays(service.parent, password=job.context['token'])

    for nbdserver in service.producers.get('nbdserver', []):
        job.logger.info("stop nbdserver for vm {}".format(service.name))
        # make sure the nbdserver is stopped
        j.tools.async.wrappers.sync(nbdserver.executeAction('stop', context=job.context))

    for tlogserver in service.producers.get('tlogserver', []):
        job.logger.info("stop tlogserver for vm {}".format(service.name))
        # make sure the tlogserver is stopped
        j.tools.async.wrappers.sync(tlogserver.executeAction('stop', context=job.context))

    job.logger.info("stop vdisks container for vm {}".format(service.name))
    try:
        container_name = 'vdisks_{}_{}'.format(service.name, service.parent.name)
        container = service.aysrepo.serviceGet(role='container', instance=container_name)
        j.tools.async.wrappers.sync(container.executeAction('stop', context=job.context))
        j.tools.async.wrappers.sync(container.delete())
    except j.exceptions.NotFound:
        job.logger.info("container doesn't exist.")

    service.model.data.status = 'halted'

    node = get_node(job)
    vnc = service.model.data.vnc
    if vnc != -1:
        node.client.nft.drop_port(vnc)
        service.model.data.vnc = -1

    service.saveAll()
def install(job):
    from zeroos.orchestrator.sal.Node import Node

    service = job.service
    # Get g8core client
    node = Node.from_ays(service.parent, job.context['token'])

    # Create bridge
    network = None if str(service.model.data.networkMode) == "none" else str(service.model.data.networkMode)
    try:
        node.client.bridge.create(
            service.name,
            hwaddr=service.model.data.hwaddr or None,
            network=network,
            nat=service.model.data.nat,
            settings=service.model.data.setting.to_dict())
    except RuntimeError as e:
        service.model.data.status = 'error'
        raise e

    service.model.data.status = 'up'
def init(job):
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.orchestrator.configuration import get_jwt_token

    service = job.service
    node = Node.from_ays(service, get_jwt_token(service.aysrepo))
    job.logger.info("create storage pool for fuse cache")
    poolname = "{}_fscache".format(service.name)
    storagepool = node.ensure_persistance(poolname)
    storagepool.ays.create(service.aysrepo)

    statsdb_service = get_statsdb(service)
    if statsdb_service:
        stats_collector_actor = service.aysrepo.actorGet('stats_collector')
        args = {
            'node': service.name,
            'port': statsdb_service.model.data.port,
            'ip': statsdb_service.parent.model.data.redisAddr,
        }
        stats_collector_service = stats_collector_actor.serviceCreate(instance=service.name, args=args)
        stats_collector_service.consume(service)
def cleanupzerodisk(job):
    from zeroos.orchestrator.configuration import get_jwt_token
    from zeroos.orchestrator.sal.Node import Node
    from zeroos.core0.client import ResultError

    job.context['token'] = get_jwt_token(job.service.aysrepo)
    service = job.service
    node = Node.from_ays(service.parent, password=job.context['token'])

    for nbdserver_service in service.producers.get('nbdserver', []):
        job.logger.info("stop nbdserver for vm {}".format(service.name))
        # make sure the nbdserver is stopped
        nbdserver_service.executeAction('stop', context=job.context)
        # make sure the container is stopped
        nbdserver_service.parent.executeAction('stop', context=job.context)

    for tlogserver in service.producers.get('tlogserver', []):
        job.logger.info("stop tlogserver for vm {}".format(service.name))
        # make sure the tlogserver is stopped
        tlogserver.executeAction('stop', context=job.context)
        # make sure the container is stopped
        tlogserver.parent.executeAction('stop', context=job.context)

    service.model.data.status = 'halted'

    node = get_node(job)
    vnc = service.model.data.vnc
    if vnc != -1:
        try:
            node.client.nft.drop_port(vnc)
        except ResultError as e:
            if e.message != '"rule does not exist"':
                raise e
        service.model.data.vnc = -1

    service.saveAll()