def get_instance_console(vm):
    """Build VNC console info for *vm* directly from its Ganeti instance.

    RAPI GetInstanceConsole() returns endpoints bound to vnc_bind_address,
    which is a cluster-wide setting (either 0.0.0.0 or 127.0.0.1) and
    pretty useless (see #783), so construct the reply here instead.

    WARNING: This assumes that VNC runs on port network_port on the
    instance's primary node, and is probably hypervisor-specific.
    """
    log.debug("Getting console for vm %s", vm)
    with pooled_rapi_client(vm) as client:
        info = client.GetInstance(vm.backend_vm_id)
    if vm.backend.hypervisor == "kvm" and info['hvparams']['serial_console']:
        raise Exception("hv parameter serial_console cannot be true")
    return {'kind': 'vnc',
            'host': info['pnode'],
            'port': info['network_port']}
def get_ganeti_nodes(backend=None, bulk=False):
    """Return the node records of one backend, or of all backends.

    :param backend: restrict to a single backend; None means all.
    :param bulk: pass bulk=True to the Ganeti RAPI for full node records.
    :returns: a single flat list of node records.
    """
    all_nodes = []
    for bend in get_backends(backend):
        with pooled_rapi_client(bend) as client:
            all_nodes.extend(client.GetNodes(bulk=bulk))
    return all_nodes
def connect_to_network(vm, nic):
    """Attach NIC *nic* of *vm* to its network on the Ganeti backend.

    Makes sure the network is active on the backend first and chains the
    ModifyInstance job on any activation jobs.
    """
    network = nic.network
    backend = vm.backend
    bnet, depend_jobs = ensure_network_is_active(backend, network.id)
    depends = create_job_dependencies(depend_jobs)
    # Ganeti NIC specification; do not shadow the model `nic` argument.
    nic_spec = {'name': nic.backend_uuid,
                'network': network.backend_id,
                'ip': nic.ipv4_address}
    log.debug("Adding NIC %s to VM %s", nic_spec, vm)
    kwargs = {"instance": vm.backend_vm_id,
              "nics": [("add", "-1", nic_spec)],
              "depends": depends}
    if vm.backend.use_hotplug():
        kwargs["hotplug_if_possible"] = True
    if settings.TEST:
        kwargs["dry_run"] = True
    with pooled_rapi_client(vm) as client:
        return client.ModifyInstance(**kwargs)
def _create_network(network, backend):
    """Create *network* on *backend* through the Ganeti RAPI.

    Raises Exception when no BackendNetwork row exists for the pair.
    """
    network_type = 'public' if network.public else 'private'
    tags = network.backend_tag
    if network.dhcp:
        tags.append('nfdhcpd')

    # IP-conflict checking only makes sense for public networks.
    conflicts_check = True if network.public else False

    try:
        bnet = BackendNetwork.objects.get(network=network, backend=backend)
        mac_prefix = bnet.mac_prefix
    except BackendNetwork.DoesNotExist:
        raise Exception("BackendNetwork for network '%s' in backend '%s'"
                        " does not exist" % (network.id, backend.id))

    with pooled_rapi_client(backend) as client:
        return client.CreateNetwork(network_name=network.backend_id,
                                    network=network.subnet,
                                    network6=network.subnet6,
                                    gateway=network.gateway,
                                    gateway6=network.gateway6,
                                    network_type=network_type,
                                    mac_prefix=mac_prefix,
                                    conflicts_check=conflicts_check,
                                    tags=tags)
def stale_servers_in_db(D, G):
    """Return server IDs present in the DB map *D* but missing from Ganeti.

    :param D: dict mapping server id -> DB operstate.
    :param G: dict of server ids known to Ganeti.

    Servers in BUILD state are only reported stale when their creation job
    has finished and the instance still cannot be found in Ganeti.
    """
    stale = set()
    for vm_id in set(D) - set(G):
        if D[vm_id] != 'BUILD':
            stale.add(vm_id)
            continue
        # A BUILDing server may simply not have reached Ganeti yet.
        vm = VirtualMachine.objects.get(id=vm_id)
        if not needs_reconciliation(vm):
            continue
        with pooled_rapi_client(vm) as c:
            try:
                job_status = c.GetJobStatus(vm.backendjobid)['status']
                if job_status not in ('queued', 'waiting', 'running'):
                    # Creation job finished: the instance must exist by now
                    # (raises GanetiApiError otherwise).
                    c.GetInstance(utils.id_to_instance_name(vm_id))
                # else: server is still building in Ganeti — not stale.
            except GanetiApiError:
                stale.add(vm_id)
    return stale
def job_is_still_running(vm):
    """Return True while the VM's backend job has not reached a final state.

    A RAPI error (e.g. unknown job) is treated as "not running".
    """
    with pooled_rapi_client(vm) as c:
        try:
            status = c.GetJobStatus(vm.backendjobid)["status"]
            return status not in JOB_STATUS_FINALIZED
        except GanetiApiError:
            return False
def set_firewall_profile(vm, profile):
    """Set firewall *profile* on *vm* by rewriting its Ganeti firewall tags.

    Also maintains the per-NIC variant of each tag (the "0" placeholder
    replaced by the public NIC's backend uuid) when a public NIC exists.

    :raises ValueError: for an unknown profile.
    """
    try:
        tag = _firewall_tags[profile]
    except KeyError:
        # Fixed typo in the error message ("Unsopported" -> "Unsupported").
        raise ValueError("Unsupported Firewall Profile: %s" % profile)

    try:
        public_nic = vm.nics.filter(network__public=True)[0]
    except IndexError:
        public_nic = None

    log.debug("Setting tag of VM %s to %s", vm, profile)

    with pooled_rapi_client(vm) as client:
        # Delete all firewall tags (and their per-NIC variants)
        for t in _firewall_tags.values():
            client.DeleteInstanceTags(vm.backend_vm_id, [t],
                                      dry_run=settings.TEST)
            if public_nic is not None:
                tag_with_name = t.replace("0", public_nic.backend_uuid)
                client.DeleteInstanceTags(vm.backend_vm_id, [tag_with_name],
                                          dry_run=settings.TEST)

        client.AddInstanceTags(vm.backend_vm_id, [tag], dry_run=settings.TEST)
        if public_nic is not None:
            tag_with_name = tag.replace("0", public_nic.backend_uuid)
            client.AddInstanceTags(vm.backend_vm_id, [tag_with_name],
                                   dry_run=settings.TEST)

        # XXX NOP ModifyInstance call to force process_net_status to run
        # on the dispatcher
        os_name = settings.GANETI_CREATEINSTANCE_KWARGS['os']
        client.ModifyInstance(vm.backend_vm_id, os_name=os_name)
def update_db(vm, msg, event_time):
    """Process a notification of type 'ganeti-op-status'

    Normally delegates to backend_mod.process_op_status(). The one special
    case handled here is a failed OP_INSTANCE_CREATE with opportunistic
    locking: the creation is retried once with opportunistic locking off.
    """
    log.debug("Processing ganeti-op-status msg: %s", msg)
    if msg['type'] != "ganeti-op-status":
        log.error("Message is of unknown type %s.", msg['type'])
        return

    operation = msg["operation"]
    status = msg["status"]
    jobID = msg["jobId"]
    logmsg = msg["logmsg"]
    nics = msg.get("instance_nics", None)
    disks = msg.get("instance_disks", None)
    job_fields = msg.get("job_fields", {})
    result = msg.get("result", [])

    # Special case: OP_INSTANCE_CREATE with opportunistic locking may fail
    # if all Ganeti nodes are already locked. Retry the job without
    # opportunistic locking..
    if (operation == "OP_INSTANCE_CREATE" and status == "error" and
       job_fields.get("opportunistic_locking", False)):
        # NOTE(review): result[1][1] presumably holds the Ganeti error code
        # for this op — confirm against the eventd message format.
        try:
            error_code = result[1][1]
        except IndexError:
            error_code = None
        if error_code == rapi.ECODE_TEMP_NORES:
            if vm.backendjobid != jobID:
                # The job has already been retried!
                return
            # Remove extra fields
            [job_fields.pop(f, None) for f in ("OP_ID", "reason")]
            # Remove 'pnode' and 'snode' if they were set by Ganeti iallocator.
            # Ganeti will fail if both allocator and nodes are specified.
            allocator = job_fields.pop("iallocator", None)
            if allocator is not None:
                [job_fields.pop(f, None) for f in ("pnode", "snode")]
            name = job_fields.pop("name",
                                  job_fields.pop("instance_name", None))
            # Turn off opportunistic locking before retrying the job
            job_fields["opportunistic_locking"] = False
            with pooled_rapi_client(vm) as c:
                jobID = c.CreateInstance(name=name, **job_fields)
            # Update the VM fields
            vm.backendjobid = jobID
            # Update the task_job_id for commissions
            vm.task_job_id = jobID
            vm.backendjobstatus = None
            vm.save()
            log.info("Retrying failed creation of instance '%s' without"
                     " opportunistic locking. New job ID: '%s'", name, jobID)
            return

    # Common path: let the backend module reconcile the DB with this event.
    backend_mod.process_op_status(vm, event_time, jobID, operation, status,
                                  logmsg, nics=nics, disks=disks,
                                  job_fields=job_fields)
    log.debug("Done processing ganeti-op-status msg for vm %s.",
              msg['instance'])
def connect_to_network(vm, nic):
    """Add *nic* to *vm*, activating the network on the backend first.

    If the backend network is not ACTIVE yet, creation/connection jobs are
    submitted and the NIC-add job is made to depend on them.
    """
    backend = vm.backend
    address = nic.ipv4
    # Lock the network row while we inspect/alter its backend state.
    network = Network.objects.select_for_update().get(id=nic.network.id)
    bnet, created = BackendNetwork.objects.get_or_create(backend=backend,
                                                         network=network)
    depend_jobs = []
    if bnet.operstate != "ACTIVE":
        depend_jobs = create_network(network, backend, connect=True)
    depends = [[job, ["success", "error", "canceled"]]
               for job in depend_jobs]

    nic_spec = {'ip': address,
                'network': network.backend_id,
                'name': nic.backend_uuid}
    log.debug("Connecting vm %s to network %s(%s)", vm, network, address)

    kwargs = {"nics": [('add', "-1", nic_spec)],
              "depends": depends,
              "dry_run": settings.TEST}
    if vm.backend.use_hotplug():
        kwargs["hotplug_if_possible"] = True
    with pooled_rapi_client(vm) as client:
        return client.ModifyInstance(vm.backend_vm_id, **kwargs)
def set_firewall_profile(vm, profile, nic):
    """Set firewall *profile* on *nic* of *vm* via per-NIC instance tags.

    Deletes any previous firewall tags for this NIC, then adds the tag for
    the requested profile (unless the profile is "DISABLED", which only
    clears tags).

    :raises ValueError: for an unknown profile.
    """
    uuid = nic.backend_uuid
    try:
        tag = _firewall_tags[profile] % uuid
    except KeyError:
        # Fixed typo in the error message ("Unsopported" -> "Unsupported").
        raise ValueError("Unsupported Firewall Profile: %s" % profile)

    log.debug("Setting tag of VM %s, NIC %s, to %s", vm, nic, profile)

    with pooled_rapi_client(vm) as client:
        # Delete previous firewall tags
        old_tags = client.GetInstanceTags(vm.backend_vm_id)
        delete_tags = [(t % uuid) for t in _firewall_tags.values()
                       if (t % uuid) in old_tags]
        if delete_tags:
            client.DeleteInstanceTags(vm.backend_vm_id, delete_tags,
                                      dry_run=settings.TEST)

        if profile != "DISABLED":
            client.AddInstanceTags(vm.backend_vm_id, [tag],
                                   dry_run=settings.TEST)

        # XXX NOP ModifyInstance call to force process_net_status to run
        # on the dispatcher
        os_name = settings.GANETI_CREATEINSTANCE_KWARGS['os']
        client.ModifyInstance(vm.backend_vm_id, os_name=os_name)
    return None
def create_instance(vm, public_nic, flavor, image): """`image` is a dictionary which should contain the keys: 'backend_id', 'format' and 'metadata' metadata value should be a dictionary. """ # Handle arguments to CreateInstance() as a dictionary, # initialize it based on a deployment-specific value. # This enables the administrator to override deployment-specific # arguments, such as the disk template to use, name of os provider # and hypervisor-specific parameters at will (see Synnefo #785, #835). # kw = vm.backend.get_create_params() kw['mode'] = 'create' kw['name'] = vm.backend_vm_id # Defined in settings.GANETI_CREATEINSTANCE_KWARGS kw['disk_template'] = flavor.disk_template kw['disks'] = [{"size": flavor.disk * 1024}] provider = flavor.disk_provider if provider: kw['disks'][0]['provider'] = provider kw['disks'][0]['origin'] = flavor.disk_origin kw['nics'] = [{"name": public_nic.backend_uuid, "network": public_nic.network.backend_id, "ip": public_nic.ipv4}] # Defined in settings.GANETI_CREATEINSTANCE_KWARGS # kw['os'] = settings.GANETI_OS_PROVIDER kw['ip_check'] = False kw['name_check'] = False # Do not specific a node explicitly, have # Ganeti use an iallocator instead #kw['pnode'] = rapi.GetNodes()[0] kw['dry_run'] = settings.TEST kw['beparams'] = { 'auto_balance': True, 'vcpus': flavor.cpu, 'memory': flavor.ram} kw['osparams'] = { 'config_url': vm.config_url, # Store image id and format to Ganeti 'img_id': image['backend_id'], 'img_format': image['format']} # Use opportunistic locking kw['opportunistic_locking'] = settings.GANETI_USE_OPPORTUNISTIC_LOCKING # Defined in settings.GANETI_CREATEINSTANCE_KWARGS # kw['hvparams'] = dict(serial_console=False) log.debug("Creating instance %s", utils.hide_pass(kw)) with pooled_rapi_client(vm) as client: return client.CreateInstance(**kw)
def disconnect_from_network(vm, nic):
    """Remove the NIC at nic.index from *vm* on the Ganeti backend."""
    log.debug("Removing nic of VM %s, with index %s", vm, str(nic.index))
    remove_op = [('remove', nic.index, {})]
    with pooled_rapi_client(vm) as client:
        return client.ModifyInstance(vm.backend_vm_id,
                                     nics=remove_op,
                                     hotplug=vm.backend.use_hotplug(),
                                     dry_run=settings.TEST)
def job_is_still_running(vm, job_id=None):
    """Return True while the given (or the VM's last) backend job is
    unfinished.

    A RAPI error (e.g. unknown job) is treated as "not running".
    """
    with pooled_rapi_client(vm) as c:
        try:
            target_job = vm.backendjobid if job_id is None else job_id
            status = c.GetJobStatus(target_job)["status"]
            return status not in rapi.JOB_STATUS_FINALIZED
        except rapi.GanetiApiError:
            return False
def connect_network_synced(network, backend):
    """Connect *network* to every nodegroup of *backend*, waiting on each job.

    :returns: the (status, result) of the first failed job, or of the last
        job when all succeed. Returns None when the backend has no
        nodegroups (previously this raised NameError on `result`).
    """
    # Initialize so the final `return` is safe when GetGroups() is empty.
    result = None
    with pooled_rapi_client(backend) as client:
        for group in client.GetGroups():
            job = client.ConnectNetwork(network.backend_id, group,
                                        network.mode, network.link)
            result = wait_for_job(client, job)
            if result[0] != rapi.JOB_STATUS_SUCCESS:
                # Stop at the first failing group.
                return result
    return result
def disconnect_network(network, backend, group=None):
    """Disconnect *network* from one nodegroup of *backend*, or from all.

    :returns: the list of Ganeti job IDs.
    """
    log.debug("Disconnecting network %s to backend %s", network, backend)
    with pooled_rapi_client(backend) as client:
        targets = [group] if group is not None else client.GetGroups()
        return [client.DisconnectNetwork(network.backend_id, g)
                for g in targets]
def get_networks_from_ganeti(backend):
    """Return the Synnefo-owned networks of *backend*, keyed by Synnefo ID.

    Only Ganeti networks whose name carries the Synnefo backend prefix are
    included.
    """
    prefix = settings.BACKEND_PREFIX_ID + 'net-'

    networks = {}
    with pooled_rapi_client(backend) as c:
        for net in c.GetNetworks(bulk=True):
            if net['name'].startswith(prefix):
                # Renamed from `id` to avoid shadowing the builtin.
                net_id = utils.id_from_network_name(net['name'])
                networks[net_id] = net
    return networks
def get_memory_from_instances(backend):
    """Get the memory that is used from instances.

    Get the used memory of a backend. Note: This is different for the real
    memory used, due to kvm's memory de-duplication.
    """
    with pooled_rapi_client(backend) as client:
        instances = client.GetInstances(bulk=True)
    # Sum the operational RAM reported by Ganeti for each instance.
    return sum(i['oper_ram'] for i in instances)
def _create_network(network, backend):
    """Create a network.

    Collects subnet/gateway info from the network's IPv4 and IPv6 subnets
    and creates the network on *backend* via the Ganeti RAPI.
    """
    tags = network.backend_tag

    subnet = None
    subnet6 = None
    gateway = None
    gateway6 = None
    for _subnet in network.subnets.all():
        # Any DHCP-enabled subnet makes the network nfdhcpd-managed.
        if _subnet.dhcp and not "nfdhcpd" in tags:
            tags.append("nfdhcpd")
        if _subnet.ipversion == 4:
            subnet = _subnet.cidr
            gateway = _subnet.gateway
        elif _subnet.ipversion == 6:
            subnet6 = _subnet.cidr
            gateway6 = _subnet.gateway

    # Only public networks with an IPv4 subnet get IP-conflict checking.
    conflicts_check = False
    if network.public:
        tags.append('public')
        if subnet is not None:
            conflicts_check = True
    else:
        tags.append('private')

    # Use a dummy network subnet for IPv6 only networks. Currently Ganeti
    # does not support IPv6 only networks. To bypass this limitation, we
    # create the network with a dummy network subnet, and make Cyclades
    # connect instances to such networks, with address=None.
    if subnet is None:
        subnet = "10.0.0.0/29"

    try:
        bn = BackendNetwork.objects.get(network=network, backend=backend)
        mac_prefix = bn.mac_prefix
    except BackendNetwork.DoesNotExist:
        raise Exception("BackendNetwork for network '%s' in backend '%s'"
                        " does not exist" % (network.id, backend.id))

    with pooled_rapi_client(backend) as client:
        return client.CreateNetwork(network_name=network.backend_id,
                                    network=subnet,
                                    network6=subnet6,
                                    gateway=gateway,
                                    gateway6=gateway6,
                                    mac_prefix=mac_prefix,
                                    conflicts_check=conflicts_check,
                                    tags=tags)
def disconnect_from_network(vm, nic):
    """Remove the NIC at nic.index from *vm*, hotplugging when possible."""
    log.debug("Removing nic of VM %s, with index %s", vm, str(nic.index))
    kwargs = {"nics": [('remove', str(nic.index), {})],
              "dry_run": settings.TEST}
    if vm.backend.use_hotplug():
        kwargs["hotplug_if_possible"] = True
    with pooled_rapi_client(vm) as client:
        return client.ModifyInstance(vm.backend_vm_id, **kwargs)
def handle(self, *args, **options):
    """Modify a network and push reserved-IP changes to its backends.

    Expects exactly one positional argument: the network ID.
    """
    if len(args) != 1:
        raise CommandError("Please provide a network ID")

    network = get_network(args[0])

    # Validate subnet
    if options.get('subnet'):
        validate_network_info(options)

    # Validate state
    state = options.get('state')
    if state:
        allowed = [x[0] for x in Network.OPER_STATES]
        if state not in allowed:
            msg = "Invalid state, must be one of %s" % ', '.join(allowed)
            raise CommandError(msg)

    # Normalize boolean-ish CLI strings before the generic copy below.
    dhcp = options.get("dhcp")
    if dhcp:
        options["dhcp"] = parse_bool(dhcp)
    drained = options.get("drained")
    if drained:
        options["drained"] = parse_bool(drained)
    # Copy every provided option straight onto the model instance.
    fields = ('name', 'userid', 'subnet', 'gateway', 'subnet6', 'gateway6',
              'dhcp', 'state', 'link', 'mac_prefix', 'drained')
    for field in fields:
        value = options.get(field, None)
        if value is not None:
            network.__setattr__(field, value)

    # Reserved-IP changes are comma-separated lists and must be applied on
    # every Ganeti backend the network lives on.
    add_reserved_ips = options.get('add_reserved_ips')
    remove_reserved_ips = options.get('remove_reserved_ips')
    if add_reserved_ips or remove_reserved_ips:
        if add_reserved_ips:
            add_reserved_ips = add_reserved_ips.split(",")
        if remove_reserved_ips:
            remove_reserved_ips = remove_reserved_ips.split(",")
        for bnetwork in network.backend_networks.all():
            with pooled_rapi_client(bnetwork.backend) as c:
                c.ModifyNetwork(network=network.backend_id,
                                add_reserved_ips=add_reserved_ips,
                                remove_reserved_ips=remove_reserved_ips)

    network.save()
def get_available_disk_templates(backend):
    """Get the list of available disk templates of a Ganeti backend.

    The list contains the disk templates that are enabled in the Ganeti
    backend and also included in ipolicy-disk-templates.
    """
    with pooled_rapi_client(backend) as c:
        info = c.GetInfo()
    ipolicy_templates = info["ipolicy"]["disk-templates"]
    if "enabled_disk_templates" not in info:
        # Ganeti < 2.8 does not have 'enabled_disk_templates'
        return ipolicy_templates
    return [tmpl for tmpl in info["enabled_disk_templates"]
            if tmpl in ipolicy_templates]
def reboot_instance(vm, reboot_type, shutdown_timeout=None):
    """Reboot *vm*; *reboot_type* is the OS API's 'soft' or 'hard'.

    The Ganeti reboot type is always "hard"; the OS API soft/hard
    distinction maps to Ganeti's 'shutdown_timeout' instead: a hard reboot
    gives the instance no grace period at all.

    Note: 'shutdown_timeout' is only supported by snf-ganeti >= 2.8.2 and
    Ganeti > 2.10; other versions ignore it and fall back to Ganeti's
    default timeout (120s).
    """
    assert reboot_type in ('soft', 'hard')
    kwargs = {"instance": vm.backend_vm_id, "reboot_type": "hard"}
    timeout = 0 if reboot_type == "hard" else shutdown_timeout
    if timeout is not None:
        kwargs["shutdown_timeout"] = timeout
    if settings.TEST:
        kwargs["dry_run"] = True
    with pooled_rapi_client(vm) as client:
        return client.RebootInstance(**kwargs)
def get_available_disk_templates(backend):
    """Get the list of available disk templates of a Ganeti backend.

    The list contains the disk templates that are enabled in the Ganeti
    backend and also included in ipolicy-disk-templates.
    """
    with pooled_rapi_client(backend) as c:
        info = c.GetInfo()
    allowed = info["ipolicy"]["disk-templates"]
    try:
        enabled = info["enabled_disk_templates"]
    except KeyError:
        # Ganeti < 2.8 does not have 'enabled_disk_templates'
        return allowed
    return [template for template in enabled if template in allowed]
def hanging_networks(backend, GNets):
    """Get networks that are not connected to all Nodegroups."""
    def connected_groups(group_list):
        # Each entry is a (name, mode, link) tuple; only names matter here.
        return set(name for (name, mode, link) in group_list)

    with pooled_rapi_client(backend) as c:
        all_groups = set(c.GetGroups())

    hanging = {}
    for net_id, info in GNets.items():
        connected = connected_groups(info['group_list'])
        if connected != all_groups:
            hanging[net_id] = all_groups - connected
    return hanging
def connect_network(network, backend, depends=None, group=None):
    """Connect a network to nodegroups.

    :param depends: job IDs the connect jobs should depend on.
    :param group: connect to this single nodegroup only; default is all.
    :returns: the list of Ganeti job IDs.
    """
    log.debug("Connecting network %s to backend %s", network, backend)

    # Avoid the mutable-default-argument pitfall (was `depends=[]`).
    if depends is None:
        depends = []

    conflicts_check = False
    if network.public and (network.subnet4 is not None):
        conflicts_check = True

    depends = create_job_dependencies(depends)
    with pooled_rapi_client(backend) as client:
        groups = [group] if group is not None else client.GetGroups()
        job_ids = []
        for group in groups:
            job_id = client.ConnectNetwork(network.backend_id, group,
                                           network.mode, network.link,
                                           conflicts_check,
                                           depends=depends)
            job_ids.append(job_id)
    return job_ids
def pprint_network_in_ganeti(network, stdout=None):
    """Print the state of *network* in every online Ganeti backend."""
    stdout = sys.stdout if stdout is None else stdout
    for backend in Backend.objects.exclude(offline=True):
        with pooled_rapi_client(backend) as client:
            try:
                ganeti_net = client.GetNetwork(network.backend_id)
                ip_map = ganeti_net.pop("map", {})
                pprint_table(stdout, ganeti_net.items(), None,
                             title="State of network in backend: %s"
                                   % backend.clustername)
                if network.subnet4 is not None:
                    pprint_pool(None, ip_map, 80, stdout)
            except GanetiApiError as err:
                # 404 simply means the network was never created there.
                if err.code != 404:
                    raise err
                stdout.write('Network does not exist in backend %s\n'
                             % backend.clustername)
def connect_to_network(vm, network, address=None):
    """Add a NIC on *network* (optionally with IP *address*) to *vm*.

    Creates/connects the network on the VM's backend first when it is not
    ACTIVE there, chaining the NIC-add job on those jobs.
    """
    backend = vm.backend
    # Lock the network row while inspecting/altering its backend state.
    network = Network.objects.select_for_update().get(id=network.id)
    bnet, created = BackendNetwork.objects.get_or_create(backend=backend,
                                                         network=network)
    depend_jobs = []
    if bnet.operstate != "ACTIVE":
        depend_jobs = create_network(network, backend, connect=True)
    depends = [[job, ["success", "error", "canceled"]]
               for job in depend_jobs]

    log.debug("Connecting vm %s to network %s(%s)", vm, network, address)
    nic_spec = {'ip': address, 'network': network.backend_id}
    with pooled_rapi_client(vm) as client:
        return client.ModifyInstance(vm.backend_vm_id,
                                     nics=[('add', nic_spec)],
                                     hotplug=vm.backend.use_hotplug(),
                                     depends=depends,
                                     dry_run=settings.TEST)
def connect_network(network, backend, depends=None, group=None):
    """Connect a network to nodegroups.

    :param depends: job IDs the connect jobs should depend on.
    :param group: connect to this single nodegroup only; default is all.
    :returns: the list of Ganeti job IDs.
    """
    log.debug("Connecting network %s to backend %s", network, backend)

    # Avoid the mutable-default-argument pitfall (was `depends=[]`).
    if depends is None:
        depends = []

    if network.public:
        conflicts_check = True
    else:
        conflicts_check = False

    depends = [[job, ["success", "error", "canceled"]] for job in depends]
    with pooled_rapi_client(backend) as client:
        groups = [group] if group is not None else client.GetGroups()
        job_ids = []
        for group in groups:
            job_id = client.ConnectNetwork(network.backend_id, group,
                                           network.mode, network.link,
                                           conflicts_check,
                                           depends=depends)
            job_ids.append(job_id)
    return job_ids
def disconnect_from_network(vm, nic):
    """Remove *nic* from *vm* and drop its firewall tag, if any.

    :returns: the Ganeti job ID of the ModifyInstance call.
    """
    log.debug("Removing NIC %s of VM %s", nic, vm)
    mod_kwargs = {"instance": vm.backend_vm_id,
                  "nics": [("remove", nic.backend_uuid, {})]}
    if vm.backend.use_hotplug():
        mod_kwargs["hotplug_if_possible"] = True
    if settings.TEST:
        mod_kwargs["dry_run"] = True

    with pooled_rapi_client(vm) as client:
        job_id = client.ModifyInstance(**mod_kwargs)
        profile = nic.firewall_profile
        if profile and profile != "DISABLED":
            # Clean up the per-NIC firewall tag that belonged to this NIC.
            tag = _firewall_tags[profile] % nic.backend_uuid
            client.DeleteInstanceTags(vm.backend_vm_id, [tag],
                                      dry_run=settings.TEST)
        return job_id
def set_firewall_profile(vm, profile):
    """Set firewall *profile* on *vm* by rewriting its Ganeti firewall tags.

    :raises ValueError: for an unknown profile.
    """
    try:
        tag = _firewall_tags[profile]
    except KeyError:
        # Fixed typo in the error message ("Unsopported" -> "Unsupported").
        raise ValueError("Unsupported Firewall Profile: %s" % profile)

    log.debug("Setting tag of VM %s to %s", vm, profile)

    with pooled_rapi_client(vm) as client:
        # Delete all firewall tags
        for t in _firewall_tags.values():
            client.DeleteInstanceTags(vm.backend_vm_id, [t],
                                      dry_run=settings.TEST)

        client.AddInstanceTags(vm.backend_vm_id, [tag], dry_run=settings.TEST)

        # XXX NOP ModifyInstance call to force process_net_status to run
        # on the dispatcher
        os_name = settings.GANETI_CREATEINSTANCE_KWARGS['os']
        client.ModifyInstance(vm.backend_vm_id, os_name=os_name)
def pprint_network_in_ganeti(network, stdout=None):
    """Print the state of *network* in every online Ganeti backend."""
    if stdout is None:
        stdout = sys.stdout
    for backend in Backend.objects.exclude(offline=True):
        with pooled_rapi_client(backend) as client:
            try:
                g_net = client.GetNetwork(network.backend_id)
                # Use a default so a reply without a "map" entry does not
                # raise KeyError (matches the sibling implementation).
                ip_map = g_net.pop("map", {})
                pprint_table(stdout, g_net.items(), None,
                             title="State of network in backend: %s" %
                                   backend.clustername)
                if network.subnet4 is not None:
                    pprint_pool(None, ip_map, 80, stdout)
            except GanetiApiError as e:
                # 404 simply means the network was never created there.
                if e.code == 404:
                    stdout.write('Network does not exist in backend %s\n' %
                                 backend.clustername)
                else:
                    raise e
def hanging_networks(backend, GNets):
    """Get networks that are not connected to all Nodegroups."""
    def get_network_groups(group_list):
        # Since ganeti 2.10 networks are connected to nodegroups with mode
        # and link AND vlan (ovs extra nicparam), so only take the first
        # element of each entry — the group name.
        return set(entry[0] for entry in group_list)

    with pooled_rapi_client(backend) as c:
        all_groups = set(c.GetGroups())

    hanging = {}
    for net_id, info in GNets.items():
        connected = get_network_groups(info['group_list'])
        if connected != all_groups:
            hanging[net_id] = all_groups - connected
    return hanging
def unsynced_operstate(D, G):
    """Return (id, db_state, ganeti_state) triples whose states disagree.

    :param D: dict mapping server id -> DB operstate string.
    :param G: dict mapping server id -> truthy when running in Ganeti.
    """
    unsynced = set()
    for i in set(D) & set(G):
        up_in_ganeti = G[i]
        mismatch = (up_in_ganeti and D[i] != "STARTED") or \
                   (not up_in_ganeti and
                    D[i] not in ('BUILD', 'ERROR', 'STOPPED'))
        if mismatch:
            unsynced.add((i, D[i], G[i]))
        if not up_in_ganeti and D[i] == 'BUILD':
            # A down BUILDing server is only unsynced once its creation job
            # has actually succeeded.
            vm = VirtualMachine.objects.get(id=i)
            if needs_reconciliation(vm):
                with pooled_rapi_client(vm) as c:
                    try:
                        job_info = c.GetJobStatus(job_id=vm.backendjobid)
                        if job_info['status'] == 'success':
                            unsynced.add((i, D[i], G[i]))
                    except GanetiApiError:
                        pass
    return unsynced
def _get_cluster_stats(bend):
    """Get information about a Ganeti cluster and all of it's nodes.

    Returns a (clustername, cluster_info) tuple. Virtual totals come from
    the DB flavors of the backend's non-deleted VMs; node stats come from
    the Ganeti RAPI (skipped entirely for offline backends).
    """
    bend_vms = bend.virtual_machines.filter(deleted=False)
    vm_stats = bend_vms.aggregate(Sum("flavor__cpu"),
                                  Sum("flavor__ram"),
                                  Sum("flavor__disk"))
    cluster_info = {
        "drained": bend.drained,
        "offline": bend.offline,
        "hypervisor": bend.hypervisor,
        "disk_templates": bend.disk_templates,
        "virtual_servers": bend_vms.count(),
        "virtual_cpu": (vm_stats["flavor__cpu__sum"] or 0),
        # << 20 / << 30: presumably flavor RAM is in MiB and disk in GiB,
        # converted to bytes here — TODO confirm against the Flavor model.
        "virtual_ram": (vm_stats["flavor__ram__sum"] or 0) << 20,
        "virtual_disk": (vm_stats["flavor__disk__sum"] or 0) << 30,
        "nodes": {},
    }
    nodes = []
    if not bend.offline:
        with pooled_rapi_client(bend) as c:
            nodes = c.GetNodes(bulk=True)
    for node in nodes:
        _node_stats = {
            "drained": node["drained"],
            "offline": node["offline"],
            "vm_capable": node["vm_capable"],
            "instances": node["pinst_cnt"],
            # `or 0`: Ganeti reports None for some fields on offline nodes.
            "cpu": (node["ctotal"] or 0),
            "ram": {
                "total": (node["mtotal"] or 0) << 20,
                "free": (node["mfree"] or 0) << 20
            },
            "disk": {
                "total": (node["dtotal"] or 0) << 20,
                "free": (node["dfree"] or 0) << 20
            },
        }
        cluster_info["nodes"][node["name"]] = _node_stats
    return bend.clustername, cluster_info
def instances_with_build_errors(D, G):
    """Return IDs of servers stuck in BUILD whose creation failed.

    :param D: dict mapping server id -> DB operstate string.
    :param G: dict mapping server id -> truthy when running in Ganeti.
    """
    failed = set()
    for i in set(D) & set(G):
        if G[i] or D[i] != 'BUILD':
            continue
        vm = VirtualMachine.objects.get(id=i)
        if not vm.backendjobid:
            # VM has not been enqueued in the backend
            if datetime.now() > vm.created + timedelta(seconds=120):
                # If a job has not been enqueued after 2 minutes, then
                # it must be a stale entry..
                failed.add(i)
        elif needs_reconciliation(vm):
            # Check time to avoid many rapi calls
            with pooled_rapi_client(vm) as c:
                try:
                    job_info = c.GetJobStatus(job_id=vm.backendjobid)
                    if job_info['status'] == 'error':
                        failed.add(i)
                except GanetiApiError:
                    failed.add(i)
    return failed
def _delete_network(network, backend, depends=None):
    """Submit a Ganeti job deleting *network* from *backend*.

    :param depends: job IDs the delete job should depend on.
    :returns: the Ganeti job ID.
    """
    # Avoid the mutable-default-argument pitfall (was `depends=[]`).
    if depends is None:
        depends = []
    depends = create_job_dependencies(depends)
    with pooled_rapi_client(backend) as client:
        return client.DeleteNetwork(network.backend_id, depends)
def reboot_instance(vm, reboot_type):
    """Reboot *vm*; *reboot_type* must be 'soft' or 'hard'."""
    assert reboot_type in ('soft', 'hard')
    instance = vm.backend_vm_id
    with pooled_rapi_client(vm) as client:
        return client.RebootInstance(instance, reboot_type,
                                     dry_run=settings.TEST)
def get_network_info(backend_network):
    """Fetch the Ganeti network record behind *backend_network*."""
    ganeti_name = backend_network.network.backend_id
    with pooled_rapi_client(backend_network) as client:
        return client.GetNetwork(ganeti_name)
def get_instance_info(vm):
    """Fetch the Ganeti instance record of *vm*."""
    instance = vm.backend_vm_id
    with pooled_rapi_client(vm) as client:
        return client.GetInstance(instance)
def shutdown_instance(vm, shutdown_timeout=None):
    """Shut down *vm*, optionally bounding the graceful-shutdown timeout."""
    instance = vm.backend_vm_id
    with pooled_rapi_client(vm) as client:
        return client.ShutdownInstance(instance,
                                       timeout=shutdown_timeout,
                                       dry_run=settings.TEST)
def startup_instance(vm):
    """Start *vm* on its Ganeti backend."""
    instance = vm.backend_vm_id
    with pooled_rapi_client(vm) as client:
        return client.StartupInstance(instance, dry_run=settings.TEST)
def update_db(vm, msg, event_time):
    """Process a notification of type 'ganeti-op-status'

    Normally delegates to backend_mod.process_op_status(). The one special
    case handled here is a failed OP_INSTANCE_CREATE with opportunistic
    locking: the creation is retried once with opportunistic locking off.
    """
    log.debug("Processing ganeti-op-status msg: %s", msg)
    if msg['type'] != "ganeti-op-status":
        log.error("Message is of unknown type %s.", msg['type'])
        return

    operation = msg["operation"]
    status = msg["status"]
    jobID = msg["jobId"]
    logmsg = msg["logmsg"]
    nics = msg.get("instance_nics", None)
    job_fields = msg.get("job_fields", {})
    result = msg.get("result", [])

    # Special case: OP_INSTANCE_CREATE with opportunistic locking may fail
    # if all Ganeti nodes are already locked. Retry the job without
    # opportunistic locking..
    if (operation == "OP_INSTANCE_CREATE" and status == "error" and
       job_fields.get("opportunistic_locking", False)):
        try:
            error_code = result[1][1]
        except IndexError:
            error_code = None
        if error_code == rapi.ECODE_TEMP_NORES:
            if vm.backendjobid != jobID:
                # The job has already been retried!
                return
            # Remove extra fields. Fixed: use pop defaults so missing keys
            # do not raise KeyError (previously `pop("OP_ID")` etc. could
            # crash the retry path when a field was absent).
            [job_fields.pop(f, None) for f in ("OP_ID", "reason")]
            # Remove 'pnode' and 'snode' if they were set by Ganeti
            # iallocator. Ganeti will fail if both allocator and nodes are
            # specified.
            allocator = job_fields.pop("iallocator", None)
            if allocator is not None:
                [job_fields.pop(f, None) for f in ("pnode", "snode")]
            name = job_fields.pop("name",
                                  job_fields.pop("instance_name", None))
            # Turn off opportunistic locking before retrying the job
            job_fields["opportunistic_locking"] = False
            with pooled_rapi_client(vm) as c:
                jobID = c.CreateInstance(name=name, **job_fields)
            # Update the VM fields
            vm.backendjobid = jobID
            # Update the task_job_id for commissions
            vm.task_job_id = jobID
            vm.backendjobstatus = None
            vm.save()
            log.info(
                "Retrying failed creation of instance '%s' without"
                " opportunistic locking. New job ID: '%s'", name, jobID)
            return

    backend_mod.process_op_status(vm, event_time, jobID, operation, status,
                                  logmsg, nics=nics, job_fields=job_fields)
    log.debug("Done processing ganeti-op-status msg for vm %s.",
              msg['instance'])
def _create_network_synced(network, backend):
    """Create *network* on *backend* and block until the job finishes."""
    with pooled_rapi_client(backend) as client:
        job = _create_network(network, backend)
        return wait_for_job(client, job)
def delete_instance(vm):
    """Submit a Ganeti job deleting the instance of *vm*."""
    instance = vm.backend_vm_id
    with pooled_rapi_client(vm) as client:
        return client.DeleteInstance(instance, dry_run=settings.TEST)
def get_jobs(backend, bulk=True):
    """Return the Ganeti job list of *backend* (bulk records by default)."""
    with pooled_rapi_client(backend) as client:
        return client.GetJobs(bulk=bulk)
def get_instances(backend, bulk=True):
    """Return the Ganeti instance list of *backend* (bulk by default)."""
    with pooled_rapi_client(backend) as client:
        return client.GetInstances(bulk=bulk)
def handle_request(client, msg):
    """Callback function for handling requests.

    Currently only 'status-check' action is supported.

    Checks each online backend in two ways: a dry-run RAPI tag-add proves
    RAPI reachability, and the heartbeat the tag triggers from
    snf-ganeti-eventd proves the eventd is alive. The aggregated status is
    published back to the requester's reply_to queue.
    """
    client.basic_ack(msg)
    log.debug("Received request message: %s", msg)
    body = json.loads(msg["body"])
    reply_to = None
    try:
        reply_to = body["reply_to"]
        reply_to = reply_to.encode("utf-8")
        action = body["action"]
        assert (action == "status-check")
    except (KeyError, AssertionError) as e:
        log.warning("Invalid request message: %s. Error: %s", msg, e)
        # Only answer when the sender told us where to reply.
        if reply_to is not None:
            msg = {"status": "failed", "reason": "Invalid request"}
            client.basic_publish("", reply_to, json.dumps(msg))
        return

    # Acknowledge that the check has started before doing the slow work.
    msg = {"action": action, "status": "started"}
    client.basic_publish("", reply_to, json.dumps(msg))

    # Declare 'heartbeat' queue and bind it to the exchange. The queue is
    # declared with a 'ttl' option in order to be automatically deleted.
    hostname, pid = get_hostname(), os.getpid()
    queue = queues.get_dispatcher_heartbeat_queue(hostname, pid)
    exchange = settings.EXCHANGE_GANETI
    routing_key = queues.EVENTD_HEARTBEAT_ROUTING_KEY
    client.queue_declare(queue=queue, mirrored=False,
                         ttl=HEARTBEAT_QUEUE_TTL)
    client.queue_bind(queue=queue, exchange=exchange,
                      routing_key=routing_key)
    log.debug("Binding %s(%s) to queue %s", exchange, routing_key, queue)

    backends = Backend.objects.filter(offline=False)
    status = {}
    _OK = "ok"
    _FAIL = "fail"
    # Add cluster tag to trigger snf-ganeti-eventd
    tag = "snf:eventd:heartbeat:%s:%s" % (hostname, pid)
    for backend in backends:
        cluster = backend.clustername
        status[cluster] = {"RAPI": _FAIL, "eventd": _FAIL}
        try:
            with pooled_rapi_client(backend) as rapi:
                rapi.AddClusterTags(tags=[tag], dry_run=True)
        # NOTE(review): bare except — deliberately best-effort so one bad
        # cluster cannot abort the whole status check.
        except:
            log.exception("Failed to send job to Ganeti cluster '%s' during"
                          " status check" % cluster)
            continue
        status[cluster]["RAPI"] = _OK

    # Poll the heartbeat queue until every cluster's eventd answered or the
    # timeout expires.
    start = time.time()
    while time.time() - start <= HEARTBEAT_TIMEOUT:
        msg = client.basic_get(queue, no_ack=True)
        if msg is None:
            time.sleep(0.1)
            continue
        log.debug("Received heartbeat msg: %s", msg)
        try:
            body = json.loads(msg["body"])
            cluster = body["cluster"]
            status[cluster]["eventd"] = _OK
        except:
            log.error("Received invalid heartbat msg: %s", msg)
        # Python 2 filter() returns a list; empty means no eventd still FAIL.
        if not filter(lambda x: x["eventd"] == _FAIL, status.values()):
            break

    # Send back status report
    client.basic_publish("", reply_to, json.dumps({"status": status}))
def _delete_network(network, backend, depends=None):
    """Submit a Ganeti job deleting *network* from *backend*.

    :param depends: job IDs the delete job should depend on.
    :returns: the Ganeti job ID.
    """
    # Avoid the mutable-default-argument pitfall (was `depends=[]`).
    if depends is None:
        depends = []
    depends = [[job, ["success", "error", "canceled"]] for job in depends]
    with pooled_rapi_client(backend) as client:
        return client.DeleteNetwork(network.backend_id, depends)
def get_ganeti_jobs(backend=None, bulk=False):
    """Return the job records of one backend, or of all backends.

    :param backend: restrict to a single backend; None means all.
    :param bulk: pass bulk=True to the Ganeti RAPI for full job records.
    :returns: a single flat list of job records.
    """
    all_jobs = []
    for bend in get_backends(backend):
        with pooled_rapi_client(bend) as client:
            all_jobs.extend(client.GetJobs(bulk=bulk))
    return all_jobs
def create_instance(vm, nics, flavor, image):
    """Submit a Ganeti CreateInstance job for `vm`.

    `image` is a dictionary which should contain the keys:
            'backend_id', 'format' and 'metadata'

        metadata value should be a dictionary.

    Returns the Ganeti job ID of the submitted creation job.
    """
    # Start from deployment-specific defaults so the administrator can
    # override arguments such as the disk template, the name of the os
    # provider and hypervisor-specific parameters at will
    # (see Synnefo #785, #835). Defined in
    # settings.GANETI_CREATEINSTANCE_KWARGS.
    params = vm.backend.get_create_params()
    params['mode'] = 'create'
    params['name'] = vm.backend_vm_id

    # Disk layout: a single root disk sized from the flavor (GB -> MB).
    params['disk_template'] = flavor.disk_template
    root_disk = {"size": flavor.disk * 1024}
    params['disks'] = [root_disk]
    provider = flavor.disk_provider
    if provider:
        root_disk['provider'] = provider
        root_disk['origin'] = flavor.disk_origin
        extra_disk_params = settings.GANETI_DISK_PROVIDER_KWARGS.get(provider)
        if extra_disk_params is not None:
            root_disk.update(extra_disk_params)

    params['nics'] = [{"name": nic.backend_uuid,
                       "network": nic.network.backend_id,
                       "ip": nic.ipv4_address}
                      for nic in nics]

    # Make the creation depend on the jobs that activate the networks of
    # the instance's NICs on this backend.
    backend = vm.backend
    depend_jobs = []
    for nic in nics:
        _bnet, job_ids = ensure_network_is_active(backend, nic.network_id)
        depend_jobs.extend(job_ids)
    params["depends"] = create_job_dependencies(depend_jobs)

    params['ip_check'] = False
    params['name_check'] = False

    # Do not specify a node explicitly; have Ganeti use an iallocator
    # instead.

    params['dry_run'] = settings.TEST

    params['beparams'] = {'auto_balance': True,
                          'vcpus': flavor.cpu,
                          'memory': flavor.ram}

    params['osparams'] = {'config_url': vm.config_url,
                          # Store image id and format to Ganeti
                          'img_id': image['backend_id'],
                          'img_format': image['format']}

    # Use opportunistic locking
    params['opportunistic_locking'] = settings.GANETI_USE_OPPORTUNISTIC_LOCKING

    params['hvparams'] = dict(serial_console=False)

    log.debug("Creating instance %s", utils.hide_pass(params))
    with pooled_rapi_client(vm) as client:
        return client.CreateInstance(**params)
def handle(self, **options): verbosity = int(options['verbosity']) self._process_args(options) backend_id = options['backend-id'] backend = get_backend(backend_id) if backend_id else None G, GNics = reconciliation.get_instances_from_ganeti(backend) D = reconciliation.get_servers_from_db(backend) DBNics = reconciliation.get_nics_from_db(backend) # # Detect problems # if options['detect_stale']: stale = reconciliation.stale_servers_in_db(D, G) if len(stale) > 0: print >> sys.stderr, "Found the following stale server IDs: " print " " + "\n ".join([str(x) for x in stale]) elif verbosity == 2: print >> sys.stderr, "Found no stale server IDs in DB." if options['detect_orphans']: orphans = reconciliation.orphan_instances_in_ganeti(D, G) if len(orphans) > 0: print >> sys.stderr, "Found orphan Ganeti instances with IDs: " print " " + "\n ".join([str(x) for x in orphans]) elif verbosity == 2: print >> sys.stderr, "Found no orphan Ganeti instances." if options['detect_unsynced']: unsynced = reconciliation.unsynced_operstate(D, G) if len(unsynced) > 0: print >> sys.stderr, "The operstate of the following server" \ " IDs is out-of-sync:" print " " + "\n ".join([ "%d is %s in DB, %s in Ganeti" % (x[0], x[1], ('UP' if x[2] else 'DOWN')) for x in unsynced ]) elif verbosity == 2: print >> sys.stderr, "The operstate of all servers is in sync." if options['detect_build_errors']: build_errors = reconciliation.instances_with_build_errors(D, G) if len(build_errors) > 0: msg = "The os for the following server IDs was not build"\ " successfully:" print >> sys.stderr, msg print " " + "\n ".join(["%d" % x for x in build_errors]) elif verbosity == 2: print >> sys.stderr, "Found no instances with build errors." 
if options['detect_unsynced_nics']: def pretty_print_nics(nics): if not nics: print ''.ljust(18) + 'None' for index, info in nics.items(): print ''.ljust(18) + 'nic/' + str(index) +\ ': MAC: %s, IP: %s, Network: %s' % \ (info['mac'], info['ipv4'], info['network']) unsynced_nics = reconciliation.unsynced_nics(DBNics, GNics) if len(unsynced_nics) > 0: msg = "The NICs of the servers with the following IDs are"\ " unsynced:" print >> sys.stderr, msg for id, nics in unsynced_nics.items(): print ''.ljust(2) + '%6d:' % id print ''.ljust(8) + '%8s:' % 'DB' pretty_print_nics(nics[0]) print ''.ljust(8) + '%8s:' % 'Ganeti' pretty_print_nics(nics[1]) elif verbosity == 2: print >> sys.stderr, "All instance nics are synced." # # Then fix them # if options['fix_stale'] and len(stale) > 0: print >> sys.stderr, \ "Simulating successful Ganeti removal for %d " \ "servers in the DB:" % len(stale) for vm in VirtualMachine.objects.filter(pk__in=stale): event_time = datetime.datetime.now() backend_mod.process_op_status( vm=vm, etime=event_time, jobid=-0, opcode='OP_INSTANCE_REMOVE', status='success', logmsg='Reconciliation: simulated Ganeti event') print >> sys.stderr, " ...done" if options['fix_orphans'] and len(orphans) > 0: print >> sys.stderr, \ "Issuing OP_INSTANCE_REMOVE for %d Ganeti instances:" % \ len(orphans) for id in orphans: try: vm = VirtualMachine.objects.get(pk=id) with pooled_rapi_client(vm) as client: client.DeleteInstance(utils.id_to_instance_name(id)) except VirtualMachine.DoesNotExist: print >> sys.stderr, "No entry for VM %d in DB !!" 
% id print >> sys.stderr, " ...done" if options['fix_unsynced'] and len(unsynced) > 0: print >> sys.stderr, "Setting the state of %d out-of-sync VMs:" % \ len(unsynced) for id, db_state, ganeti_up in unsynced: vm = VirtualMachine.objects.get(pk=id) opcode = "OP_INSTANCE_REBOOT" if ganeti_up \ else "OP_INSTANCE_SHUTDOWN" event_time = datetime.datetime.now() backend_mod.process_op_status( vm=vm, etime=event_time, jobid=-0, opcode=opcode, status='success', logmsg='Reconciliation: simulated Ganeti event') print >> sys.stderr, " ...done" if options['fix_build_errors'] and len(build_errors) > 0: print >> sys.stderr, "Setting the state of %d build-errors VMs:" %\ len(build_errors) for id in build_errors: vm = VirtualMachine.objects.get(pk=id) event_time = datetime.datetime.now() backend_mod.process_op_status( vm=vm, etime=event_time, jobid=-0, opcode="OP_INSTANCE_CREATE", status='error', logmsg='Reconciliation: simulated Ganeti event') print >> sys.stderr, " ...done" if options['fix_unsynced_nics'] and len(unsynced_nics) > 0: print >> sys.stderr, "Setting the nics of %d out-of-sync VMs:" % \ len(unsynced_nics) for id, nics in unsynced_nics.items(): vm = VirtualMachine.objects.get(pk=id) nics = nics[1] # Ganeti nics if nics == {}: # No nics vm.nics.all.delete() continue for index, nic in nics.items(): net_id = utils.id_from_network_name(nic['network']) subnet6 = Network.objects.get(id=net_id).subnet6 # Produce ipv6 ipv6 = subnet6 and mac2eui64(nic['mac'], subnet6) or None nic['ipv6'] = ipv6 # Rename ipv4 to ip nic['ip'] = nic['ipv4'] # Dict to sorted list final_nics = [] nics_keys = nics.keys() nics_keys.sort() for i in nics_keys: if nics[i]['network']: final_nics.append(nics[i]) else: print 'Network of nic %d of vm %s is None. ' \ 'Can not reconcile' % (i, vm.backend_vm_id) event_time = datetime.datetime.now() backend_mod.process_net_status(vm=vm, etime=event_time, nics=final_nics) print >> sys.stderr, " ...done"
def console(vm, console_type):
    """Arrange for an OOB console of the specified type

    This method arranges for an OOB console of the specified type.
    Only consoles of type "vnc" are supported for now.

    It uses a running instance of vncauthproxy to setup proper
    VNC forwarding with a random password, then returns the necessary
    VNC connection info to the caller.
    """
    log.info("Get console VM %s, type %s", vm, console_type)

    # Reject consoles for VMs the DB does not consider running.
    if vm.operstate != "STARTED":
        raise faults.BadRequest('Server not in ACTIVE state.')

    # Use RAPI to get VNC console information for this instance
    # RAPI GetInstanceConsole() returns endpoints to the vnc_bind_address,
    # which is a cluster-wide setting, either 0.0.0.0 or 127.0.0.1, and pretty
    # useless (see #783).
    #
    # Until this is fixed on the Ganeti side, construct a console info reply
    # directly.
    #
    # WARNING: This assumes that VNC runs on port network_port on
    #          the instance's primary node, and is probably
    #          hypervisor-specific.
    def get_console_data(i):
        # Build the console endpoint from a RAPI GetInstance() reply dict.
        return {"kind": "vnc",
                "host": i["pnode"],
                "port": i["network_port"]}
    with pooled_rapi_client(vm) as c:
        i = c.GetInstance(vm.backend_vm_id)
    console_data = get_console_data(i)

    # KVM instances with a serial console enabled cannot be served here.
    if vm.backend.hypervisor == "kvm" and i['hvparams']['serial_console']:
        raise Exception("hv parameter serial_console cannot be true")

    # Check that the instance is really running
    if not i["oper_state"]:
        log.warning("VM '%s' is marked as '%s' in DB while DOWN in Ganeti",
                    vm.id, vm.operstate)
        # Instance is not running. Mock a shutdown job to sync DB
        backend.process_op_status(vm, etime=datetime.now(), jobid=0,
                                  opcode="OP_INSTANCE_SHUTDOWN",
                                  status="success",
                                  logmsg="Reconciliation simulated event")
        raise faults.BadRequest('Server not in ACTIVE state.')

    # Let vncauthproxy decide on the source port.
    # The alternative: static allocation, e.g.
    # sport = console_data['port'] - 1000
    sport = 0
    daddr = console_data['host']
    dport = console_data['port']
    password = util.random_password()

    vnc_extra_opts = settings.CYCLADES_VNCAUTHPROXY_OPTS
    # Maintain backwards compatibility with the dict setting
    # (a list setting means several proxies: pick one at random).
    if isinstance(vnc_extra_opts, list):
        vnc_extra_opts = choice(vnc_extra_opts)
    fwd = request_vnc_forwarding(sport, daddr, dport, password,
                                 console_type=console_type,
                                 **vnc_extra_opts)

    if fwd['status'] != "OK":
        log.error("vncauthproxy returned error status: '%s'" % fwd)
        raise faults.ServiceUnavailable('vncauthproxy returned error status')

    # Verify that the VNC server settings haven't changed while the
    # forwarding was being set up (re-read the instance and compare).
    with pooled_rapi_client(vm) as c:
        i = c.GetInstance(vm.backend_vm_id)
    if get_console_data(i) != console_data:
        raise faults.ServiceUnavailable('VNC Server settings changed.')

    # Prefer the proxy-advertised address; fall back to our own FQDN when
    # vncauthproxy does not report one.
    try:
        host = fwd['proxy_address']
    except KeyError:
        host = getfqdn()
    console = {'type': console_type,
               'host': host,
               'port': fwd['source_port'],
               'password': password}
    return console