def images(request, hostname):
    """
    Redirect to list of images on default node storage.
    """
    node = get_node(request, hostname)
    zpools = node.nodestorage_set.all().values_list('zpool', flat=True).order_by('zpool')

    if zpools:
        # Pick the redirect target: node's own default zpool first,
        # then the global default, otherwise the first zpool found.
        node_zpool = node.zpool

        if node_zpool and node_zpool in zpools:
            target_zpool = node_zpool
        elif settings.VMS_STORAGE_DEFAULT in zpools:
            target_zpool = settings.VMS_STORAGE_DEFAULT
        else:
            target_zpool = zpools[0]

        return redirect('node_images_zpool', hostname, target_zpool)

    # No node storages defined -> show the "images disabled" page.
    context = collect_view_data(request, 'node_list')
    context['nodes'] = Node.all()
    context['node'] = node

    return render(request, 'gui/node/images_disabled.html', context)
def images_zpool(request, hostname, zpool):
    """
    List of images on node storages.
    """
    context = collect_view_data(request, 'node_list')
    node = get_node(request, hostname)
    context['node'] = node
    context['nodes'] = Node.all()

    try:
        ns = NodeStorage.objects.select_related('storage').get(node=node, zpool=zpool)
    except NodeStorage.DoesNotExist:
        raise Http404

    context['ns'] = ns
    context['storages'] = (node.nodestorage_set.select_related('storage').all()
                           .order_by('zpool').annotate(imgs=Count('images__uuid')))
    context['images'] = ns.images.select_related('owner', 'dc_bound').all().order_by('name').annotate(dcs=Count('dc'))

    # Build a mapping: image uuid -> list of {hostname, dc} for every VM on this
    # node that uses the image on the selected zpool.
    image_vms = {}

    for vm in node.vm_set.select_related('dc').all().order_by('hostname'):
        for img_uuid in vm.get_image_uuids(zpool=zpool):
            image_vms.setdefault(img_uuid, []).append({'hostname': vm.hostname, 'dc': vm.dc.name})

    context['image_vms'] = image_vms
    context['form'] = NodeStorageImageForm(ns, initial={'node': hostname, 'zpool': zpool})
    context['last_img'] = request.GET.get('last_img', None)

    return render(request, 'gui/node/images.html', context)
def storages(request, hostname):
    """
    List of node storages.
    """
    context = collect_view_data(request, 'node_list')
    context['node'] = node = get_node(request, hostname)
    context['nodes'] = Node.all()
    # NOTE: must be a real (mutable) list - zpools that exist on node storages are
    # removed below via list.remove(); on Python 3 dict.keys() is an immutable view
    # and calling .remove() on it would raise AttributeError.
    context['zpools'] = list(node.zpools.keys())
    context['zpools_missing'] = []
    context['storages'] = node.nodestorage_set.select_related('storage', 'storage__owner').order_by('zpool')\
        .annotate(Count('dc', distinct=True))
    context['form'] = NodeStorageForm(request, node, None, initial={
        'node': node.hostname,
        'owner': request.user.username,
        'access': Storage.PUBLIC,
        'size_coef': Storage.SIZE_COEF,
    })

    for ns in context['storages']:
        try:
            context['zpools'].remove(ns.zpool)
        except ValueError:
            context['zpools_missing'].append(ns.zpool)  # zpool vanished from node

    return render(request, 'gui/node/storages.html', context)
def details(request, hostname):
    """
    Compute node details.
    """
    dc1_settings = get_dc1_settings(request)
    context = collect_view_data(request, 'node_list')
    node = get_node(request, hostname, sr=('owner', ))
    context['node'] = node
    context['nodes'] = Node.all()
    context['node_dcs'] = node.dc.all().values_list('alias', flat=True)
    context['node_vms'] = node.vm_set.count()
    context['node_real_vms'] = node.vm_set.filter(slavevm__isnull=True).count()
    context['form'] = NodeForm(request, node, initial=node.web_data)
    context['mon_sla_enabled'] = settings.MON_ZABBIX_ENABLED and dc1_settings.MON_ZABBIX_NODE_SLA

    # Backup count only makes sense for backup nodes.
    if node.is_backup:
        context['node_backups'] = node.backup_set.count()
    else:
        context['node_backups'] = 0

    view_node_details.send(sender='gui.node.views.details', request=request, context=context)  # Signal!

    return render(request, 'gui/node/details.html', context)
def __init__(self, request, net, *args, **kwargs):
    """Set up dynamic field choices; lock dc_bound for non-staff users."""
    super(AdminNetworkForm, self).__init__(request, net, *args, **kwargs)
    owners = get_owners(request)
    self.fields['owner'].choices = owners.values_list('username', 'username')
    self.fields['nic_tag'].choices = Node.all_nictags_choices()

    # Only staff users are allowed to change the dc_bound flag.
    if not request.user.is_staff:
        self.fields['dc_bound'].widget.attrs['disabled'] = 'disabled'
def backup_definitions(request, hostname):
    """
    List of server backup definitions targeted onto this node.
    """
    node = get_node(request, hostname)
    context = collect_view_data(request, 'node_list')
    context['node'] = node
    context['nodes'] = Node.all()
    context['bkpdefs'] = get_node_bkpdefs(node)

    return render(request, 'gui/node/backup_definitions.html', context)
def node_vm_snapshot_sync_all():
    """
    This is a periodic beat task responsible for syncing node snapshot sizes of all VMs on a compute node.
    """
    for node in Node.all():
        # Only online compute nodes can be synced.
        if not (node.is_online() and node.is_compute):
            continue

        try:
            NodeVmSnapshotList.sync(node)
        except Exception as exc:
            logger.exception(exc)
def mon_node_delete(task_id, sender, node_uuid=None, node_hostname=None, log=LOG, **kwargs):
    """
    Remove host from zabbix.
    """
    assert node_uuid

    # Dummy (unsaved) node object - used only to obtain zabbix_id and to log things.
    dummy_node = Node(uuid=node_uuid, hostname=node_hostname)
    log.obj = dummy_node.log_list

    return get_monitoring(DefaultDc()).node_delete(dummy_node, task_log=log)
def node_status_all():
    """
    This is a special periodic task, run by Danube Cloud mgmt daemon (que.bootsteps.MgmtDaemon) every minute.
    It is responsible for running checks on an unreachable compute node.
    """
    unreachable_nodes = (node for node in Node.all() if node.is_unreachable())

    for node in unreachable_nodes:
        logger.info('Checking status of unreachable node %s', node)
        node_worker_status_update.call(node.hostname, queue=Q_FAST, status='unknown')
        node_check.send('node_status_all', node=node)  # Signal!
def node_list(request):
    """
    List of all compute nodes.
    """
    context = collect_view_data(request, 'node_list')
    context.update({
        'nodes': Node.all(),
        'node_list': get_nodes_extended(request),
        'status_form': NodeStatusForm(request, None),
    })
    view_node_list.send(sender='gui.node.views.list', request=request, context=context)  # Signal!

    return render(request, 'gui/node/list.html', context)
def maintenance(request):
    """
    System maintenance.
    """
    context = collect_view_data(request, 'system_maintenance')
    system_info = call_api_view(request, 'GET', system_version).data.get('result', {})
    context.update({
        'system': system_info,
        'node_list': Node.all(),
        'current_view': 'maintenance',
        'status_form': NodeStatusForm(request, None),
        'update_form': UpdateForm(request, None),
        'node_update_form': NodeUpdateForm(request, None, prefix='node'),
    })

    return render(request, 'gui/system/maintenance.html', context)
def node_img_sources_sync(task_id, sender, **kwargs):
    """
    Task for updating imgadm sources on one or every compute node.
    Called by dc_settings_changed signal.
    """
    new_img_sources = ImageVm().sources

    for node in Node.all():
        # We update imgadm sources only on online nodes
        # But we will also run run_node_img_sources_sync() whenever node status is changed to online because
        # the node_startup handler runs the sysinfo update task
        if node.is_online():
            run_node_img_sources_sync(node, new_img_sources=new_img_sources)
        else:
            logger.warn('Excluding node %s from updating imgadm sources because it is not in online state', node)
def vms(request, hostname, zpool=None):
    """
    List of servers defined on this compute node, optionally filtered by storage (#952).
    """
    context = collect_view_data(request, 'node_list')
    context['node'] = node = get_node(request, hostname)
    context['nodes'] = Node.all()
    context['node_online'] = node.is_online()
    context['can_edit'] = True
    context['storages'] = storages = node.nodestorage_set.select_related('storage').all().order_by('zpool')
    all_vms = node.vm_set.select_related('owner', 'dc', 'slavevm', 'slavevm__master_vm').order_by('hostname')
    context['vms_all_count'] = all_vms.count()
    filtered_vms = []

    # Drop a zpool filter that does not match any storage on this node.
    if zpool and zpool not in {ns.zpool for ns in storages}:
        zpool = None

    for ns in storages:
        ns.vms_count = 0

    for vm in all_vms:
        used_pools = vm.get_used_disk_pools()
        vm.resources = vm.get_cpu_ram_disk(zpool=zpool)

        # Count the VM against every storage whose zpool it uses; collect it
        # into the filtered list when it matches the requested zpool.
        for ns in storages:
            if ns.zpool in used_pools:
                ns.vms_count += 1

                if zpool and zpool == ns.zpool:
                    filtered_vms.append(vm)

    if zpool:
        context['vms'] = filtered_vms
    else:
        context['vms'] = all_vms

    context['zpool'] = zpool

    return render(request, 'gui/node/vms.html', context)
def mon_sync_all(task_id, dc_id, clear_cache=True, sync_groups=True, sync_nodes=True, sync_vms=True, **kwargs):
    """
    Clear Zabbix cache and sync everything in Zabbix. Related to a specific DC.
    Triggered by dc_settings_changed signal.
    """
    datacenter = Dc.objects.get_by_id(int(dc_id))

    if clear_cache:
        logger.info('Clearing zabbix cache in DC %s', datacenter)
        mon_clear_zabbix_cache(datacenter)
        get_monitoring(datacenter)  # Cache new Zabbix instance for tasks below

    if sync_groups:
        logger.info('Running monitoring group synchronization for all user groups in DC %s', datacenter)
        mon_all_groups_sync.call(task_id, dc_name=datacenter.name)

    if sync_nodes:
        logger.info('Running monitoring host synchronization for all compute nodes')

        for compute_node in Node.all():
            mon_node_sync.call(task_id, node_uuid=compute_node.uuid)

    if sync_vms:
        logger.info('Running monitoring host synchronization for all VMs in DC %s', datacenter)

        for uuid in datacenter.vm_set.values_list('uuid', flat=True):
            mon_vm_sync.call(task_id, vm_uuid=uuid)
def choose_node(self, vm):
    """Used by POST vm_manage when node needs to be chosen automatically"""
    new_node = Node.choose(vm)
    err = 'Could not find node with free resources'

    if not new_node:
        raise ExpectationFailed(err)

    # Temporarily switch the request method to PUT so self.put() can be reused,
    # restoring the original request afterwards no matter what happens.
    original_request = self.request
    self.request = set_request_method(original_request, 'PUT')

    try:
        res = self.put(vm, {'node': new_node.hostname})
    finally:
        self.request = original_request

    if res.status_code != scode.HTTP_200_OK:
        try:
            err = res.data['result']['node']
        except Exception as e:
            logger.exception(e)

        raise ExpectationFailed(err)

    return new_node
def validate(self, attrs):  # noqa: R701
    """
    Cross-field validation of network serializer data.

    For partial updates, each missing field falls back to the value stored on
    the current object. Field problems are accumulated in self._errors; only
    the per-DC network limit raises s.ValidationError directly.
    """
    # Resolve every input value, falling back to the existing object's attribute.
    try:
        network = attrs['network']
    except KeyError:
        network = self.object.network

    try:
        netmask = attrs['netmask']
    except KeyError:
        netmask = self.object.netmask

    try:
        vxlan_id = attrs['vxlan_id']
    except KeyError:
        vxlan_id = self.object.vxlan_id

    try:
        mtu = attrs['mtu']
    except KeyError:
        mtu = self.object.mtu

    try:
        nic_tag = attrs['nic_tag']
    except KeyError:
        nic_tag = self.object.nic_tag

    # Validate the network/netmask pair; reserved IPv4 networks are rejected
    # by re-raising ValueError into the same error branch.
    try:
        ip_network = Subnet.get_ip_network(network, netmask)

        if ip_network.is_reserved:
            raise ValueError
    except ValueError:
        self._errors['network'] = self._errors['netmask'] = \
            s.ErrorList([_('Enter a valid IPv4 network and netmask.')])

    # Enforce the per-datacenter network count limit on create (POST) only.
    if self.request.method == 'POST' and self._dc_bound:
        limit = self._dc_bound.settings.VMS_NET_LIMIT

        if limit is not None:
            if Subnet.objects.filter(dc_bound=self._dc_bound).count() >= int(limit):
                raise s.ValidationError(_('Maximum number of networks reached.'))

    nic_tag_type = Node.all_nictags()[nic_tag]  # retrieve all available nictags and see what is the type
                                                # of the current nic tag
    # if type is overlay then vxlan is mandatory argument
    if nic_tag_type == 'overlay rule':
        if not vxlan_id:
            self._errors['vxlan_id'] = s.ErrorList([_('VXLAN ID is required when an '
                                                      'overlay NIC tag is selected.')])
    else:
        # Non-overlay NIC tags must not carry a VXLAN ID.
        attrs['vxlan_id'] = None

    # validate MTU for overlays and etherstubs, and physical nics
    if nic_tag_type == 'overlay rule':
        # if MTU was not set for the overlay
        if not mtu:
            attrs['mtu'] = 1400

        # NOTE(review): when mtu is None/0 this comparison relies on Python 2
        # ordering semantics (None > int is False); on Python 3 it would raise
        # TypeError - confirm the intended runtime.
        if mtu > 8900:
            self._errors['mtu'] = s.ErrorList([s.IntegerField.default_error_messages['max_value'] % {
                'limit_value': 8900
            }])

    # Physical/aggregated NICs must have a standard-or-larger MTU.
    if nic_tag_type in ('normal', 'aggr') and mtu and mtu < 1500:
        self._errors['mtu'] = s.ErrorList([s.IntegerField.default_error_messages['min_value'] % {
            'limit_value': 1500
        }])

    # Optional per-datacenter VLAN/VXLAN ID whitelists.
    if self._dc_bound:
        try:
            vlan_id = attrs['vlan_id']
        except KeyError:
            vlan_id = self.object.vlan_id

        dc_settings = self._dc_bound.settings

        if dc_settings.VMS_NET_VLAN_RESTRICT and vlan_id not in dc_settings.VMS_NET_VLAN_ALLOWED:
            self._errors['vlan_id'] = s.ErrorList([_('VLAN ID is not available in datacenter.')])

        if dc_settings.VMS_NET_VXLAN_RESTRICT and vxlan_id not in dc_settings.VMS_NET_VXLAN_ALLOWED:
            self._errors['vxlan_id'] = s.ErrorList([_('VXLAN ID is not available in datacenter.')])

    return super(NetworkSerializer, self).validate(attrs)
def node_sysinfo_cb(result, task_id, node_uuid=None):
    """
    A callback function for updating Node.json (sysinfo).
    node_uuid will be set only if called via API or GUI.

    Creates a new Node object when the UUID is unknown (including first-ever
    head node bootstrap), otherwise updates the existing node's sysinfo/SSH key.
    """
    # in case the callback is called by restarting erigonesd:fast service on compute node, the meta dict lacks
    # a lot of information; msg is required as part of exception logging inside callback decorator
    # therefore we set it explicitly
    result['meta']['msg'] = LOG_NODE_UPDATE

    if result['returncode'] != 0:
        logger.error('Found nonzero return code in result from esysinfo command on %s', node_uuid)
        raise TaskException(result, 'Got bad return code (%s)' % result['returncode'])

    stdout = result.pop('stdout', '')
    result.pop('stderr', None)
    node_new = False

    # Parse the esysinfo output; img_sources/img_initial are consumed separately below.
    try:
        esysinfo = parse_esysinfo(stdout)
        img_sources = esysinfo.pop('img_sources')
        img_initial = esysinfo.pop('img_initial')
    except Exception as e:
        logger.error('Could not parse output from esysinfo command on %s. Error: %s', node_uuid, e)
        logger.exception(e)
        raise TaskException(result, 'Could not parse esysinfo output')
    else:
        uuid = esysinfo['sysinfo']['UUID']

    try:
        node = Node.objects.get(uuid=uuid)
    except Node.DoesNotExist:
        # --- New compute node path ---
        # The head node must be in online state during the admin DC initialization and each compute node must be in
        # online state during ssh key exchange.
        node_new = True
        is_head = not Node.objects.exists()
        logger.warn('Creating NEW node from sysinfo output from %s', node_uuid)
        node = Node.create_from_sysinfo(uuid, esysinfo, status=Node.ONLINE, is_head=is_head)
        node_created.send(task_id, node=node)  # Signal!
        result['message'] = 'Successfully created new compute node %s' % node.hostname
        task_log_success(task_id, msg=LOG_NODE_CREATE, obj=node, task_result=result, update_user_tasks=True)
        sshkey_changed = bool(node.sshkey)

        if node.is_head:
            # First node ever -> bootstrap the management system and admin DC.
            logger.warn('New node %s is the first node ever created - assuming head node status. '
                        'Initializing mgmt system and creating admin DC', node)
            from api.system.init import init_mgmt

            try:
                init_mgmt(node, images=img_initial)
            except Exception as e:
                logger.exception(e)
                result['message'] = 'Error while initializing admin datacenter (%s)' % e
                task_log_error(task_id, msg=LOG_NODE_CREATE, obj=node, task_result=result, update_user_tasks=True)

            logger.info('Saving node %s IP address "%s" into admin network', node, node.ip_address)

            try:
                # We should proceed even if the IP address is not registered
                node.ip_address.save()
            except Exception as e:
                logger.exception(e)
            else:
                admin_net = node.ip_address.subnet  # The network was updated by init_mgmt()
                # Reload Subnet object because it is cached inside node instance
                admin_net = admin_net.__class__.objects.get(pk=admin_net.pk)
                # We need a request object
                request = get_dummy_request(DefaultDc(), 'POST', system_user=True)
                record_cls = RecordView.Record

                if admin_net.dns_domain and admin_net.dns_domain == node.domain_name:
                    logger.info('Creating forward A DNS record for node %s', node)
                    # This will fail silently
                    RecordView.add_or_update_record(request, record_cls.A, admin_net.dns_domain,
                                                    node.hostname, node.address,
                                                    task_id=task_id, related_obj=node)

                if admin_net.ptr_domain:
                    logger.info('Creating reverse PTR DNS record for node %s', node)
                    # This will fail silently
                    RecordView.add_or_update_record(request, record_cls.PTR, admin_net.ptr_domain,
                                                    record_cls.get_reverse(node.address), node.hostname,
                                                    task_id=task_id, related_obj=node)
    else:
        # --- Existing compute node path ---
        sshkey_changed = node.sshkey_changed(esysinfo)

        if node.sysinfo_changed(esysinfo) or sshkey_changed:
            logger.warn('Updating node %s json with sysinfo output from %s', node, node_uuid)
            node.update_from_sysinfo(esysinfo)  # Will save public SSH key too
            node_json_changed.send(task_id, node=node)  # Signal!
            result['message'] = 'Successfully updated compute node %s' % node.hostname
            task_log_success(task_id, msg=LOG_NODE_UPDATE, obj=node, task_result=result, update_user_tasks=True)
        else:
            result['message'] = 'No changes detected on compute node %s' % node.hostname
            task_log_success(task_id, msg=LOG_NODE_UPDATE, obj=node, task_result=result, update_user_tasks=True)

    if sshkey_changed:
        logger.warn('SSH key has changed on node %s - creating authorized_keys synchronization tasks', node)

        try:
            run_node_authorized_keys_sync()
        except Exception as e:
            logger.exception(e)

    # Keep imgadm sources in sync regardless of what changed.
    try:
        run_node_img_sources_sync(node, node_img_sources=img_sources)
    except Exception as e:
        logger.exception(e)

    if node_new:
        node.del_initializing()
        # Used by esdc-ee to change node status to unlicensed
        node_status = getattr(settings, 'VMS_NODE_STATUS_DEFAULT', None)

        if node_status:
            node.save_status(node_status)  # Set node status (most probably to unlicensed)
    else:
        # Always run vm_status_all for an old compute node
        vm_status_all(task_id, node)

        # Sync snapshots and backup for every node storage
        try:
            NodeVmSnapshotList.sync(node)
        except Exception as e:
            logger.exception(e)

    return result
def node_sysinfo_cb(result, task_id, node_uuid=None):
    """
    A callback function for updating Node.json (sysinfo).
    node_uuid will be set only if called via API or GUI.

    Creates a new Node object when the UUID is unknown (including first-ever
    head node bootstrap via init_mgmt), otherwise updates the existing node's
    sysinfo/SSH key and refreshes the cached system version.
    """
    # in case the callback is called by restarting erigonesd:fast service on compute node, the meta dict lacks
    # a lot of information; msg is required as part of exception logging inside callback decorator
    # therefore we set it explicitly
    result['meta']['msg'] = LOG_NODE_UPDATE

    if result['returncode'] != 0:
        logger.error('Found nonzero return code in result from esysinfo command on %s', node_uuid)
        raise TaskException(result, 'Got bad return code (%s)' % result['returncode'])

    stdout = result.pop('stdout', '')
    result.pop('stderr', None)
    node_new = False

    # Parse the esysinfo output; img_sources/img_initial are consumed separately below.
    try:
        esysinfo = parse_esysinfo(stdout)
        img_sources = esysinfo.pop('img_sources')
        img_initial = esysinfo.pop('img_initial')
    except Exception as e:
        logger.error('Could not parse output from esysinfo command on %s. Error: %s', node_uuid, e)
        logger.exception(e)
        raise TaskException(result, 'Could not parse esysinfo output')
    else:
        uuid = esysinfo['sysinfo']['UUID']

    try:
        node = Node.objects.get(uuid=uuid)
    except Node.DoesNotExist:
        # --- New compute node path ---
        # The head node must be in online state during the admin DC initialization and each compute node must be in
        # online state during ssh key exchange.
        node_new = True
        is_head = not Node.objects.exists()
        logger.warn('Creating NEW node from sysinfo output from %s', node_uuid)
        node = Node.create_from_sysinfo(uuid, esysinfo, status=Node.ONLINE, is_head=is_head)
        node_created.send(task_id, node=node)  # Signal!
        result['message'] = 'Successfully created new compute node %s' % node.hostname
        task_log_success(task_id, msg=LOG_NODE_CREATE, obj=node, task_result=result, update_user_tasks=True)
        sshkey_changed = bool(node.sshkey)

        if node.is_head:
            # First node ever -> bootstrap the management system and admin DC.
            logger.warn('New node %s is the first node ever created - assuming head node status. '
                        'Initializing mgmt system and creating admin DC', node)
            from api.system.init import init_mgmt

            try:
                init_mgmt(node, images=img_initial)
            except Exception as e:
                logger.exception(e)
                result['message'] = 'Error while initializing admin datacenter (%s)' % e
                task_log_error(task_id, msg=LOG_NODE_CREATE, obj=node, task_result=result, update_user_tasks=True)

            try:
                _save_node_ip_address(task_id, node)
            except Exception as e:
                logger.exception(e)
    else:
        # --- Existing compute node path ---
        sshkey_changed = node.sshkey_changed(esysinfo)
        sysinfo_changed = node.sysinfo_changed(esysinfo)

        if sysinfo_changed or sshkey_changed:
            logger.warn('Updating node %s json with sysinfo output from %s', node, node_uuid)
            node.update_from_sysinfo(esysinfo)  # Will save public SSH key too
            node_json_changed.send(task_id, node=node)  # Signal!
            result['message'] = 'Successfully updated compute node %s' % node.hostname
        else:
            node_json_unchanged.send(task_id, node=node)  # Signal!
            result['message'] = 'No changes detected on compute node %s' % node.hostname

        task_log_success(task_id, msg=LOG_NODE_UPDATE, obj=node, task_result=result, update_user_tasks=True)

    if sshkey_changed:
        logger.warn('SSH key has changed on node %s - creating authorized_keys synchronization tasks', node)

        try:
            run_node_authorized_keys_sync()
        except Exception as e:
            logger.exception(e)

    # Keep imgadm sources in sync regardless of what changed.
    try:
        run_node_img_sources_sync(node, node_img_sources=img_sources)
    except Exception as e:
        logger.exception(e)

    if node_new:
        node.del_initializing()
        # Used by esdc-ee to change node status to unlicensed
        node_status = getattr(settings, 'VMS_NODE_STATUS_DEFAULT', None)

        if node_status:
            node.save_status(node_status)  # Set node status (most probably to unlicensed)
    else:
        # Always run vm_status_all for an old compute node
        vm_status_all(task_id, node)

        # Sync snapshots and backup for every node storage
        try:
            NodeVmSnapshotList.sync(node)
        except Exception as e:
            logger.exception(e)

    # Refresh cached version information + emit event informing about restarted erigonesd:fast
    try:
        del node.system_version
        # Sometimes the node worker does not respond within the given timeout so we have to try more than once
        for i in range(5):
            if node.system_version:
                break

        logger.info('Node %s has system version %s', node, node.system_version)

        if owner_id_from_task_id(task_id) == TASK_USER:  # internal user ID
            NodeSystemRestarted(node, system_version=node.system_version).send()  # Signal!
    except Exception as e:
        logger.exception(e)

    return result
def __init__(self, request, net, *args, **kwargs):
    """Bind the DC and set up dynamic field choices for detail serialization."""
    super(NetworkSerializer, self).__init__(request, net, *args, **kwargs)
    many = kwargs.get('many', False)

    # Per-object setup is skipped for list (many=True) serialization.
    if not many:
        self._dc_bound = net.dc_bound
        self.fields['owner'].queryset = get_owners(request, all=True)
        self.fields['nic_tag'].choices = Node.all_nictags_choices()
def monitoring(request, hostname, graph_type='cpu'):
    """
    Compute node related monitoring.
    """
    dc1_settings = get_dc1_settings(request)
    context = collect_view_data(request, 'node_list')
    context['node'] = node = get_node(request, hostname)
    context['nodes'] = Node.all()

    if not dc1_settings.MON_ZABBIX_NODE_SYNC:
        return render(request, 'gui/node/monitoring_disabled.html', context)

    from api.mon.node.graphs import GRAPH_ITEMS

    context['graph_items'] = GRAPH_ITEMS
    context['obj_lifetime'] = node.lifetime
    context['obj_operational'] = node.status != Node.STATUS_AVAILABLE_MONITORING and (
        not graph_type.startswith('vm-') or
        node.vm_set.exclude(status=Vm.NOTCREATED).filter(slavevm__isnull=True).exists())

    # Select the graph set according to the requested graph type;
    # anything unknown falls back to the CPU graphs.
    if graph_type == 'memory':
        graphs = (Graph('mem-usage'), Graph('swap-usage'))
    elif graph_type == 'network':
        context['node_nics'] = node_nics = node.used_nics.keys()
        graphs = [graph
                  for nic in node_nics
                  for graph in (Graph('net-bandwidth', nic=nic), Graph('net-packets', nic=nic))]
    elif graph_type == 'storage':
        context['zpools'] = node_zpools = node.zpools
        graphs = [graph
                  for zpool in node_zpools
                  for graph in (Graph('storage-throughput', zpool=zpool),
                                Graph('storage-io', zpool=zpool),
                                Graph('storage-space', zpool=zpool))]
    elif graph_type == 'vm-cpu':
        graphs = (Graph('vm-cpu-usage'), )
    elif graph_type == 'vm-memory':
        graphs = (Graph('vm-mem-usage'), )
    elif graph_type == 'vm-disk-throughput':
        graphs = (
            Graph('vm-disk-logical-throughput-reads'),
            Graph('vm-disk-logical-throughput-writes'),
            Graph('vm-disk-physical-throughput-reads'),
            Graph('vm-disk-physical-throughput-writes'),
        )
    elif graph_type == 'vm-disk-io':
        graphs = (
            Graph('vm-disk-logical-io-reads'),
            Graph('vm-disk-logical-io-writes'),
            Graph('vm-disk-physical-io-reads'),
            Graph('vm-disk-physical-io-writes'),
        )
    else:
        graph_type = 'cpu'
        graphs = (
            Graph('cpu-usage'),
            Graph('cpu-jumps'),
            Graph('cpu-load'),
        )

    context['graphs'] = graphs
    context['graph_type'] = graph_type

    return render(request, 'gui/node/monitoring_%s.html' % graph_type, context)