def add_volume(zkhandler, pool, name, size):
    """
    Create a new RBD volume in a pool and register it in Zookeeper.

    Args:
        zkhandler: Zookeeper handler used to write the new volume's keys.
        pool: Name of the pool to create the volume in.
        name: Name of the new volume.
        size: Requested size. Either a bare byte count (digits only, given
            as a string or an integer) or a string with a unit suffix that
            format_bytes_fromhuman() accepts.

    Returns:
        A (success, message) tuple; on failure the message starts with
        "ERROR:".
    """
    # Callers may hand over an integer byte count; re.match() raises a
    # TypeError on non-string input, so normalise to text first.
    size = str(size)

    # Add 'B' if the volume is in bytes
    if re.match(r"^[0-9]+$", size):
        size = "{}B".format(size)

    # 1. Verify the size of the volume; a request equal to the remaining
    #    free space is rejected as well (>=).
    pool_information = getPoolInformation(zkhandler, pool)
    size_bytes = format_bytes_fromhuman(size)
    if size_bytes >= int(pool_information["stats"]["free_bytes"]):
        return (
            False,
            "ERROR: Requested volume size is greater than the available free space in the pool",
        )

    # 2. Create the volume
    retcode, stdout, stderr = common.run_os_command(
        "rbd create --size {} {}/{}".format(size, pool, name)
    )
    if retcode:
        return False, 'ERROR: Failed to create RBD volume "{}": {}'.format(
            name, stderr
        )

    # 3. Get volume stats (the exit status of this call is not checked; the
    #    raw stdout is stored as-is)
    retcode, stdout, stderr = common.run_os_command(
        "rbd info --format json {}/{}".format(pool, name)
    )
    volstats = stdout

    # 4. Add the new volume to Zookeeper
    zkhandler.write(
        [
            (("volume", f"{pool}/{name}"), ""),
            (("volume.stats", f"{pool}/{name}"), volstats),
            (("snapshot", f"{pool}/{name}"), ""),
        ]
    )

    return True, 'Created RBD volume "{}/{}" ({}).'.format(pool, name, size)
def clone_volume(zkhandler, pool, name_src, name_new):
    """
    Copy an existing RBD volume to a new volume in the same pool.

    verifyVolume() must report the source volume as present. After a
    successful "rbd copy" the new volume is written to Zookeeper together
    with the raw output of "rbd info --format json".

    Returns:
        A (success, message) tuple.
    """
    if not verifyVolume(zkhandler, pool, name_src):
        missing = 'ERROR: No volume with name "{}" is present in pool "{}".'
        return False, missing.format(name_src, pool)

    source_path = f"{pool}/{name_src}"
    target_path = f"{pool}/{name_new}"

    # Step 1: copy the image
    retcode, stdout, stderr = common.run_os_command(
        f"rbd copy {source_path} {target_path}"
    )
    if retcode:
        failed = 'ERROR: Failed to clone RBD volume "{}" to "{}" in pool "{}": {}'
        return (False, failed.format(name_src, name_new, pool, stderr))

    # Step 2: collect the stats of the copy (exit status is not checked)
    retcode, stdout, stderr = common.run_os_command(
        f"rbd info --format json {target_path}"
    )

    # Step 3: register the copy in Zookeeper
    new_entries = [
        (("volume", target_path), ""),
        (("volume.stats", target_path), stdout),
        (("snapshot", target_path), ""),
    ]
    zkhandler.write(new_entries)

    done = 'Cloned RBD volume "{}" to "{}" in pool "{}"'
    return True, done.format(name_src, name_new, pool)
def createNetworkBridged(self):
    """
    Create the vLAN interface and bridge for a bridged network.

    Runs, in order: vLAN link creation on top of self.bridge_dev, bridge
    creation, the MTU update, then the bridge tweaks and the enslaving of
    the vLAN interface. Command results are not inspected.
    """
    run = common.run_os_command

    self.logger.out(
        f"Creating bridged vLAN device {self.base_nic} on interface {self.bridge_dev}",
        prefix=f"VNI {self.vni}",
        state="i",
    )

    # vLAN interface first, then the bridge it will be attached to
    run(
        f"ip link add link {self.bridge_dev} name {self.base_nic} type vlan id {self.vni}"
    )
    run(f"brctl addbr {self.bridge_nic}")

    self.updateNetworkMTU()

    # tx checksum offload breaks DHCP on Debian < 9
    run(f"ethtool -K {self.bridge_nic} tx off")
    # IPv6 is disabled on the bridge to prevent leakage
    run(f"sysctl net.ipv6.conf.{self.bridge_nic}.disable_ipv6=1")
    # Finally attach the vLAN interface to the bridge
    run(f"brctl addif {self.bridge_nic} {self.base_nic}")
def createNetworkManaged(self):
    """
    Create the VXLAN interface and bridge for a managed network.

    Runs, in order: VXLAN link creation on self.cluster_dev (dstport 4789),
    bridge creation, the MTU update, then the bridge tweaks and the
    enslaving of the VXLAN interface. Command results are not inspected.
    """
    run = common.run_os_command

    self.logger.out(
        f"Creating VXLAN device on interface {self.cluster_dev}",
        prefix=f"VNI {self.vni}",
        state="i",
    )

    # VXLAN interface first, then the bridge it will be attached to
    run(
        f"ip link add {self.base_nic} type vxlan id {self.vni} dstport 4789 dev {self.cluster_dev}"
    )
    run(f"brctl addbr {self.bridge_nic}")

    self.updateNetworkMTU()

    # tx checksum offload breaks DHCP on Debian < 9
    run(f"ethtool -K {self.bridge_nic} tx off")
    # Turn off IPv6 duplicate address detection on the bridge
    run(f"sysctl net.ipv6.conf.{self.bridge_nic}.accept_dad=0")
    # Finally attach the VXLAN interface to the bridge
    run(f"brctl addif {self.bridge_nic} {self.base_nic}")
def set_pgs_pool(zkhandler, name, pgs):
    """
    Change the placement group count of an existing pool.

    Args:
        zkhandler: Zookeeper handler; the stored count is read from and
            written to the ("pool.pgs", name) key.
        name: Name of the pool.
        pgs: New placement group count; anything int() accepts. It must be
            a non-zero power of 2.

    Returns:
        A (success, message) tuple.

    Raises:
        ValueError: if pgs cannot be converted with int().
    """
    if not verifyPool(zkhandler, name):
        return False, f'ERROR: No pool with name "{name}" is present in the cluster.'

    # Validate new PGs count; n & (n - 1) is zero only for powers of two
    pgs = int(pgs)
    if (pgs == 0) or (pgs & (pgs - 1) != 0):
        return (
            False,
            f'ERROR: Invalid PGs number "{pgs}": must be a non-zero power of 2.',
        )

    # Set the new pgs number
    retcode, stdout, stderr = common.run_os_command(
        f"ceph osd pool set {name} pg_num {pgs}"
    )
    if retcode:
        return False, f"ERROR: Failed to set pg_num on pool {name} to {pgs}: {stderr}"

    # Set the new pgp number
    # NOTE(review): the original comment here said "if increasing", but the
    # condition below is true when the stored count is greater than or equal
    # to the new one (i.e. not increasing). The condition is left untouched;
    # confirm which direction is intended.
    current_pgs = int(zkhandler.read(("pool.pgs", name)))
    if current_pgs >= pgs:
        retcode, stdout, stderr = common.run_os_command(
            f"ceph osd pool set {name} pgp_num {pgs}"
        )
        if retcode:
            # This branch is about pgp_num, so say so in the error
            return (
                False,
                f"ERROR: Failed to set pgp_num on pool {name} to {pgs}: {stderr}",
            )

    # Update Zookeeper count
    zkhandler.write(
        [
            (("pool.pgs", name), pgs),
        ]
    )

    return True, f'Set PGs count to {pgs} for RBD pool "{name}".'
def rename_volume(zkhandler, pool, name, new_name):
    """
    Rename an RBD volume within its pool and move its Zookeeper keys.

    After "rbd rename" succeeds, the "volume" and "snapshot" keys are
    renamed and the stats of the renamed volume are refreshed from
    "rbd info --format json".

    Returns:
        A (success, message) tuple.
    """
    if not verifyVolume(zkhandler, pool, name):
        missing = 'ERROR: No volume with name "{}" is present in pool "{}".'
        return False, missing.format(name, pool)

    old_key = f"{pool}/{name}"
    new_key = f"{pool}/{new_name}"

    # Step 1: rename the image itself
    retcode, stdout, stderr = common.run_os_command(
        f"rbd rename {old_key} {new_name}"
    )
    if retcode:
        failed = 'ERROR: Failed to rename volume "{}" to "{}" in pool "{}": {}'
        return (False, failed.format(name, new_name, pool, stderr))

    # Step 2: move the Zookeeper keys to the new name
    zkhandler.rename(
        [
            (("volume", old_key), ("volume", new_key)),
            (("snapshot", old_key), ("snapshot", new_key)),
        ]
    )

    # Step 3: re-read the stats (exit status is not checked)
    retcode, stdout, stderr = common.run_os_command(
        f"rbd info --format json {new_key}"
    )

    # Step 4: store them under the new name
    zkhandler.write([(("volume.stats", new_key), stdout)])

    done = 'Renamed RBD volume "{}" to "{}" in pool "{}".'
    return True, done.format(name, new_name, pool)
def updateNetworkMTU(self):
    """
    Apply self.vx_mtu to the base and bridge NICs and bring both up.
    """
    self.logger.out(
        f"Setting network MTU to {self.vx_mtu}",
        prefix=f"VNI {self.vni}",
        state="i",
    )

    # Base NIC first, then the bridge
    for nic in (self.base_nic, self.bridge_nic):
        common.run_os_command(f"ip link set {nic} mtu {self.vx_mtu} up")
def removeNetworkManaged(self):
    """
    Tear down the bridge and base interface of a managed network.

    Both interfaces are set down, the base NIC is detached from the bridge,
    then the bridge and the base link are deleted. Command results are not
    inspected.
    """
    self.logger.out(
        f"Removing VNI device on interface {self.cluster_dev}",
        prefix=f"VNI {self.vni}",
        state="i",
    )

    teardown_commands = [
        f"ip link set {self.bridge_nic} down",
        f"ip link set {self.base_nic} down",
        f"brctl delif {self.bridge_nic} {self.base_nic}",
        f"brctl delbr {self.bridge_nic}",
        f"ip link delete {self.base_nic}",
    ]
    for command in teardown_commands:
        common.run_os_command(command)
def create_osd_db_lv(zkhandler, logger, osd_id, ext_db_ratio, osd_size_bytes):
    """
    Create the database logical volume "osd-db/osd-<osd_id>" for an OSD.

    Args:
        zkhandler: Unused here; kept for interface compatibility.
        logger: Object with an out(message, state=...) method.
        osd_id: ID of the OSD the LV belongs to.
        ext_db_ratio: Fraction of the OSD size to allocate to the DB LV.
        osd_size_bytes: Size of the OSD in bytes.

    Returns:
        True when the LV was created, False when it already exists or the
        creation failed (the reason is logged).
    """
    lv_name = "osd-{}".format(osd_id)

    logger.out(
        "Creating new OSD database logical volume for OSD ID {}".format(osd_id),
        state="i",
    )
    try:
        # 0. Check if an existing logical volume exists. The probe must use
        #    the same name the LV is created with below ("osd-<id>").
        #    Any exit status other than 5 is treated as "already exists".
        retcode, stdout, stderr = common.run_os_command(
            "lvdisplay osd-db/{}".format(lv_name)
        )
        if retcode != 5:
            logger.out(
                'Ceph OSD database LV "osd-db/{}" already exists'.format(lv_name),
                state="e",
            )
            return False

        # 1. Determine LV sizing: bytes * ratio, divided down twice by 1024
        osd_db_size = int(osd_size_bytes * ext_db_ratio / 1024 / 1024)

        # 2. Create the LV
        logger.out(
            'Creating DB LV "osd-db/osd-{}" of {}M ({} * {})'.format(
                osd_id, osd_db_size, osd_size_bytes, ext_db_ratio
            ),
            state="i",
        )
        retcode, stdout, stderr = common.run_os_command(
            "lvcreate --yes --name {} --size {} osd-db".format(lv_name, osd_db_size)
        )
        if retcode:
            # Carry the command output in the exception so it ends up in the
            # log entry below instead of on stdout.
            raise Exception(
                "lvcreate exited with status {}: {} {}".format(
                    retcode, stdout, stderr
                )
            )

        # Log it
        logger.out(
            'Created new OSD database logical volume "osd-db/osd-{}"'.format(osd_id),
            state="o",
        )
        return True
    except Exception as e:
        # Log it
        logger.out(
            "Failed to create OSD database logical volume: {}".format(e),
            state="e",
        )
        return False
def add_snapshot(zkhandler, pool, volume, name):
    """
    Create an RBD snapshot of a volume and record it in Zookeeper.

    Besides writing the snapshot keys, the "snapshot_count" field of the
    volume's stored stats JSON is incremented by one.

    Returns:
        A (success, message) tuple.
    """
    if not verifyVolume(zkhandler, pool, volume):
        missing = 'ERROR: No volume with name "{}" is present in pool "{}".'
        return False, missing.format(volume, pool)

    volume_key = f"{pool}/{volume}"
    snapshot_key = f"{volume_key}/{name}"

    # Step 1: take the snapshot
    retcode, stdout, stderr = common.run_os_command(
        f"rbd snap create {volume_key}@{name}"
    )
    if retcode:
        failed = 'ERROR: Failed to create RBD snapshot "{}" of volume "{}" in pool "{}": {}'
        return (False, failed.format(name, volume, pool, stderr))

    # Step 2: register it
    zkhandler.write(
        [
            (("snapshot", snapshot_key), ""),
            (("snapshot.stats", snapshot_key), "{}"),
        ]
    )

    # Step 3: bump the snapshot counter kept in the volume stats
    stats = dict(json.loads(zkhandler.read(("volume.stats", volume_key))))
    stats["snapshot_count"] += 1
    zkhandler.write([(("volume.stats", volume_key), json.dumps(stats))])

    done = 'Created RBD snapshot "{}" of volume "{}" in pool "{}".'
    return True, done.format(name, volume, pool)
def remove_snapshot(zkhandler, pool, volume, name):
    """
    Remove an RBD snapshot of a volume and drop it from Zookeeper.

    Besides deleting the snapshot key, the "snapshot_count" field of the
    volume's stored stats JSON is decremented by one.

    Args:
        zkhandler: Zookeeper handler.
        pool: Name of the pool holding the volume.
        volume: Name of the volume the snapshot belongs to.
        name: Name of the snapshot.

    Returns:
        A (success, message) tuple; every failure message starts with
        "ERROR:".
    """
    if not verifyVolume(zkhandler, pool, volume):
        return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
            volume, pool
        )
    if not verifySnapshot(zkhandler, pool, volume, name):
        return (
            False,
            'ERROR: No snapshot with name "{}" is present of volume {} in pool {}.'.format(
                name, volume, pool
            ),
        )

    # 1. Remove the snapshot
    retcode, stdout, stderr = common.run_os_command(
        "rbd snap rm {}/{}@{}".format(pool, volume, name)
    )
    if retcode:
        # Prefixed with "ERROR:" like every other failure in this module
        return (
            False,
            'ERROR: Failed to remove RBD snapshot "{}" of volume "{}" in pool "{}": {}'.format(
                name, volume, pool, stderr
            ),
        )

    # 2. Delete snapshot from Zookeeper
    zkhandler.delete([("snapshot", f"{pool}/{volume}/{name}")])

    # 3. Update the count of snapshots on this volume
    volume_stats_raw = zkhandler.read(("volume.stats", f"{pool}/{volume}"))
    volume_stats = dict(json.loads(volume_stats_raw))
    volume_stats["snapshot_count"] = volume_stats["snapshot_count"] - 1
    volume_stats_raw = json.dumps(volume_stats)
    zkhandler.write([(("volume.stats", f"{pool}/{volume}"), volume_stats_raw)])

    return True, 'Removed RBD snapshot "{}" of volume "{}" in pool "{}".'.format(
        name, volume, pool
    )
def verify_ipmi(ipmi_hostname, ipmi_user, ipmi_password):
    """
    Check that an IPMI interface responds and reports the chassis as on.

    Runs "ipmitool ... chassis power status" with timeout=2 and returns True
    only when the command exits with status 0 and its output, stripped of
    surrounding whitespace, is exactly "Chassis Power is on".
    """
    ipmi_command = (
        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} "
        f"-U {ipmi_user} -P {ipmi_password} chassis power status"
    )
    retcode, stdout, stderr = common.run_os_command(ipmi_command, timeout=2)

    if retcode != 0:
        return False
    return stdout.strip() == "Chassis Power is on"
def remove_pool(zkhandler, name):
    """
    Remove a pool, all volumes registered under it, and its Zookeeper keys.

    The results of the individual remove_volume() calls are not checked.

    Returns:
        A (success, message) tuple.
    """
    if not verifyPool(zkhandler, name):
        missing = 'ERROR: No pool with name "{}" is present in the cluster.'
        return False, missing.format(name)

    # Step 1: remove every volume listed under the pool
    for volume in zkhandler.children(("volume", name)):
        remove_volume(zkhandler, name, volume)

    # Step 2: remove the pool itself
    retcode, stdout, stderr = common.run_os_command(
        f"ceph osd pool rm {name} {name} --yes-i-really-really-mean-it"
    )
    if retcode:
        return False, 'ERROR: Failed to remove pool "{}": {}'.format(name, stderr)

    # Step 3: drop the pool's keys from Zookeeper
    zkhandler.delete([(key, name) for key in ("pool", "volume", "snapshot")])

    return True, 'Removed RBD pool "{}" and all volumes.'.format(name)
def rename_snapshot(zkhandler, pool, volume, name, new_name):
    """
    Rename an RBD snapshot and move its Zookeeper key to the new name.

    Returns:
        A (success, message) tuple.
    """
    if not verifyVolume(zkhandler, pool, volume):
        no_volume = 'ERROR: No volume with name "{}" is present in pool "{}".'
        return False, no_volume.format(volume, pool)

    if not verifySnapshot(zkhandler, pool, volume, name):
        no_snapshot = 'ERROR: No snapshot with name "{}" is present for volume "{}" in pool "{}".'
        return (False, no_snapshot.format(name, volume, pool))

    volume_path = f"{pool}/{volume}"

    # Step 1: rename the snapshot in Ceph
    retcode, stdout, stderr = common.run_os_command(
        f"rbd snap rename {volume_path}@{name} {volume_path}@{new_name}"
    )
    if retcode:
        failed = 'ERROR: Failed to rename RBD snapshot "{}" to "{}" for volume "{}" in pool "{}": {}'
        return (False, failed.format(name, new_name, volume, pool, stderr))

    # Step 2: rename the snapshot key in Zookeeper
    old_key = ("snapshot", f"{volume_path}/{name}")
    new_key = ("snapshot", f"{volume_path}/{new_name}")
    zkhandler.rename([(old_key, new_key)])

    done = 'Renamed RBD snapshot "{}" to "{}" for volume "{}" in pool "{}".'
    return (True, done.format(name, new_name, volume, pool))
def setup_sriov(logger, config):
    """
    Prepare SR-IOV physical functions listed in the configuration.

    Args:
        logger: Object with an out(message, state=...) method.
        config: Mapping with a "sriov_device" list; each entry is a mapping
            with "phy" (interface name), "vfcount" (number of VFs) and an
            optional "mtu".

    All failures are logged as warnings; nothing is raised for a device
    that cannot be configured.
    """
    logger.out("Setting up SR-IOV device support", state="i")

    # Enable unsafe interrupts for the vfio_iommu_type1 kernel module
    try:
        common.run_os_command("modprobe vfio_iommu_type1 allow_unsafe_interrupts=1")
        with open(
            "/sys/module/vfio_iommu_type1/parameters/allow_unsafe_interrupts", "w"
        ) as mfh:
            mfh.write("Y")
    except Exception:
        logger.out(
            "Failed to enable vfio_iommu_type1 kernel module; SR-IOV may fail",
            state="w",
        )

    # Loop through our SR-IOV NICs and enable the numvfs for each
    for device in config["sriov_device"]:
        logger.out(
            f'Preparing SR-IOV PF {device["phy"]} with {device["vfcount"]} VFs',
            state="i",
        )
        numvfs_path = f'/sys/class/net/{device["phy"]}/device/sriov_numvfs'

        # Reset per device: the OSError handler below reports this value, and
        # the read may fail before it is assigned (e.g. permission denied),
        # which would otherwise raise a NameError or report the count of the
        # previous device.
        current_vf_count = "unknown"
        try:
            with open(numvfs_path, "r") as vfh:
                current_vf_count = vfh.read().strip()
            with open(numvfs_path, "w") as vfh:
                vfh.write(str(device["vfcount"]))
        except FileNotFoundError:
            logger.out(
                f'Failed to open SR-IOV configuration for PF {device["phy"]}; device may not support SR-IOV',
                state="w",
            )
        except OSError:
            logger.out(
                f'Failed to set SR-IOV VF count for PF {device["phy"]} to {device["vfcount"]}; already set to {current_vf_count}',
                state="w",
            )

        if device.get("mtu", None) is not None:
            logger.out(
                f'Setting SR-IOV PF {device["phy"]} to MTU {device["mtu"]}',
                state="i",
            )
            common.run_os_command(
                f'ip link set {device["phy"]} mtu {device["mtu"]} up'
            )
def unset_osd(zkhandler, option):
    """
    Run "ceph osd unset <option>" and report the outcome.

    zkhandler is accepted but not used by this function.

    Returns:
        A (success, message) tuple.
    """
    retcode, stdout, stderr = common.run_os_command(f"ceph osd unset {option}")
    if not retcode:
        return True, 'Unset OSD property "{}".'.format(option)

    return False, 'ERROR: Failed to unset property "{}": {}'.format(option, stderr)
def watch_vf_trust(data, stat, event=""):
    """
    Watcher callback: apply a changed trust mode to the SR-IOV VF.

    `self` is not a parameter; it is resolved from the enclosing scope,
    which lies outside this view. Returns False when the watched key was
    deleted, otherwise returns None.
    """
    if event and event.type == "DELETED":
        # The key has been deleted after existing before; terminate this watcher
        # because this class instance is about to be reaped in Daemon.py
        return False

    # Data without a decode() method (e.g. None) falls back to "off"
    try:
        data = data.decode("ascii")
    except AttributeError:
        data = "off"

    # Nothing to do when the value did not change
    if data == self.trust:
        return

    self.trust = data
    self.logger.out(
        "Setting trust mode {}".format(boolToOnOff(self.trust)),
        state="i",
        prefix="SR-IOV VF {}".format(self.vf),
    )
    trust_command = "ip link set {} vf {} trust {}".format(
        self.pf, self.vfid, boolToOnOff(self.trust)
    )
    common.run_os_command(trust_command)
def watch_vf_vlan_id(data, stat, event=""):
    """
    Watcher callback: apply a changed vLAN ID to the SR-IOV VF.

    `self` is not a parameter; it is resolved from the enclosing scope,
    which lies outside this view. Returns False when the watched key was
    deleted, otherwise returns None.
    """
    if event and event.type == "DELETED":
        # The key has been deleted after existing before; terminate this watcher
        # because this class instance is about to be reaped in Daemon.py
        return False

    # Data without a decode() method (e.g. None) falls back to "0"
    try:
        data = data.decode("ascii")
    except AttributeError:
        data = "0"

    # Nothing to do when the value did not change
    if data == self.vlan_id:
        return

    self.vlan_id = data
    self.logger.out(
        "Setting vLAN ID to {}".format(self.vlan_id),
        state="i",
        prefix="SR-IOV VF {}".format(self.vf),
    )
    # The current QoS value is sent along with the new vLAN ID
    vlan_command = "ip link set {} vf {} vlan {} qos {}".format(
        self.pf, self.vfid, self.vlan_id, self.vlan_qos
    )
    common.run_os_command(vlan_command)
def out_osd(zkhandler, osd_id):
    """
    Mark an OSD as out via "ceph osd out <osd_id>".

    Returns:
        A (success, message) tuple.
    """
    if not verifyOSD(zkhandler, osd_id):
        missing = 'ERROR: No OSD with ID "{}" is present in the cluster.'
        return False, missing.format(osd_id)

    retcode, stdout, stderr = common.run_os_command(f"ceph osd out {osd_id}")
    if not retcode:
        return True, "Set OSD {} offline.".format(osd_id)

    return False, "ERROR: Failed to disable OSD {}: {}".format(osd_id, stderr)
def remove_volume(zkhandler, pool, name):
    """
    Remove an RBD volume, its snapshots, and its Zookeeper keys.

    The results of the individual remove_snapshot() calls are not checked.

    Returns:
        A (success, message) tuple.
    """
    if not verifyVolume(zkhandler, pool, name):
        missing = 'ERROR: No volume with name "{}" is present in pool "{}".'
        return False, missing.format(name, pool)

    volume_key = f"{pool}/{name}"

    # Step 1: remove every snapshot listed under the volume
    for snapshot in zkhandler.children(("snapshot", volume_key)):
        remove_snapshot(zkhandler, pool, name, snapshot)

    # Step 2: remove the image
    retcode, stdout, stderr = common.run_os_command(f"rbd rm {volume_key}")
    if retcode:
        failed = 'ERROR: Failed to remove RBD volume "{}" in pool "{}": {}'
        return False, failed.format(name, pool, stderr)

    # Step 3: drop the volume's keys from Zookeeper
    zkhandler.delete([(key, volume_key) for key in ("volume", "snapshot")])

    return True, 'Removed RBD volume "{}" in pool "{}".'.format(name, pool)
def unmap_volume(zkhandler, pool, name):
    """
    Unmap an RBD volume from its /dev/rbd/<pool>/<name> device path.

    Returns:
        A (success, message) tuple. Fails when the volume is unknown, when
        the device path does not exist, or when "rbd unmap" fails.
    """
    if not verifyVolume(zkhandler, pool, name):
        missing = 'ERROR: No volume with name "{}" is present in pool "{}".'
        return False, missing.format(name, pool)

    mapped_volume = f"/dev/rbd/{pool}/{name}"

    # Step 1: the device node must exist for the volume to be mapped here
    if not os.path.exists(mapped_volume):
        not_found = 'ERROR: Mapped volume not found at expected location "{}".'
        return (False, not_found.format(mapped_volume))

    # Step 2: unmap the volume
    retcode, stdout, stderr = common.run_os_command(f"rbd unmap {mapped_volume}")
    if retcode:
        failed = 'ERROR: Failed to unmap RBD volume at "{}": {}'
        return False, failed.format(mapped_volume, stderr)

    return True, 'Unmapped RBD volume at "{}".'.format(mapped_volume)
def map_volume(zkhandler, pool, name):
    """
    Map an RBD volume onto the local system.

    Returns:
        (True, device_path) on success, where device_path is
        /dev/rbd/<pool>/<name>; (False, message) on failure.
    """
    if not verifyVolume(zkhandler, pool, name):
        missing = 'ERROR: No volume with name "{}" is present in pool "{}".'
        return False, missing.format(name, pool)

    # Step 1: map the volume
    retcode, stdout, stderr = common.run_os_command(f"rbd map {pool}/{name}")
    if retcode:
        failed = 'ERROR: Failed to map RBD volume "{}" in pool "{}": {}'
        return False, failed.format(name, pool, stderr)

    # Step 2: the device node is expected at this path
    mapped_volume = f"/dev/rbd/{pool}/{name}"

    # Step 3: make sure it actually appeared
    if os.path.exists(mapped_volume):
        return True, mapped_volume

    not_found = 'ERROR: Mapped volume not found at expected location "{}".'
    return (False, not_found.format(mapped_volume))
def ceph_volume_upload(zkhandler, pool, volume, img_type):
    """
    Upload an image file via HTTP post to a PVC Ceph volume

    A "raw" image is streamed straight into the mapped target volume. Any
    other accepted format is streamed into a temporary "<volume>_tmp"
    volume of the same size and then converted onto the target with
    "qemu-img convert".

    Returns:
        An ({"message": ...}, http_status) tuple; 200 on success, 400 for
        every failure.
    """
    # Reject unknown image formats before touching anything
    if img_type not in ["raw", "vmdk", "qcow2", "qed", "vdi", "vpc"]:
        return {"message": "Image type '{}' is not valid.".format(img_type)}, 400

    # Look up the target volume; its size is needed for the temporary volume
    retcode, retdata = pvc_ceph.get_list_volume(
        zkhandler, pool, volume, is_fuzzy=False
    )
    if not retcode or len(retdata) < 1:
        not_found = "Target volume '{}' does not exist in pool '{}'."
        return {"message": not_found.format(volume, pool)}, 400
    dev_size = retdata[0]["stats"]["size"]

    tmp_volume = "{}_tmp".format(volume)

    def cleanup_maps_and_volumes():
        # Unmap the target blockdev
        pvc_ceph.unmap_volume(zkhandler, pool, volume)
        # Unmap the temporary blockdev
        pvc_ceph.unmap_volume(zkhandler, pool, tmp_volume)
        # Remove the temporary blockdev
        pvc_ceph.remove_volume(zkhandler, pool, tmp_volume)

    def finish(message, status):
        # Every exit past this point cleans up first, whatever the outcome
        cleanup_maps_and_volumes()
        return {"message": message}, status

    def receive_upload(blockdev):
        # This sets up a custom stream_factory that writes directly into the
        # given blockdev, rather than the standard stream_factory which writes
        # to a temporary file waiting on a save() call. This will break if the
        # API ever uploaded multiple files, but this is an acceptable
        # workaround.
        def image_stream_factory(
            total_content_length, filename, content_type, content_length=None
        ):
            return open(blockdev, "wb")

        try:
            parse_form_data(
                flask.request.environ, stream_factory=image_stream_factory
            )
        except Exception:
            return False
        return True

    upload_failed = "Failed to upload or write image file to temporary volume."

    # Raw images are written directly to the target blockdev
    if img_type == "raw":
        # Map the target blockdev
        retflag, retdata = pvc_ceph.map_volume(zkhandler, pool, volume)
        if not retflag:
            return finish(retdata.replace('"', "'"), 400)
        dest_blockdev = retdata

        # Save the data to the blockdev directly
        if not receive_upload(dest_blockdev):
            return finish(upload_failed, 400)

        wrote = "Wrote uploaded file to volume '{}' in pool '{}'."
        return finish(wrote.format(volume, pool), 200)

    # Non-raw images go through a temporary blockdev and a conversion step

    # Create a temporary blockdev
    retflag, retdata = pvc_ceph.add_volume(zkhandler, pool, tmp_volume, dev_size)
    if not retflag:
        return finish(retdata.replace('"', "'"), 400)

    # Map the temporary target blockdev
    retflag, retdata = pvc_ceph.map_volume(zkhandler, pool, tmp_volume)
    if not retflag:
        return finish(retdata.replace('"', "'"), 400)
    temp_blockdev = retdata

    # Map the target blockdev
    retflag, retdata = pvc_ceph.map_volume(zkhandler, pool, volume)
    if not retflag:
        return finish(retdata.replace('"', "'"), 400)
    dest_blockdev = retdata

    # Save the data to the temporary blockdev directly
    if not receive_upload(temp_blockdev):
        return finish(upload_failed, 400)

    # Convert from the temporary to destination format on the blockdevs
    retcode, stdout, stderr = pvc_common.run_os_command(
        "qemu-img convert -C -f {} -O raw {} {}".format(
            img_type, temp_blockdev, dest_blockdev
        )
    )
    if retcode:
        convert_failed = "Failed to convert image format from '{}' to 'raw': {}"
        return finish(convert_failed.format(img_type, stderr), 400)

    converted = "Converted and wrote uploaded file to volume '{}' in pool '{}'."
    return finish(converted.format(volume, pool), 200)
def setup_interfaces(logger, config):
    """
    Bring up the Cluster, Storage and Upstream interfaces and their bridges.

    Args:
        logger: Object with an out(message, state=...) method.
        config: Mapping providing, for each of "cluster", "storage" and
            "upstream", the keys "<name>_dev", "<name>_mtu" and
            "<name>_dev_ip", plus "upstream_gateway" (may be None).

    When Storage or Upstream share the Cluster device, no separate bridge is
    created; their address is added to brcluster instead (only if it differs
    from the Cluster address). Afterwards a set of sysctl tweaks is applied,
    dnsmasq.service is stopped and the function sleeps for 3 seconds. The
    results of the individual commands are not inspected.
    """
    # Set up the Cluster interface
    cluster_dev = config["cluster_dev"]
    cluster_mtu = config["cluster_mtu"]
    cluster_dev_ip = config["cluster_dev_ip"]

    logger.out(
        f"Setting up Cluster network interface {cluster_dev} with MTU {cluster_mtu}",
        state="i",
    )
    common.run_os_command(f"ip link set {cluster_dev} mtu {cluster_mtu} up")

    logger.out(
        f"Setting up Cluster network bridge on interface {cluster_dev} with IP {cluster_dev_ip}",
        state="i",
    )
    common.run_os_command("brctl addbr brcluster")
    common.run_os_command(f"brctl addif brcluster {cluster_dev}")
    common.run_os_command(f"ip link set brcluster mtu {cluster_mtu} up")
    common.run_os_command(f"ip address add {cluster_dev_ip} dev brcluster")

    # Set up the Storage interface
    storage_dev = config["storage_dev"]
    storage_mtu = config["storage_mtu"]
    storage_dev_ip = config["storage_dev_ip"]

    logger.out(
        f"Setting up Storage network interface {storage_dev} with MTU {storage_mtu}",
        state="i",
    )
    common.run_os_command(f"ip link set {storage_dev} mtu {storage_mtu} up")

    if storage_dev == cluster_dev:
        if storage_dev_ip != cluster_dev_ip:
            logger.out(
                f"Setting up Storage network on Cluster network bridge with IP {storage_dev_ip}",
                state="i",
            )
            common.run_os_command(f"ip address add {storage_dev_ip} dev brcluster")
    else:
        logger.out(
            f"Setting up Storage network bridge on interface {storage_dev} with IP {storage_dev_ip}",
            state="i",
        )
        common.run_os_command("brctl addbr brstorage")
        common.run_os_command(f"brctl addif brstorage {storage_dev}")
        common.run_os_command(f"ip link set brstorage mtu {storage_mtu} up")
        common.run_os_command(f"ip address add {storage_dev_ip} dev brstorage")

    # Set up the Upstream interface
    upstream_dev = config["upstream_dev"]
    upstream_mtu = config["upstream_mtu"]
    upstream_dev_ip = config["upstream_dev_ip"]

    logger.out(
        f"Setting up Upstream network interface {upstream_dev} with MTU {upstream_mtu}",
        state="i",
    )
    # Apply the MTU and bring the device up, as announced in the log line
    # above and as done for the Cluster and Storage devices.
    common.run_os_command(f"ip link set {upstream_dev} mtu {upstream_mtu} up")

    if upstream_dev == cluster_dev:
        if upstream_dev_ip != cluster_dev_ip:
            logger.out(
                f"Setting up Upstream network on Cluster network bridge with IP {upstream_dev_ip}",
                state="i",
            )
            common.run_os_command(f"ip address add {upstream_dev_ip} dev brcluster")
    else:
        logger.out(
            f"Setting up Upstream network bridge on interface {upstream_dev} with IP {upstream_dev_ip}",
            state="i",
        )
        common.run_os_command("brctl addbr brupstream")
        common.run_os_command(f"brctl addif brupstream {upstream_dev}")
        common.run_os_command(f"ip link set brupstream mtu {upstream_mtu} up")
        common.run_os_command(f"ip address add {upstream_dev_ip} dev brupstream")

    upstream_gateway = config["upstream_gateway"]
    if upstream_gateway is not None:
        logger.out(
            f"Setting up Upstream network default gateway IP {upstream_gateway}",
            state="i",
        )
        # The default route goes via whichever bridge carries the Upstream IP
        if upstream_dev == cluster_dev:
            common.run_os_command(
                f"ip route add default via {upstream_gateway} dev brcluster"
            )
        else:
            common.run_os_command(
                f"ip route add default via {upstream_gateway} dev brupstream"
            )

    # Set up sysctl tweaks to optimize networking
    # NOTE(review): several of the IPv6 keys below (net.ipv6.ip_forward,
    # net.ipv6.conf.*.send_redirects, net.ipv6.conf.*.rp_filter) do not
    # appear to be listed in the kernel's ip-sysctl documentation; IPv6
    # forwarding is normally net.ipv6.conf.all.forwarding. They are kept
    # unchanged here - confirm whether they have any effect.
    # Enable routing functions
    common.run_os_command("sysctl net.ipv4.ip_forward=1")
    common.run_os_command("sysctl net.ipv6.ip_forward=1")
    # Enable send redirects
    common.run_os_command("sysctl net.ipv4.conf.all.send_redirects=1")
    common.run_os_command("sysctl net.ipv4.conf.default.send_redirects=1")
    common.run_os_command("sysctl net.ipv6.conf.all.send_redirects=1")
    common.run_os_command("sysctl net.ipv6.conf.default.send_redirects=1")
    # Accept source routes
    common.run_os_command("sysctl net.ipv4.conf.all.accept_source_route=1")
    common.run_os_command("sysctl net.ipv4.conf.default.accept_source_route=1")
    common.run_os_command("sysctl net.ipv6.conf.all.accept_source_route=1")
    common.run_os_command("sysctl net.ipv6.conf.default.accept_source_route=1")
    # Disable RP filtering on Cluster and Upstream interfaces (to allow traffic pivoting)
    # (the brupstream keys are set even when no brupstream bridge was created)
    common.run_os_command(f"sysctl net.ipv4.conf.{cluster_dev}.rp_filter=0")
    common.run_os_command("sysctl net.ipv4.conf.brcluster.rp_filter=0")
    common.run_os_command(f"sysctl net.ipv4.conf.{upstream_dev}.rp_filter=0")
    common.run_os_command("sysctl net.ipv4.conf.brupstream.rp_filter=0")
    common.run_os_command(f"sysctl net.ipv6.conf.{cluster_dev}.rp_filter=0")
    common.run_os_command("sysctl net.ipv6.conf.brcluster.rp_filter=0")
    common.run_os_command(f"sysctl net.ipv6.conf.{upstream_dev}.rp_filter=0")
    common.run_os_command("sysctl net.ipv6.conf.brupstream.rp_filter=0")

    # Stop DNSMasq if it is running
    common.run_os_command("systemctl stop dnsmasq.service")

    logger.out("Waiting 3 seconds for networking to come up", state="s")
    sleep(3)
def become_secondary(self):
    """
    Relinquish primary coordinator status to a peer node

    The hand-over is split into phases A through G. Each phase takes a read
    or write lock on the "base.config.primary_node.sync_lock" key, performs
    its share of the teardown, and releases the lock again:

      A (reader): synchronization only
      B (writer): stop the DNS aggregator and the DHCP servers
      between B and C: stop the client API (if enabled) and the metadata API
      C (reader): remove the Upstream floating IP
      D (reader): remove the Cluster and Storage floating IPs
      E (reader): remove the Metadata link-local IP
      F (reader): remove the gateway IPs of every network
      G (reader): final synchronization, with a 60 second acquire timeout

    Finally the node's router state is written as "secondary".

    NOTE(review): the short sleeps are described in their comments as giving
    the other side time to take the lock; presumably the peer is running
    become_primary() with the complementary lock type per phase - confirm
    before changing any ordering or timing here.
    """
    time.sleep(0.2)  # Initial delay for the first writer to grab the lock

    # Synchronize nodes A (I am reader)
    lock = self.zkhandler.readlock("base.config.primary_node.sync_lock")
    self.logger.out("Acquiring read lock for synchronization phase A", state="i")
    lock.acquire()
    self.logger.out("Acquired read lock for synchronization phase A", state="o")
    self.logger.out("Releasing read lock for synchronization phase A", state="i")
    lock.release()
    self.logger.out("Released read lock for synchronization phase A", state="o")

    # Synchronize nodes B (I am writer)
    lock = self.zkhandler.writelock("base.config.primary_node.sync_lock")
    self.logger.out("Acquiring write lock for synchronization phase B", state="i")
    lock.acquire()
    self.logger.out("Acquired write lock for synchronization phase B", state="o")
    time.sleep(0.2)  # Time for reader to acquire the lock
    # 1. Stop DNS aggregator
    self.dns_aggregator.stop_aggregator()
    # 2. Stop DHCP servers
    for network in self.d_network:
        self.d_network[network].stopDHCPServer()
    self.logger.out("Releasing write lock for synchronization phase B", state="i")
    # The lock key is rewritten with an empty value just before each write
    # lock release
    self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
    lock.release()
    self.logger.out("Released write lock for synchronization phase B", state="o")
    # 3. Stop client API
    if self.config["enable_api"]:
        self.logger.out("Stopping PVC API client service", state="i")
        common.run_os_command("systemctl stop pvcapid.service")
        common.run_os_command("systemctl disable pvcapid.service")
    # 4. Stop metadata API
    self.metadata_api.stop()
    time.sleep(0.1)  # Time for new writer to acquire the lock

    # Synchronize nodes C (I am reader)
    lock = self.zkhandler.readlock("base.config.primary_node.sync_lock")
    self.logger.out("Acquiring read lock for synchronization phase C", state="i")
    lock.acquire()
    self.logger.out("Acquired read lock for synchronization phase C", state="o")
    # 5. Remove Upstream floating IP
    self.logger.out(
        "Removing floating upstream IP {}/{} from interface {}".format(
            self.upstream_floatingipaddr, self.upstream_cidrnetmask, "brupstream"
        ),
        state="o",
    )
    common.removeIPAddress(
        self.upstream_floatingipaddr, self.upstream_cidrnetmask, "brupstream"
    )
    self.logger.out("Releasing read lock for synchronization phase C", state="i")
    lock.release()
    self.logger.out("Released read lock for synchronization phase C", state="o")

    # Synchronize nodes D (I am reader)
    lock = self.zkhandler.readlock("base.config.primary_node.sync_lock")
    self.logger.out("Acquiring read lock for synchronization phase D", state="i")
    lock.acquire()
    self.logger.out("Acquired read lock for synchronization phase D", state="o")
    # 6. Remove Cluster & Storage floating IP
    self.logger.out(
        "Removing floating management IP {}/{} from interface {}".format(
            self.cluster_floatingipaddr, self.cluster_cidrnetmask, "brcluster"
        ),
        state="o",
    )
    common.removeIPAddress(
        self.cluster_floatingipaddr, self.cluster_cidrnetmask, "brcluster"
    )
    self.logger.out(
        "Removing floating storage IP {}/{} from interface {}".format(
            self.storage_floatingipaddr, self.storage_cidrnetmask, "brstorage"
        ),
        state="o",
    )
    common.removeIPAddress(
        self.storage_floatingipaddr, self.storage_cidrnetmask, "brstorage"
    )
    self.logger.out("Releasing read lock for synchronization phase D", state="i")
    lock.release()
    self.logger.out("Released read lock for synchronization phase D", state="o")

    # Synchronize nodes E (I am reader)
    lock = self.zkhandler.readlock("base.config.primary_node.sync_lock")
    self.logger.out("Acquiring read lock for synchronization phase E", state="i")
    lock.acquire()
    self.logger.out("Acquired read lock for synchronization phase E", state="o")
    # 7. Remove Metadata link-local IP
    self.logger.out(
        "Removing Metadata link-local IP {}/{} from interface {}".format(
            "169.254.169.254", "32", "lo"
        ),
        state="o",
    )
    common.removeIPAddress("169.254.169.254", "32", "lo")
    self.logger.out("Releasing read lock for synchronization phase E", state="i")
    lock.release()
    self.logger.out("Released read lock for synchronization phase E", state="o")

    # Synchronize nodes F (I am reader)
    lock = self.zkhandler.readlock("base.config.primary_node.sync_lock")
    self.logger.out("Acquiring read lock for synchronization phase F", state="i")
    lock.acquire()
    self.logger.out("Acquired read lock for synchronization phase F", state="o")
    # 8. Remove gateway IPs
    for network in self.d_network:
        self.d_network[network].removeGateways()
    self.logger.out("Releasing read lock for synchronization phase F", state="i")
    lock.release()
    self.logger.out("Released read lock for synchronization phase F", state="o")

    # Synchronize nodes G (I am reader)
    lock = self.zkhandler.readlock("base.config.primary_node.sync_lock")
    self.logger.out("Acquiring read lock for synchronization phase G", state="i")
    try:
        lock.acquire(timeout=60)  # Don't wait forever and completely block us
        self.logger.out("Acquired read lock for synchronization phase G", state="o")
    except Exception:
        # Any failure to acquire (including the timeout) is deliberately
        # ignored so that the transition still completes.
        pass
    # NOTE(review): release() is called even when the acquire above failed -
    # confirm the lock implementation tolerates that.
    self.logger.out("Releasing read lock for synchronization phase G", state="i")
    lock.release()
    self.logger.out("Released read lock for synchronization phase G", state="o")

    # Wait 2 seconds for everything to stabilize before we declare all-done
    time.sleep(2)
    self.zkhandler.write([(("node.state.router", self.name), "secondary")])
    self.logger.out(
        "Node {} transitioned to secondary state".format(self.name), state="o"
    )
def become_primary(self):
    """
    Acquire primary coordinator status from a peer node

    Takes an exclusive lock on "base.config.primary_node", then walks through
    synchronization phases A-G using read/write locks on
    "base.config.primary_node.sync_lock". This node is the writer in every
    phase except B. The work done while holding each write lock is:

      C: add the upstream floating IP on "brupstream"
      D: add the cluster and storage floating IPs on "brcluster"/"brstorage"
      E: add the metadata link-local IP (169.254.169.254/32) on "lo"
      F: create the gateways of every network in self.d_network
      G: switch the Patroni leader to this node, start the API services (if
         enabled), the metadata API, the DHCP servers and, if the Patroni
         switch did not fail, the DNS aggregator

    Finally the primary lock is released and this node's "node.state.router"
    key is set to "primary".

    NOTE(review): none of the locks are released if a step raises; presumably
    the caller relies on the Zookeeper session ending to clear them - confirm.
    """
    # Lock the primary node until transition is complete
    primary_lock = self.zkhandler.exclusivelock("base.config.primary_node")
    primary_lock.acquire()

    # Ensure our lock key is populated
    self.zkhandler.write([("base.config.primary_node.sync_lock", "")])

    # Synchronize nodes A (I am writer)
    lock = self.zkhandler.writelock("base.config.primary_node.sync_lock")
    self.logger.out("Acquiring write lock for synchronization phase A", state="i")
    lock.acquire()
    self.logger.out("Acquired write lock for synchronization phase A", state="o")
    time.sleep(1)  # Time for reader to acquire the lock
    self.logger.out("Releasing write lock for synchronization phase A", state="i")
    # The sync lock key is rewritten (with an empty value) before every write
    # lock release in this method
    self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
    lock.release()
    self.logger.out("Released write lock for synchronization phase A", state="o")
    time.sleep(0.1)  # Time for new writer to acquire the lock

    # Synchronize nodes B (I am reader)
    # Nothing is done while holding this read lock; it is acquired and
    # immediately released
    lock = self.zkhandler.readlock("base.config.primary_node.sync_lock")
    self.logger.out("Acquiring read lock for synchronization phase B", state="i")
    lock.acquire()
    self.logger.out("Acquired read lock for synchronization phase B", state="o")
    self.logger.out("Releasing read lock for synchronization phase B", state="i")
    lock.release()
    self.logger.out("Released read lock for synchronization phase B", state="o")

    # Synchronize nodes C (I am writer)
    lock = self.zkhandler.writelock("base.config.primary_node.sync_lock")
    self.logger.out("Acquiring write lock for synchronization phase C", state="i")
    lock.acquire()
    self.logger.out("Acquired write lock for synchronization phase C", state="o")
    time.sleep(0.5)  # Time for reader to acquire the lock
    # 1. Add Upstream floating IP
    self.logger.out(
        "Creating floating upstream IP {}/{} on interface {}".format(
            self.upstream_floatingipaddr, self.upstream_cidrnetmask, "brupstream"
        ),
        state="o",
    )
    common.createIPAddress(
        self.upstream_floatingipaddr, self.upstream_cidrnetmask, "brupstream"
    )
    self.logger.out("Releasing write lock for synchronization phase C", state="i")
    self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
    lock.release()
    self.logger.out("Released write lock for synchronization phase C", state="o")

    # Synchronize nodes D (I am writer)
    lock = self.zkhandler.writelock("base.config.primary_node.sync_lock")
    self.logger.out("Acquiring write lock for synchronization phase D", state="i")
    lock.acquire()
    self.logger.out("Acquired write lock for synchronization phase D", state="o")
    time.sleep(0.2)  # Time for reader to acquire the lock
    # 2. Add Cluster & Storage floating IP
    self.logger.out(
        "Creating floating management IP {}/{} on interface {}".format(
            self.cluster_floatingipaddr, self.cluster_cidrnetmask, "brcluster"
        ),
        state="o",
    )
    common.createIPAddress(
        self.cluster_floatingipaddr, self.cluster_cidrnetmask, "brcluster"
    )
    self.logger.out(
        "Creating floating storage IP {}/{} on interface {}".format(
            self.storage_floatingipaddr, self.storage_cidrnetmask, "brstorage"
        ),
        state="o",
    )
    common.createIPAddress(
        self.storage_floatingipaddr, self.storage_cidrnetmask, "brstorage"
    )
    self.logger.out("Releasing write lock for synchronization phase D", state="i")
    self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
    lock.release()
    self.logger.out("Released write lock for synchronization phase D", state="o")

    # Synchronize nodes E (I am writer)
    lock = self.zkhandler.writelock("base.config.primary_node.sync_lock")
    self.logger.out("Acquiring write lock for synchronization phase E", state="i")
    lock.acquire()
    self.logger.out("Acquired write lock for synchronization phase E", state="o")
    time.sleep(0.2)  # Time for reader to acquire the lock
    # 3. Add Metadata link-local IP
    self.logger.out(
        "Creating Metadata link-local IP {}/{} on interface {}".format(
            "169.254.169.254", "32", "lo"
        ),
        state="o",
    )
    common.createIPAddress("169.254.169.254", "32", "lo")
    self.logger.out("Releasing write lock for synchronization phase E", state="i")
    self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
    lock.release()
    self.logger.out("Released write lock for synchronization phase E", state="o")

    # Synchronize nodes F (I am writer)
    lock = self.zkhandler.writelock("base.config.primary_node.sync_lock")
    self.logger.out("Acquiring write lock for synchronization phase F", state="i")
    lock.acquire()
    self.logger.out("Acquired write lock for synchronization phase F", state="o")
    time.sleep(0.2)  # Time for reader to acquire the lock
    # 4. Add gateway IPs
    for network in self.d_network:
        self.d_network[network].createGateways()
    self.logger.out("Releasing write lock for synchronization phase F", state="i")
    self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
    lock.release()
    self.logger.out("Released write lock for synchronization phase F", state="o")

    # Synchronize nodes G (I am writer)
    lock = self.zkhandler.writelock("base.config.primary_node.sync_lock")
    self.logger.out("Acquiring write lock for synchronization phase G", state="i")
    lock.acquire()
    self.logger.out("Acquired write lock for synchronization phase G", state="o")
    time.sleep(0.2)  # Time for reader to acquire the lock
    # 5. Transition Patroni primary
    self.logger.out("Setting Patroni leader to this node", state="i")
    tick = 1
    # Stays True unless the switchover succeeds or we already are the leader;
    # it also stays True if the loop below never runs or exits because the
    # router state left "takeover"
    patroni_failed = True
    # As long as we're in takeover, keep trying to set the Patroni leader to us
    while self.router_state == "takeover":
        # Switch Patroni leader to the local instance
        retcode, stdout, stderr = common.run_os_command(
            """ patronictl -c /etc/patroni/config.yml switchover --candidate {} --force pvc """.format(
                self.name
            )
        )

        # Combine the stdout and stderr and strip the output
        # Patronictl's output is pretty junky
        if stderr:
            stdout += stderr
        stdout = stdout.strip()

        # Handle our current Patroni leader being us
        # (only the last line of the combined output is inspected)
        if stdout and stdout.split("\n")[-1].split() == [
            "Error:",
            "Switchover",
            "target",
            "and",
            "source",
            "are",
            "the",
            "same.",
        ]:
            self.logger.out(
                "Failed to switch Patroni leader to ourselves; this is fine\n{}".format(
                    stdout
                ),
                state="w",
            )
            patroni_failed = False
            break
        # Handle a failed switchover
        # NOTE(review): the second test compares the first word against "Error"
        # without a colon, unlike the "Error:" token matched above - confirm
        # this matches real patronictl output
        elif stdout and (
            stdout.split("\n")[-1].split()[:2] == ["Switchover", "failed,"]
            or stdout.strip().split("\n")[-1].split()[:1] == ["Error"]
        ):
            # Give up once the fifth attempt has failed
            if tick > 4:
                self.logger.out(
                    "Failed to switch Patroni leader after 5 tries; aborting",
                    state="e",
                )
                break
            else:
                self.logger.out(
                    "Failed to switch Patroni leader; retrying [{}/5]\n{}\n".format(
                        tick, stdout
                    ),
                    state="e",
                )
                tick += 1
                time.sleep(5)
        # Otherwise, we succeeded
        else:
            self.logger.out(
                "Successfully switched Patroni leader\n{}".format(stdout), state="o"
            )
            patroni_failed = False
            time.sleep(0.2)
            break
    # 6. Start client API (and provisioner worker)
    if self.config["enable_api"]:
        self.logger.out("Starting PVC API client service", state="i")
        common.run_os_command("systemctl enable pvcapid.service")
        common.run_os_command("systemctl start pvcapid.service")
        self.logger.out("Starting PVC Provisioner Worker service", state="i")
        common.run_os_command("systemctl start pvcapid-worker.service")
    # 7. Start metadata API; just continue if we fail
    self.metadata_api.start()
    # 8. Start DHCP servers
    for network in self.d_network:
        self.d_network[network].startDHCPServer()
    # 9. Start DNS aggregator; just continue if we fail
    if not patroni_failed:
        self.dns_aggregator.start_aggregator()
    else:
        self.logger.out(
            "Not starting DNS aggregator due to Patroni failures", state="e"
        )
    self.logger.out("Releasing write lock for synchronization phase G", state="i")
    self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
    lock.release()
    self.logger.out("Released write lock for synchronization phase G", state="o")

    # Wait 2 seconds for everything to stabilize before we declare all-done
    time.sleep(2)
    primary_lock.release()
    self.zkhandler.write([(("node.state.router", self.name), "primary")])
    self.logger.out(
        "Node {} transitioned to primary state".format(self.name), state="o"
    )
def entrypoint():
    """
    Main entry point of the PVC node daemon.

    Loads and validates the configuration, sets up logging, networking and
    system services, connects to Zookeeper, registers this node, installs the
    signal handlers and all Zookeeper watchers (schema, nodes, maintenance,
    primary node, networks, SR-IOV VFs, domains, Ceph OSDs/pools/volumes),
    starts the keepalive timer and then idles in a tick loop.

    The function does not return during normal operation: termination goes
    through the nested cleanup(), which ends the process with os._exit(), and
    a schema hot-update re-executes the daemon with os.execv().
    """
    keepalive_timer = None

    # Get our configuration
    config = pvcnoded.util.config.get_configuration()
    config["pvcnoded_version"] = version

    # Set some useful booleans for later (fewer characters)
    debug = config["debug"]
    if debug:
        print("DEBUG MODE ENABLED")

    # Create and validate our directories
    pvcnoded.util.config.validate_directories(config)

    # Set up the logger instance
    logger = log.Logger(config)

    # Print our startup message
    logger.out("")
    logger.out("|----------------------------------------------------------|")
    logger.out("| |")
    logger.out("| ███████████ ▜█▙ ▟█▛ █████ █ █ █ |")
    logger.out("| ██ ▜█▙ ▟█▛ ██ |")
    logger.out("| ███████████ ▜█▙ ▟█▛ ██ |")
    logger.out("| ██ ▜█▙▟█▛ ███████████ |")
    logger.out("| |")
    logger.out("|----------------------------------------------------------|")
    logger.out(
        "| Parallel Virtual Cluster node daemon v{0: <18} |".format(version))
    logger.out("| Debug: {0: <49} |".format(str(config["debug"])))
    logger.out("| FQDN: {0: <50} |".format(config["node_fqdn"]))
    logger.out("| Host: {0: <50} |".format(config["node_hostname"]))
    logger.out("| ID: {0: <52} |".format(config["node_id"]))
    logger.out("| IPMI hostname: {0: <41} |".format(config["ipmi_hostname"]))
    logger.out("| Machine details: |")
    logger.out("| CPUs: {0: <48} |".format(config["static_data"][0]))
    logger.out("| Arch: {0: <48} |".format(config["static_data"][3]))
    logger.out("| OS: {0: <50} |".format(config["static_data"][2]))
    logger.out("| Kernel: {0: <46} |".format(config["static_data"][1]))
    logger.out("|----------------------------------------------------------|")
    logger.out("")
    logger.out(f'Starting pvcnoded on host {config["node_fqdn"]}', state="s")

    if config["enable_networking"]:
        if config["enable_sriov"]:
            # Set up SR-IOV devices
            pvcnoded.util.networking.setup_sriov(logger, config)

        # Set up our interfaces
        pvcnoded.util.networking.setup_interfaces(logger, config)

    # Get list of coordinator nodes
    coordinator_nodes = config["coordinators"]

    if config["node_hostname"] in coordinator_nodes:
        # We are indeed a coordinator node
        config["daemon_mode"] = "coordinator"
        logger.out(
            f"This node is a {logger.fmt_blue}coordinator{logger.fmt_end}",
            state="i")
    else:
        # We are a hypervisor node
        config["daemon_mode"] = "hypervisor"
        logger.out(
            f"This node is a {logger.fmt_cyan}hypervisor{logger.fmt_end}",
            state="i")

    pvcnoded.util.services.start_system_services(logger, config)

    # Connect to Zookeeper and return our handler and current schema version
    zkhandler, node_schema_version = pvcnoded.util.zookeeper.connect(
        logger, config)

    # Watch for a global schema update and fire
    # This will only change by the API when triggered after seeing all nodes can update
    @zkhandler.zk_conn.DataWatch(zkhandler.schema.path("base.schema.version"))
    def update_schema(new_schema_version, stat, event=""):
        """Hot-update this node to a new schema version, then re-exec the daemon."""
        nonlocal zkhandler, keepalive_timer, node_schema_version

        # A missing or undecodable value is treated as version 0
        try:
            new_schema_version = int(new_schema_version.decode("ascii"))
        except Exception:
            new_schema_version = 0

        if new_schema_version == node_schema_version:
            return True

        logger.out("Hot update of schema version started", state="s")
        # Fixed: the original interpolated "{node_schema_version,}", which
        # formats a one-element tuple such as "(5,)" instead of the number
        logger.out(
            f"Current version: {node_schema_version} New version: {new_schema_version}",
            state="s",
        )

        # Prevent any keepalive updates while this happens
        if keepalive_timer is not None:
            pvcnoded.util.keepalive.stop_keepalive_timer(
                logger, keepalive_timer)
            sleep(1)

        # Perform the migration (primary only)
        if zkhandler.read(
                "base.config.primary_node") == config["node_hostname"]:
            logger.out("Primary node acquiring exclusive lock", state="s")
            # Wait for things to settle
            sleep(0.5)
            # Acquire a write lock on the root key
            with zkhandler.exclusivelock("base.schema.version"):
                # Perform the schema migration tasks
                logger.out("Performing schema update", state="s")
                if new_schema_version > node_schema_version:
                    zkhandler.schema.migrate(zkhandler, new_schema_version)
                if new_schema_version < node_schema_version:
                    zkhandler.schema.rollback(zkhandler, new_schema_version)
        # Wait for the exclusive lock to be lifted
        else:
            logger.out("Non-primary node acquiring read lock", state="s")
            # Wait for things to settle
            sleep(1)
            # Wait for a read lock
            lock = zkhandler.readlock("base.schema.version")
            lock.acquire()
            # Wait a bit more for the primary to return to normal
            sleep(1)

        # Update the local schema version
        logger.out("Updating node target schema version", state="s")
        zkhandler.write([(("node.data.active_schema",
                           config["node_hostname"]), new_schema_version)])
        node_schema_version = new_schema_version

        # Restart the API daemons if applicable
        logger.out("Restarting services", state="s")
        common.run_os_command("systemctl restart pvcapid-worker.service")
        if zkhandler.read(
                "base.config.primary_node") == config["node_hostname"]:
            common.run_os_command("systemctl restart pvcapid.service")

        # Restart ourselves with the new schema
        logger.out("Reloading node daemon", state="s")
        try:
            zkhandler.disconnect(persistent=True)
            del zkhandler
        except Exception:
            pass
        os.execv(sys.argv[0], sys.argv)

    # Validate the schema
    pvcnoded.util.zookeeper.validate_schema(logger, zkhandler)

    # Define a cleanup function
    def cleanup(failure=False):
        """Shut the daemon down best-effort and exit with 1 on failure, else 0."""
        nonlocal logger, zkhandler, keepalive_timer, d_domain

        logger.out("Terminating pvcnoded and cleaning up", state="s")

        # Set shutdown state in Zookeeper
        zkhandler.write([(("node.state.daemon", config["node_hostname"]),
                          "shutdown")])

        # Waiting for any flushes to complete
        logger.out("Waiting for any active flushes", state="s")
        try:
            if this_node is not None:
                while this_node.flush_thread is not None:
                    sleep(0.5)
        except Exception:
            # We really don't care here, just proceed
            # (this also covers cleanup() running before this_node exists)
            pass

        # Stop console logging on all VMs
        logger.out("Stopping domain console watchers", state="s")
        try:
            if d_domain is not None:
                for domain in d_domain:
                    if d_domain[domain].getnode() == config["node_hostname"]:
                        d_domain[domain].console_log_instance.stop()
        except Exception:
            pass

        # Force into secondary coordinator state if needed
        try:
            if this_node.router_state == "primary" and len(d_node) > 1:
                zkhandler.write([("base.config.primary_node", "none")])
                logger.out("Waiting for primary migration", state="s")
                # 240 polls of 0.5s each
                timeout = 240
                count = 0
                while this_node.router_state != "secondary" and count < timeout:
                    sleep(0.5)
                    count += 1
        except Exception:
            pass

        # Stop keepalive thread
        try:
            pvcnoded.util.keepalive.stop_keepalive_timer(
                logger, keepalive_timer)
            logger.out("Performing final keepalive update", state="s")
            pvcnoded.util.keepalive.node_keepalive(logger, config, zkhandler,
                                                   this_node)
        except Exception:
            pass

        # Set stop state in Zookeeper
        zkhandler.write([(("node.state.daemon", config["node_hostname"]),
                          "stop")])

        # Forcibly terminate dnsmasq because it gets stuck sometimes
        common.run_os_command("killall dnsmasq")

        # Close the Zookeeper connection
        try:
            zkhandler.disconnect(persistent=True)
            del zkhandler
        except Exception:
            pass

        logger.out("Terminated pvc daemon", state="s")
        logger.terminate()

        if failure:
            retcode = 1
        else:
            retcode = 0

        os._exit(retcode)

    # Termination function
    def term(signum="", frame=""):
        cleanup(failure=False)

    # Hangup (logrotate) function
    def hup(signum="", frame=""):
        if config["file_logging"]:
            logger.hup()

    # Handle signals gracefully
    signal.signal(signal.SIGTERM, term)
    signal.signal(signal.SIGINT, term)
    signal.signal(signal.SIGQUIT, term)
    signal.signal(signal.SIGHUP, hup)

    # Set up this node in Zookeeper
    pvcnoded.util.zookeeper.setup_node(logger, config, zkhandler)

    # Check that the primary node key exists and create it with us as primary if not
    try:
        current_primary = zkhandler.read("base.config.primary_node")
    except Exception:
        current_primary = "none"

    if current_primary and current_primary != "none":
        logger.out(
            f"Current primary node is {logger.fmt_blue}{current_primary}{logger.fmt_end}",
            state="i",
        )
    else:
        if config["daemon_mode"] == "coordinator":
            logger.out("No primary node found; setting us as primary",
                       state="i")
            zkhandler.write([("base.config.primary_node",
                              config["node_hostname"])])

    # Ensure that IPMI is reachable and working
    if not pvcnoded.util.fencing.verify_ipmi(config["ipmi_hostname"],
                                             config["ipmi_username"],
                                             config["ipmi_password"]):
        logger.out(
            "Our IPMI is not reachable; fencing of this node will likely fail",
            state="w",
        )

    # Validate libvirt
    if not pvcnoded.util.libvirt.validate_libvirtd(logger, config):
        cleanup(failure=True)

    # Set up NFT
    pvcnoded.util.networking.create_nft_configuration(logger, config)

    # Create our object dictionaries
    logger.out("Setting up objects", state="s")

    d_node = dict()
    node_list = list()
    d_network = dict()
    network_list = list()
    sriov_pf_list = list()
    d_sriov_vf = dict()
    sriov_vf_list = list()
    d_domain = dict()
    domain_list = list()
    d_osd = dict()
    osd_list = list()
    d_pool = dict()
    pool_list = list()
    d_volume = dict()
    # Keyed by pool name; each value is that pool's list of volume names
    volume_list = dict()

    if config["enable_networking"] and config["daemon_mode"] == "coordinator":
        # Create an instance of the DNS Aggregator and Metadata API if we're a coordinator
        dns_aggregator = DNSAggregatorInstance.DNSAggregatorInstance(
            config, logger)
        metadata_api = MetadataAPIInstance.MetadataAPIInstance(
            zkhandler, config, logger)
    else:
        dns_aggregator = None
        metadata_api = None

    #
    # Zookeeper watchers for objects
    #

    # Node objects
    @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path("base.node"))
    def set_nodes(new_node_list):
        """Keep d_node/node_list in step with the children of "base.node"."""
        nonlocal d_node, node_list

        # Add missing nodes to list
        for node in [node for node in new_node_list if node not in node_list]:
            d_node[node] = NodeInstance.NodeInstance(
                node,
                config["node_hostname"],
                zkhandler,
                config,
                logger,
                d_node,
                d_network,
                d_domain,
                dns_aggregator,
                metadata_api,
            )

        # Remove deleted nodes from list
        for node in [node for node in node_list if node not in new_node_list]:
            del d_node[node]

        node_list = new_node_list
        logger.out(
            f'{logger.fmt_blue}Node list:{logger.fmt_end} {" ".join(node_list)}',
            state="i",
        )

        # Update node objects lists
        for node in d_node:
            d_node[node].update_node_list(d_node)

    # Create helpful alias for this node
    this_node = d_node[config["node_hostname"]]

    # Maintenance status
    @zkhandler.zk_conn.DataWatch(
        zkhandler.schema.path("base.config.maintenance"))
    def update_maintenance(_maintenance, stat):
        """Mirror the cluster maintenance flag onto this node (False if unreadable)."""
        try:
            maintenance = bool(strtobool(_maintenance.decode("ascii")))
        except Exception:
            maintenance = False

        this_node.maintenance = maintenance

    # Primary node
    @zkhandler.zk_conn.DataWatch(
        zkhandler.schema.path("base.config.primary_node"))
    def update_primary_node(new_primary, stat, event=""):
        """React to a change of the primary node key (contend/takeover/relinquish)."""
        try:
            new_primary = new_primary.decode("ascii")
        except AttributeError:
            new_primary = "none"
        key_version = stat.version

        # TODO: Move this to the Node structure
        if new_primary != this_node.primary_node:
            if config["daemon_mode"] == "coordinator":
                # We're a coordinator and there's no primary
                if new_primary == "none":
                    if (this_node.daemon_state == "run"
                            and this_node.router_state
                            not in ["primary", "takeover", "relinquish"]):
                        logger.out("Contending for primary coordinator state",
                                   state="i")
                        # Acquire an exclusive lock on the primary_node key
                        primary_lock = zkhandler.exclusivelock(
                            "base.config.primary_node")
                        try:
                            # This lock times out after 0.4s, which is 0.1s less than the pre-takeover
                            # timeout below. This ensures a primary takeover will not deadlock against
                            # a node which has failed the contention
                            primary_lock.acquire(timeout=0.4)
                            # Ensure that when we get the lock the versions are still consistent and
                            # that another node hasn't already acquired the primary state (maybe we're
                            # extremely slow to respond)
                            if (key_version == zkhandler.zk_conn.get(
                                    zkhandler.schema.path(
                                        "base.config.primary_node"))[1].version
                                    ):
                                # Set the primary to us
                                logger.out(
                                    "Acquiring primary coordinator state",
                                    state="o")
                                zkhandler.write([(
                                    "base.config.primary_node",
                                    config["node_hostname"],
                                )])
                            # Cleanly release the lock
                            primary_lock.release()
                        # We timed out acquiring a lock, or failed to write, which means we failed the
                        # contention and should just log that
                        except Exception:
                            logger.out(
                                "Timed out contending for primary coordinator state",
                                state="i",
                            )
                elif new_primary == config["node_hostname"]:
                    if this_node.router_state == "secondary":
                        # Wait for 0.5s to ensure other contentions time out, then take over
                        sleep(0.5)
                        zkhandler.write([(
                            ("node.state.router", config["node_hostname"]),
                            "takeover",
                        )])
                else:
                    if this_node.router_state == "primary":
                        # Wait for 0.5s to ensure other contentions time out, then relinquish
                        sleep(0.5)
                        zkhandler.write([(
                            ("node.state.router", config["node_hostname"]),
                            "relinquish",
                        )])
            else:
                zkhandler.write([(("node.state.router",
                                   config["node_hostname"]), "client")])

            # TODO: Turn this into a function like the others for clarity
            for node in d_node:
                d_node[node].primary_node = new_primary

    if config["enable_networking"]:
        # Network objects
        @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path("base.network"))
        def update_networks(new_network_list):
            """Keep d_network/network_list in step with the children of "base.network"."""
            nonlocal network_list, d_network

            # Add any missing networks to the list
            for network in [
                    network for network in new_network_list
                    if network not in network_list
            ]:
                d_network[network] = VXNetworkInstance.VXNetworkInstance(
                    network, zkhandler, config, logger, this_node,
                    dns_aggregator)
                # TODO: Move this to the Network structure
                if (config["daemon_mode"] == "coordinator"
                        and d_network[network].nettype == "managed"):
                    try:
                        dns_aggregator.add_network(d_network[network])
                    except Exception as e:
                        logger.out(
                            f"Failed to create DNS Aggregator for network {network}: {e}",
                            state="w",
                        )
                # Start primary functionality
                if (this_node.router_state == "primary"
                        and d_network[network].nettype == "managed"):
                    d_network[network].createGateways()
                    d_network[network].startDHCPServer()

            # Remove any missing networks from the list
            for network in [
                    network for network in network_list
                    if network not in new_network_list
            ]:
                # TODO: Move this to the Network structure
                if d_network[network].nettype == "managed":
                    # Stop primary functionality
                    if this_node.router_state == "primary":
                        d_network[network].stopDHCPServer()
                        d_network[network].removeGateways()
                        dns_aggregator.remove_network(d_network[network])
                    # Stop firewalling
                    d_network[network].removeFirewall()
                # Delete the network
                d_network[network].removeNetwork()
                del d_network[network]

            # Update the new list
            network_list = new_network_list
            logger.out(
                f'{logger.fmt_blue}Network list:{logger.fmt_end} {" ".join(network_list)}',
                state="i",
            )

            # Update node objects list
            for node in d_node:
                d_node[node].update_network_list(d_network)

        # Add the SR-IOV PFs and VFs to Zookeeper
        # These do not behave like the objects; they are not dynamic (the API cannot change them), and they
        # exist for the lifetime of this Node instance. The objects are set here in Zookeeper on a per-node
        # basis, under the Node configuration tree.
        # MIGRATION: The schema.schema.get ensures that the current active Schema contains the required keys
        if (config["enable_sriov"]
                and zkhandler.schema.schema.get("sriov_pf", None) is not None):
            vf_list = list()
            for device in config["sriov_device"]:
                pf = device["phy"]
                vfcount = device["vfcount"]
                if device.get("mtu", None) is None:
                    mtu = 1500
                else:
                    mtu = device["mtu"]

                # Create the PF device in Zookeeper
                zkhandler.write([
                    (
                        ("node.sriov.pf", config["node_hostname"], "sriov_pf", pf),
                        "",
                    ),
                    (
                        ("node.sriov.pf", config["node_hostname"], "sriov_pf.mtu", pf),
                        mtu,
                    ),
                    (
                        ("node.sriov.pf", config["node_hostname"], "sriov_pf.vfcount", pf),
                        vfcount,
                    ),
                ])
                # Append the device to the list of PFs
                sriov_pf_list.append(pf)

                # Get the list of VFs from `ip link show`
                vf_list = json.loads(
                    common.run_os_command(f"ip --json link show {pf}")
                    [1])[0].get("vfinfo_list", [])
                for vf in vf_list:
                    # {
                    #   'vf': 3,
                    #   'link_type': 'ether',
                    #   'address': '00:00:00:00:00:00',
                    #   'broadcast': 'ff:ff:ff:ff:ff:ff',
                    #   'vlan_list': [{'vlan': 101, 'qos': 2}],
                    #   'rate': {'max_tx': 0, 'min_tx': 0},
                    #   'spoofchk': True,
                    #   'link_state': 'auto',
                    #   'trust': False,
                    #   'query_rss_en': False
                    # }
                    vfphy = f'{pf}v{vf["vf"]}'

                    # Get the PCIe bus information
                    dev_pcie_path = None
                    try:
                        with open(f"/sys/class/net/{vfphy}/device/uevent"
                                  ) as vfh:
                            dev_uevent = vfh.readlines()
                        for line in dev_uevent:
                            if re.match(r"^PCI_SLOT_NAME=.*", line):
                                dev_pcie_path = line.rstrip().split("=")[-1]
                    except FileNotFoundError:
                        # Something must already be using the PCIe device
                        pass

                    # Add the VF to Zookeeper if it does not yet exist
                    if not zkhandler.exists(
                        ("node.sriov.vf", config["node_hostname"], "sriov_vf",
                         vfphy)):
                        if dev_pcie_path is not None:
                            pcie_domain, pcie_bus, pcie_slot, pcie_function = re.split(
                                r":|\.", dev_pcie_path)
                        else:
                            # We can't add the device - for some reason we can't get any information on its PCIe bus path,
                            # so just ignore this one, and continue.
                            # This shouldn't happen under any real circumstances, unless the admin tries to attach a non-existent
                            # VF to a VM manually, then goes ahead and adds that VF to the system with the VM running.
                            continue

                        zkhandler.write([
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf", vfphy),
                                "",
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.pf", vfphy),
                                pf,
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.mtu", vfphy),
                                mtu,
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.mac", vfphy),
                                vf["address"],
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.phy_mac", vfphy),
                                vf["address"],
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.config", vfphy),
                                "",
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.config.vlan_id", vfphy),
                                vf["vlan_list"][0].get("vlan", "0"),
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.config.vlan_qos", vfphy),
                                vf["vlan_list"][0].get("qos", "0"),
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.config.tx_rate_min", vfphy),
                                vf["rate"]["min_tx"],
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.config.tx_rate_max", vfphy),
                                vf["rate"]["max_tx"],
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.config.spoof_check", vfphy),
                                vf["spoofchk"],
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.config.link_state", vfphy),
                                vf["link_state"],
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.config.trust", vfphy),
                                vf["trust"],
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.config.query_rss", vfphy),
                                vf["query_rss_en"],
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.pci", vfphy),
                                "",
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.pci.domain", vfphy),
                                pcie_domain,
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.pci.bus", vfphy),
                                pcie_bus,
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.pci.slot", vfphy),
                                pcie_slot,
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.pci.function", vfphy),
                                pcie_function,
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.used", vfphy),
                                False,
                            ),
                            (
                                ("node.sriov.vf", config["node_hostname"], "sriov_vf.used_by", vfphy),
                                "",
                            ),
                        ])

                    # Append the device to the list of VFs
                    sriov_vf_list.append(vfphy)

            # Remove any obsolete PFs from Zookeeper if they go away
            for pf in zkhandler.children(
                ("node.sriov.pf", config["node_hostname"])):
                if pf not in sriov_pf_list:
                    zkhandler.delete([("node.sriov.pf",
                                       config["node_hostname"], "sriov_pf", pf)
                                      ])
            # Remove any obsolete VFs from Zookeeper if their PF goes away
            for vf in zkhandler.children(
                ("node.sriov.vf", config["node_hostname"])):
                vf_pf = zkhandler.read(
                    ("node.sriov.vf", config["node_hostname"], "sriov_vf.pf",
                     vf))
                if vf_pf not in sriov_pf_list:
                    zkhandler.delete([("node.sriov.vf",
                                       config["node_hostname"], "sriov_vf", vf)
                                      ])

            # SR-IOV VF objects
            # This is a ChildrenWatch just for consistency; the list never changes at runtime
            @zkhandler.zk_conn.ChildrenWatch(
                zkhandler.schema.path("node.sriov.vf",
                                      config["node_hostname"]))
            def update_sriov_vfs(new_sriov_vf_list):
                """Create an SRIOVVFInstance for every VF listed under this node."""
                nonlocal sriov_vf_list, d_sriov_vf

                # Add VFs to the list
                for vf in common.sortInterfaceNames(new_sriov_vf_list):
                    d_sriov_vf[vf] = SRIOVVFInstance.SRIOVVFInstance(
                        vf, zkhandler, config, logger, this_node)

                sriov_vf_list = sorted(new_sriov_vf_list)
                logger.out(
                    f'{logger.fmt_blue}SR-IOV VF list:{logger.fmt_end} {" ".join(sriov_vf_list)}',
                    state="i",
                )

    if config["enable_hypervisor"]:
        # VM command pipeline key
        @zkhandler.zk_conn.DataWatch(zkhandler.schema.path("base.cmd.domain"))
        def run_domain_command(data, stat, event=""):
            """Hand a non-empty VM command key value to VMInstance.vm_command."""
            if data:
                VMInstance.vm_command(zkhandler, logger, this_node,
                                      data.decode("ascii"))

        # VM domain objects
        @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path("base.domain"))
        def update_domains(new_domain_list):
            """Keep d_domain/domain_list in step with the children of "base.domain"."""
            nonlocal domain_list, d_domain

            # Add missing domains to the list
            for domain in [
                    domain for domain in new_domain_list
                    if domain not in domain_list
            ]:
                d_domain[domain] = VMInstance.VMInstance(
                    domain, zkhandler, config, logger, this_node)

            # Remove any deleted domains from the list
            for domain in [
                    domain for domain in domain_list
                    if domain not in new_domain_list
            ]:
                del d_domain[domain]

            # Update the new list
            domain_list = new_domain_list
            logger.out(
                f'{logger.fmt_blue}Domain list:{logger.fmt_end} {" ".join(domain_list)}',
                state="i",
            )

            # Update node objects' list
            for node in d_node:
                d_node[node].update_domain_list(d_domain)

    if config["enable_storage"]:
        # Ceph command pipeline key
        @zkhandler.zk_conn.DataWatch(zkhandler.schema.path("base.cmd.ceph"))
        def run_ceph_command(data, stat, event=""):
            """Hand a non-empty Ceph command key value to CephInstance.ceph_command."""
            if data:
                CephInstance.ceph_command(zkhandler, logger, this_node,
                                          data.decode("ascii"), d_osd)

        # OSD objects
        @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path("base.osd"))
        def update_osds(new_osd_list):
            """Keep d_osd/osd_list in step with the children of "base.osd"."""
            nonlocal osd_list, d_osd

            # Add any missing OSDs to the list
            for osd in [osd for osd in new_osd_list if osd not in osd_list]:
                d_osd[osd] = CephInstance.CephOSDInstance(
                    zkhandler, this_node, osd)

            # Remove any deleted OSDs from the list
            for osd in [osd for osd in osd_list if osd not in new_osd_list]:
                del d_osd[osd]

            # Update the new list
            osd_list = new_osd_list
            logger.out(
                f'{logger.fmt_blue}OSD list:{logger.fmt_end} {" ".join(osd_list)}',
                state="i",
            )

        # Pool objects
        @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path("base.pool"))
        def update_pools(new_pool_list):
            """Keep d_pool/pool_list in step with "base.pool" and watch each pool's volumes."""
            nonlocal pool_list, d_pool, volume_list, d_volume

            # Add any missing pools to the list
            for pool in [
                    pool for pool in new_pool_list if pool not in pool_list
            ]:
                d_pool[pool] = CephInstance.CephPoolInstance(
                    zkhandler, this_node, pool)
                # Prepare the volume components for this pool
                volume_list[pool] = list()
                d_volume[pool] = dict()

            # Remove any deleted pools from the list
            for pool in [
                    pool for pool in pool_list if pool not in new_pool_list
            ]:
                del d_pool[pool]

            # Update the new list
            pool_list = new_pool_list
            logger.out(
                f'{logger.fmt_blue}Pool list:{logger.fmt_end} {" ".join(pool_list)}',
                state="i",
            )

            # Volume objects (in each pool)
            for pool in pool_list:

                # Fixed: "pool" is bound as a default argument so each watcher
                # keeps the pool it was registered for. Closing over the loop
                # variable made every later callback see the last pool in
                # pool_list and file volumes under the wrong pool.
                @zkhandler.zk_conn.ChildrenWatch(
                    zkhandler.schema.path("volume", pool))
                def update_volumes(new_volume_list, pool=pool):
                    """Keep d_volume[pool]/volume_list[pool] in step with the pool's volumes."""
                    nonlocal volume_list, d_volume

                    # Add any missing volumes to the list
                    for volume in [
                            volume for volume in new_volume_list
                            if volume not in volume_list[pool]
                    ]:
                        d_volume[pool][
                            volume] = CephInstance.CephVolumeInstance(
                                zkhandler, this_node, pool, volume)

                    # Remove any deleted volumes from the list
                    for volume in [
                            volume for volume in volume_list[pool]
                            if volume not in new_volume_list
                    ]:
                        del d_volume[pool][volume]

                    # Update the new list
                    volume_list[pool] = new_volume_list
                    logger.out(
                        f'{logger.fmt_blue}Volume list [{pool}]:{logger.fmt_end} {" ".join(volume_list[pool])}',
                        state="i",
                    )

    # Start keepalived thread
    keepalive_timer = pvcnoded.util.keepalive.start_keepalive_timer(
        logger, config, zkhandler, this_node)

    # Tick loop; does nothing since everything is async
    while True:
        try:
            sleep(1)
        except Exception:
            break
def cleanup(failure=False): nonlocal logger, zkhandler, keepalive_timer, d_domain logger.out("Terminating pvcnoded and cleaning up", state="s") # Set shutdown state in Zookeeper zkhandler.write([(("node.state.daemon", config["node_hostname"]), "shutdown")]) # Waiting for any flushes to complete logger.out("Waiting for any active flushes", state="s") try: if this_node is not None: while this_node.flush_thread is not None: sleep(0.5) except Exception: # We really don't care here, just proceed pass # Stop console logging on all VMs logger.out("Stopping domain console watchers", state="s") try: if d_domain is not None: for domain in d_domain: if d_domain[domain].getnode() == config["node_hostname"]: d_domain[domain].console_log_instance.stop() except Exception: pass # Force into secondary coordinator state if needed try: if this_node.router_state == "primary" and len(d_node) > 1: zkhandler.write([("base.config.primary_node", "none")]) logger.out("Waiting for primary migration", state="s") timeout = 240 count = 0 while this_node.router_state != "secondary" and count < timeout: sleep(0.5) count += 1 except Exception: pass # Stop keepalive thread try: pvcnoded.util.keepalive.stop_keepalive_timer( logger, keepalive_timer) logger.out("Performing final keepalive update", state="s") pvcnoded.util.keepalive.node_keepalive(logger, config, zkhandler, this_node) except Exception: pass # Set stop state in Zookeeper zkhandler.write([(("node.state.daemon", config["node_hostname"]), "stop")]) # Forcibly terminate dnsmasq because it gets stuck sometimes common.run_os_command("killall dnsmasq") # Close the Zookeeper connection try: zkhandler.disconnect(persistent=True) del zkhandler except Exception: pass logger.out("Terminated pvc daemon", state="s") logger.terminate() if failure: retcode = 1 else: retcode = 0 os._exit(retcode)
def update_schema(new_schema_version, stat, event=""): nonlocal zkhandler, keepalive_timer, node_schema_version try: new_schema_version = int(new_schema_version.decode("ascii")) except Exception: new_schema_version = 0 if new_schema_version == node_schema_version: return True logger.out("Hot update of schema version started", state="s") logger.out( f"Current version: {node_schema_version,} New version: {new_schema_version}", state="s", ) # Prevent any keepalive updates while this happens if keepalive_timer is not None: pvcnoded.util.keepalive.stop_keepalive_timer( logger, keepalive_timer) sleep(1) # Perform the migration (primary only) if zkhandler.read( "base.config.primary_node") == config["node_hostname"]: logger.out("Primary node acquiring exclusive lock", state="s") # Wait for things to settle sleep(0.5) # Acquire a write lock on the root key with zkhandler.exclusivelock("base.schema.version"): # Perform the schema migration tasks logger.out("Performing schema update", state="s") if new_schema_version > node_schema_version: zkhandler.schema.migrate(zkhandler, new_schema_version) if new_schema_version < node_schema_version: zkhandler.schema.rollback(zkhandler, new_schema_version) # Wait for the exclusive lock to be lifted else: logger.out("Non-primary node acquiring read lock", state="s") # Wait for things to settle sleep(1) # Wait for a read lock lock = zkhandler.readlock("base.schema.version") lock.acquire() # Wait a bit more for the primary to return to normal sleep(1) # Update the local schema version logger.out("Updating node target schema version", state="s") zkhandler.write([(("node.data.active_schema", config["node_hostname"]), new_schema_version)]) node_schema_version = new_schema_version # Restart the API daemons if applicable logger.out("Restarting services", state="s") common.run_os_command("systemctl restart pvcapid-worker.service") if zkhandler.read( "base.config.primary_node") == config["node_hostname"]: common.run_os_command("systemctl restart 
pvcapid.service") # Restart ourselves with the new schema logger.out("Reloading node daemon", state="s") try: zkhandler.disconnect(persistent=True) del zkhandler except Exception: pass os.execv(sys.argv[0], sys.argv)
def get_configuration():
    """
    Parse the configuration of the node daemon.

    Loads the YAML file located by get_configuration_path(), flattens the
    relevant sections into a single dictionary and appends the node's static
    data under the "static_data" key.

    Returns:
        dict: the flattened node daemon configuration.

    Raises:
        MalformedConfigurationError: when a required section is missing, a
            required base directory is not set, the live bridge MTU cannot
            be determined, a floating IP fails validation, or a device
            address is not a valid host of its network.

    If the file cannot be parsed as YAML, an error is printed and the
    process exits immediately with status 1.
    """
    pvcnoded_config_file = get_configuration_path()

    print('Loading configuration from file "{}"'.format(pvcnoded_config_file))

    with open(pvcnoded_config_file, "r") as cfgfile:
        try:
            o_config = yaml.load(cfgfile, Loader=yaml.SafeLoader)
        except Exception as e:
            print("ERROR: Failed to parse configuration file: {}".format(e))
            os._exit(1)

    node_fqdn, node_hostname, node_domain, node_id = get_hostname()

    # Create the configuration dictionary
    config = dict()

    # Get the initial base configuration
    try:
        o_base = o_config["pvc"]
        o_cluster = o_config["pvc"]["cluster"]
    except Exception as e:
        raise MalformedConfigurationError(e)

    config.update(
        {
            "node": o_base.get("node", node_hostname),
            "node_hostname": node_hostname,
            "node_fqdn": node_fqdn,
            "node_domain": node_domain,
            "node_id": node_id,
            "coordinators": o_cluster.get("coordinators", list()),
            "debug": o_base.get("debug", False),
        }
    )

    # Get the functions configuration
    try:
        o_functions = o_config["pvc"]["functions"]
    except Exception as e:
        raise MalformedConfigurationError(e)

    config.update(
        {
            "enable_hypervisor": o_functions.get("enable_hypervisor", False),
            "enable_networking": o_functions.get("enable_networking", False),
            "enable_storage": o_functions.get("enable_storage", False),
            "enable_api": o_functions.get("enable_api", False),
        }
    )

    # Get the directory configuration
    try:
        o_directories = o_config["pvc"]["system"]["configuration"]["directories"]
    except Exception as e:
        raise MalformedConfigurationError(e)

    config_directories = {
        "dynamic_directory": o_directories.get("dynamic_directory", None),
        "log_directory": o_directories.get("log_directory", None),
        "console_log_directory": o_directories.get("console_log_directory", None),
    }

    # Both base directories are concatenated with subdirectory names below, so
    # report a missing one as a configuration error instead of a TypeError
    for base_key in ("dynamic_directory", "log_directory"):
        if config_directories[base_key] is None:
            raise MalformedConfigurationError(
                f"Required directory configuration key {base_key} is not set"
            )

    # Define our dynamic directory schema
    config_directories["dnsmasq_dynamic_directory"] = (
        config_directories["dynamic_directory"] + "/dnsmasq"
    )
    config_directories["pdns_dynamic_directory"] = (
        config_directories["dynamic_directory"] + "/pdns"
    )
    config_directories["nft_dynamic_directory"] = (
        config_directories["dynamic_directory"] + "/nft"
    )

    # Define our log directory schema
    config_directories["dnsmasq_log_directory"] = (
        config_directories["log_directory"] + "/dnsmasq"
    )
    config_directories["pdns_log_directory"] = (
        config_directories["log_directory"] + "/pdns"
    )
    config_directories["nft_log_directory"] = (
        config_directories["log_directory"] + "/nft"
    )

    config.update(config_directories)

    # Get the logging configuration
    try:
        o_logging = o_config["pvc"]["system"]["configuration"]["logging"]
    except Exception as e:
        raise MalformedConfigurationError(e)

    config.update(
        {
            "file_logging": o_logging.get("file_logging", False),
            "stdout_logging": o_logging.get("stdout_logging", False),
            "zookeeper_logging": o_logging.get("zookeeper_logging", False),
            "log_colours": o_logging.get("log_colours", False),
            "log_dates": o_logging.get("log_dates", False),
            "log_keepalives": o_logging.get("log_keepalives", False),
            "log_keepalive_cluster_details": o_logging.get(
                "log_keepalive_cluster_details", False
            ),
            "log_keepalive_storage_details": o_logging.get(
                "log_keepalive_storage_details", False
            ),
            "console_log_lines": o_logging.get("console_log_lines", False),
            "node_log_lines": o_logging.get("node_log_lines", False),
        }
    )

    # Get the interval configuration
    try:
        o_intervals = o_config["pvc"]["system"]["intervals"]
    except Exception as e:
        raise MalformedConfigurationError(e)

    # The value is stored as "suicide_intervals" (matching "fence_intervals"),
    # but earlier versions of this function only read the singular key
    # "suicide_interval"; prefer the plural and keep the singular as a fallback
    suicide_intervals = o_intervals.get(
        "suicide_intervals", o_intervals.get("suicide_interval", 0)
    )

    config.update(
        {
            "vm_shutdown_timeout": int(o_intervals.get("vm_shutdown_timeout", 60)),
            "keepalive_interval": int(o_intervals.get("keepalive_interval", 5)),
            "fence_intervals": int(o_intervals.get("fence_intervals", 6)),
            "suicide_intervals": int(suicide_intervals),
        }
    )

    # Get the fencing configuration
    try:
        o_fencing = o_config["pvc"]["system"]["fencing"]
        o_fencing_actions = o_fencing["actions"]
        o_fencing_ipmi = o_fencing["ipmi"]
    except Exception as e:
        raise MalformedConfigurationError(e)

    config.update(
        {
            "successful_fence": o_fencing_actions.get("successful_fence", None),
            "failed_fence": o_fencing_actions.get("failed_fence", None),
            "ipmi_hostname": o_fencing_ipmi.get(
                "host", f"{node_hostname}-lom.{node_domain}"
            ),
            "ipmi_username": o_fencing_ipmi.get("user", "null"),
            "ipmi_password": o_fencing_ipmi.get("pass", "null"),
        }
    )

    # Get the migration configuration
    try:
        o_migration = o_config["pvc"]["system"]["migration"]
    except Exception as e:
        raise MalformedConfigurationError(e)

    config.update(
        {
            "migration_target_selector": o_migration.get("target_selector", "mem"),
        }
    )

    if config["enable_networking"]:
        # Get the node networks configuration
        try:
            o_networks = o_config["pvc"]["cluster"]["networks"]
            o_network_cluster = o_networks["cluster"]
            o_network_storage = o_networks["storage"]
            o_network_upstream = o_networks["upstream"]
            o_sysnetworks = o_config["pvc"]["system"]["configuration"]["networking"]
            o_sysnetwork_cluster = o_sysnetworks["cluster"]
            o_sysnetwork_storage = o_sysnetworks["storage"]
            o_sysnetwork_upstream = o_sysnetworks["upstream"]
        except Exception as e:
            raise MalformedConfigurationError(e)

        config_networks = {
            "cluster_domain": o_network_cluster.get("domain", None),
            "cluster_network": o_network_cluster.get("network", None),
            "cluster_floating_ip": o_network_cluster.get("floating_ip", None),
            "cluster_dev": o_sysnetwork_cluster.get("device", None),
            "cluster_mtu": o_sysnetwork_cluster.get("mtu", None),
            "cluster_dev_ip": o_sysnetwork_cluster.get("address", None),
            "storage_domain": o_network_storage.get("domain", None),
            "storage_network": o_network_storage.get("network", None),
            "storage_floating_ip": o_network_storage.get("floating_ip", None),
            "storage_dev": o_sysnetwork_storage.get("device", None),
            "storage_mtu": o_sysnetwork_storage.get("mtu", None),
            "storage_dev_ip": o_sysnetwork_storage.get("address", None),
            "upstream_domain": o_network_upstream.get("domain", None),
            "upstream_network": o_network_upstream.get("network", None),
            "upstream_floating_ip": o_network_upstream.get("floating_ip", None),
            "upstream_gateway": o_network_upstream.get("gateway", None),
            "upstream_dev": o_sysnetwork_upstream.get("device", None),
            "upstream_mtu": o_sysnetwork_upstream.get("mtu", None),
            "upstream_dev_ip": o_sysnetwork_upstream.get("address", None),
            "bridge_dev": o_sysnetworks.get("bridge_device", None),
            "bridge_mtu": o_sysnetworks.get("bridge_mtu", None),
            "enable_sriov": o_sysnetworks.get("sriov_enable", False),
            "sriov_device": o_sysnetworks.get("sriov_device", list()),
        }

        if config_networks["bridge_mtu"] is None:
            # Read the current MTU of bridge_dev and set bridge_mtu to it; avoids weird resets
            retcode, stdout, stderr = common.run_os_command(
                f"ip -json link show dev {config_networks['bridge_dev']}"
            )
            # The command output is unusable when the device is unset or does
            # not exist; surface that as a configuration error
            try:
                current_bridge_mtu = loads(stdout)[0]["mtu"]
            except Exception as e:
                raise MalformedConfigurationError(
                    f"Failed to read live MTU of bridge device {config_networks['bridge_dev']}: {e}"
                )
            print(
                f"Config key bridge_mtu not explicitly set; using live MTU {current_bridge_mtu} from {config_networks['bridge_dev']}"
            )
            config_networks["bridge_mtu"] = current_bridge_mtu

        config.update(config_networks)

        for network_type in ["cluster", "storage", "upstream"]:
            result, msg = validate_floating_ip(config, network_type)
            if not result:
                raise MalformedConfigurationError(msg)

            address_key = "{}_dev_ip".format(network_type)
            network_key = f"{network_type}_network"
            network = ip_network(config[network_key])

            # With autoselection of addresses, construct an IP from the relevant network
            if config[address_key] == "by-id":
                # The NodeID starts at 1, but indexes start at 0
                address_id = int(config["node_id"]) - 1
                # Grab the nth address from the network
                config[address_key] = "{}/{}".format(
                    list(network.hosts())[address_id], network.prefixlen
                )
            # Validate the provided IP instead
            else:
                try:
                    address = ip_address(config[address_key].split("/")[0])
                    # Membership is tested against the hosts iterator directly,
                    # so no full host list is built just for this check
                    if address not in network.hosts():
                        raise ValueError("address is not a usable host of the network")
                except Exception:
                    raise MalformedConfigurationError(
                        f"IP address {config[address_key]} for {address_key} is not valid"
                    )

        # Get the PowerDNS aggregator database configuration
        try:
            o_pdnsdb = o_config["pvc"]["coordinator"]["dns"]["database"]
        except Exception as e:
            raise MalformedConfigurationError(e)

        config.update(
            {
                "pdns_postgresql_host": o_pdnsdb.get("host", None),
                "pdns_postgresql_port": o_pdnsdb.get("port", None),
                "pdns_postgresql_dbname": o_pdnsdb.get("name", None),
                "pdns_postgresql_user": o_pdnsdb.get("user", None),
                "pdns_postgresql_password": o_pdnsdb.get("pass", None),
            }
        )

        # Get the Cloud-Init Metadata database configuration
        try:
            o_metadatadb = o_config["pvc"]["coordinator"]["metadata"]["database"]
        except Exception as e:
            raise MalformedConfigurationError(e)

        config.update(
            {
                "metadata_postgresql_host": o_metadatadb.get("host", None),
                "metadata_postgresql_port": o_metadatadb.get("port", None),
                "metadata_postgresql_dbname": o_metadatadb.get("name", None),
                "metadata_postgresql_user": o_metadatadb.get("user", None),
                "metadata_postgresql_password": o_metadatadb.get("pass", None),
            }
        )

    if config["enable_storage"]:
        # Get the storage configuration
        try:
            o_storage = o_config["pvc"]["system"]["configuration"]["storage"]
        except Exception as e:
            raise MalformedConfigurationError(e)

        config.update(
            {
                "ceph_config_file": o_storage.get("ceph_config_file", None),
                "ceph_admin_keyring": o_storage.get("ceph_admin_keyring", None),
            }
        )

    # Add our node static data to the config
    config["static_data"] = get_static_data()

    return config