Example #1
def add_volume(zkhandler, pool, name, size):
    # Add 'B' if the volume is in bytes
    if re.match(r"^[0-9]+$", size):
        size = "{}B".format(size)

    # 1. Verify the size of the volume
    pool_information = getPoolInformation(zkhandler, pool)
    size_bytes = format_bytes_fromhuman(size)
    if size_bytes >= int(pool_information["stats"]["free_bytes"]):
        return (
            False,
            "ERROR: Requested volume size is greater than the available free space in the pool",
        )

    # 2. Create the volume
    retcode, stdout, stderr = common.run_os_command(
        "rbd create --size {} {}/{}".format(size, pool, name))
    if retcode:
        return False, 'ERROR: Failed to create RBD volume "{}": {}'.format(
            name, stderr)

    # 3. Get volume stats
    retcode, stdout, stderr = common.run_os_command(
        "rbd info --format json {}/{}".format(pool, name))
    volstats = stdout

    # 4. Add the new volume to Zookeeper
    zkhandler.write([
        (("volume", f"{pool}/{name}"), ""),
        (("volume.stats", f"{pool}/{name}"), volstats),
        (("snapshot", f"{pool}/{name}"), ""),
    ])

    return True, 'Created RBD volume "{}/{}" ({}).'.format(pool, name, size)
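
A note on the size handling above: the helper format_bytes_fromhuman is expected to turn a human-readable size such as "8G" (or the bare-byte form built at the top of the function) into an integer byte count. A minimal, self-contained sketch of that conversion, which may differ from the project's actual helper, is:

import re

# Hypothetical stand-in for format_bytes_fromhuman; the suffix handling here is an assumption.
SIZE_SUFFIXES = {"B": 1, "K": 1024, "M": 1024**2, "G": 1024**3, "T": 1024**4}

def format_bytes_fromhuman_sketch(size):
    match = re.match(r"^([0-9]+)([BKMGT])$", size.upper())
    if match is None:
        raise ValueError("Unparseable size: {}".format(size))
    value, suffix = match.groups()
    return int(value) * SIZE_SUFFIXES[suffix]

assert format_bytes_fromhuman_sketch("8G") == 8 * 1024**3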
Example #2
def clone_volume(zkhandler, pool, name_src, name_new):
    if not verifyVolume(zkhandler, pool, name_src):
        return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
            name_src, pool)

    # 1. Clone the volume
    retcode, stdout, stderr = common.run_os_command(
        "rbd copy {}/{} {}/{}".format(pool, name_src, pool, name_new))
    if retcode:
        return (
            False,
            'ERROR: Failed to clone RBD volume "{}" to "{}" in pool "{}": {}'.
            format(name_src, name_new, pool, stderr),
        )

    # 2. Get volume stats
    retcode, stdout, stderr = common.run_os_command(
        "rbd info --format json {}/{}".format(pool, name_new))
    volstats = stdout

    # 3. Add the new volume to Zookeeper
    zkhandler.write([
        (("volume", f"{pool}/{name_new}"), ""),
        (("volume.stats", f"{pool}/{name_new}"), volstats),
        (("snapshot", f"{pool}/{name_new}"), ""),
    ])

    return True, 'Cloned RBD volume "{}" to "{}" in pool "{}"'.format(
        name_src, name_new, pool)
Example #3
    def createNetworkBridged(self):
        self.logger.out(
            "Creating bridged vLAN device {} on interface {}".format(
                self.base_nic, self.bridge_dev),
            prefix="VNI {}".format(self.vni),
            state="i",
        )

        # Create vLAN interface
        common.run_os_command(
            "ip link add link {} name {} type vlan id {}".format(
                self.bridge_dev, self.base_nic, self.vni))
        # Create bridge interface
        common.run_os_command("brctl addbr {}".format(self.bridge_nic))

        self.updateNetworkMTU()

        # Disable tx checksum offload on bridge interface (breaks DHCP on Debian < 9)
        common.run_os_command("ethtool -K {} tx off".format(self.bridge_nic))

        # Disable IPv6 on bridge interface (prevents leakage)
        common.run_os_command("sysctl net.ipv6.conf.{}.disable_ipv6=1".format(
            self.bridge_nic))

        # Add vLAN interface to bridge interface
        common.run_os_command("brctl addif {} {}".format(
            self.bridge_nic, self.base_nic))
Example #4
    def createNetworkManaged(self):
        self.logger.out(
            "Creating VXLAN device on interface {}".format(self.cluster_dev),
            prefix="VNI {}".format(self.vni),
            state="i",
        )

        # Create VXLAN interface
        common.run_os_command(
            "ip link add {} type vxlan id {} dstport 4789 dev {}".format(
                self.base_nic, self.vni, self.cluster_dev))
        # Create bridge interface
        common.run_os_command("brctl addbr {}".format(self.bridge_nic))

        self.updateNetworkMTU()

        # Disable tx checksum offload on bridge interface (breaks DHCP on Debian < 9)
        common.run_os_command("ethtool -K {} tx off".format(self.bridge_nic))

        # Disable IPv6 DAD on bridge interface
        common.run_os_command("sysctl net.ipv6.conf.{}.accept_dad=0".format(
            self.bridge_nic))

        # Add VXLAN interface to bridge interface
        common.run_os_command("brctl addif {} {}".format(
            self.bridge_nic, self.base_nic))
Example #5
def set_pgs_pool(zkhandler, name, pgs):
    if not verifyPool(zkhandler, name):
        return False, f'ERROR: No pool with name "{name}" is present in the cluster.'

    # Validate new PGs count
    pgs = int(pgs)
    if (pgs == 0) or (pgs & (pgs - 1) != 0):
        return (
            False,
            f'ERROR: Invalid PGs number "{pgs}": must be a non-zero power of 2.',
        )

    # Set the new pgs number
    retcode, stdout, stderr = common.run_os_command(
        f"ceph osd pool set {name} pg_num {pgs}")
    if retcode:
        return False, f"ERROR: Failed to set pg_num on pool {name} to {pgs}: {stderr}"

    # Set the new pgps number to match if the PG count is increasing
    current_pgs = int(zkhandler.read(("pool.pgs", name)))
    if pgs > current_pgs:
        retcode, stdout, stderr = common.run_os_command(
            f"ceph osd pool set {name} pgp_num {pgs}")
        if retcode:
            return (
                False,
                f"ERROR: Failed to set pg_num on pool {name} to {pgs}: {stderr}",
            )

    # Update Zookeeper count
    zkhandler.write([
        (("pool.pgs", name), pgs),
    ])

    return True, f'Set PGs count to {pgs} for RBD pool "{name}".'
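
The PG count validation relies on the standard bit trick: a positive integer n is a power of two exactly when n & (n - 1) == 0. A standalone illustration of the same test:

# Standalone illustration of the power-of-two check used by set_pgs_pool.
def is_power_of_two(n):
    return n > 0 and (n & (n - 1)) == 0

assert is_power_of_two(128)       # valid PG count
assert not is_power_of_two(100)   # rejected: not a power of 2
assert not is_power_of_two(0)     # rejected: zero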
Example #6
def rename_volume(zkhandler, pool, name, new_name):
    if not verifyVolume(zkhandler, pool, name):
        return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
            name, pool)

    # 1. Rename the volume
    retcode, stdout, stderr = common.run_os_command(
        "rbd rename {}/{} {}".format(pool, name, new_name))
    if retcode:
        return (
            False,
            'ERROR: Failed to rename volume "{}" to "{}" in pool "{}": {}'.
            format(name, new_name, pool, stderr),
        )

    # 2. Rename the volume in Zookeeper
    zkhandler.rename([
        (("volume", f"{pool}/{name}"), ("volume", f"{pool}/{new_name}")),
        (("snapshot", f"{pool}/{name}"), ("snapshot", f"{pool}/{new_name}")),
    ])

    # 3. Get volume stats
    retcode, stdout, stderr = common.run_os_command(
        "rbd info --format json {}/{}".format(pool, new_name))
    volstats = stdout

    # 4. Update the volume stats in Zookeeper
    zkhandler.write([
        (("volume.stats", f"{pool}/{new_name}"), volstats),
    ])

    return True, 'Renamed RBD volume "{}" to "{}" in pool "{}".'.format(
        name, new_name, pool)
Example #7
    def updateNetworkMTU(self):
        self.logger.out(
            "Setting network MTU to {}".format(self.vx_mtu),
            prefix="VNI {}".format(self.vni),
            state="i",
        )
        # Set MTU of base and bridge NICs
        common.run_os_command("ip link set {} mtu {} up".format(
            self.base_nic, self.vx_mtu))
        common.run_os_command("ip link set {} mtu {} up".format(
            self.bridge_nic, self.vx_mtu))
Example #8
    def removeNetworkManaged(self):
        self.logger.out(
            "Removing VNI device on interface {}".format(self.cluster_dev),
            prefix="VNI {}".format(self.vni),
            state="i",
        )
        common.run_os_command("ip link set {} down".format(self.bridge_nic))
        common.run_os_command("ip link set {} down".format(self.base_nic))
        common.run_os_command("brctl delif {} {}".format(
            self.bridge_nic, self.base_nic))
        common.run_os_command("brctl delbr {}".format(self.bridge_nic))
        common.run_os_command("ip link delete {}".format(self.base_nic))
Example #9
    def create_osd_db_lv(zkhandler, logger, osd_id, ext_db_ratio,
                         osd_size_bytes):
        logger.out(
            "Creating new OSD database logical volume for OSD ID {}".format(
                osd_id),
            state="i",
        )
        try:
            # 0. Check if the logical volume already exists
            retcode, stdout, stderr = common.run_os_command(
                "lvdisplay osd-db/osd{}".format(osd_id))
            if retcode != 5:
                logger.out(
                    'Ceph OSD database LV "osd-db/osd-{}" already exists'.
                    format(osd_id),
                    state="e",
                )
                return False

            # 1. Determine LV sizing
            osd_db_size = int(osd_size_bytes * ext_db_ratio / 1024 / 1024)

            # 2. Create the LV
            logger.out(
                'Creating DB LV "osd-db/osd-{}" of {}M ({} * {})'.format(
                    osd_id, osd_db_size, osd_size_bytes, ext_db_ratio),
                state="i",
            )
            retcode, stdout, stderr = common.run_os_command(
                "lvcreate --yes --name osd-{} --size {} osd-db".format(
                    osd_id, osd_db_size))
            if retcode:
                print("db lv creation")
                print(stdout)
                print(stderr)
                raise Exception

            # Log it
            logger.out(
                'Created new OSD database logical volume "osd-db/osd-{}"'.
                format(osd_id),
                state="o",
            )
            return True
        except Exception as e:
            # Log it
            logger.out(
                "Failed to create OSD database logical volume: {}".format(e),
                state="e")
            return False
Example #10
def add_snapshot(zkhandler, pool, volume, name):
    if not verifyVolume(zkhandler, pool, volume):
        return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
            volume, pool)

    # 1. Create the snapshot
    retcode, stdout, stderr = common.run_os_command(
        "rbd snap create {}/{}@{}".format(pool, volume, name))
    if retcode:
        return (
            False,
            'ERROR: Failed to create RBD snapshot "{}" of volume "{}" in pool "{}": {}'
            .format(name, volume, pool, stderr),
        )

    # 2. Add the snapshot to Zookeeper
    zkhandler.write([
        (("snapshot", f"{pool}/{volume}/{name}"), ""),
        (("snapshot.stats", f"{pool}/{volume}/{name}"), "{}"),
    ])

    # 3. Update the count of snapshots on this volume
    volume_stats_raw = zkhandler.read(("volume.stats", f"{pool}/{volume}"))
    volume_stats = dict(json.loads(volume_stats_raw))
    # Increment the snapshot count
    volume_stats["snapshot_count"] = volume_stats["snapshot_count"] + 1
    volume_stats_raw = json.dumps(volume_stats)
    zkhandler.write([
        (("volume.stats", f"{pool}/{volume}"), volume_stats_raw),
    ])

    return True, 'Created RBD snapshot "{}" of volume "{}" in pool "{}".'.format(
        name, volume, pool)
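
The snapshot counter lives inside a JSON blob stored under the volume's stats key, so the update is a read-modify-write of that document. A minimal sketch of the same round-trip, with a plain dict standing in for the Zookeeper key store (the real code goes through zkhandler.read and zkhandler.write):

import json

# A plain dict stands in for the Zookeeper store in this sketch.
fake_store = {"volume.stats": json.dumps({"size": 8589934592, "snapshot_count": 0})}

def bump_snapshot_count(store, key, delta):
    # Read the raw JSON, adjust the counter, and write the blob back.
    stats = json.loads(store[key])
    stats["snapshot_count"] += delta
    store[key] = json.dumps(stats)

bump_snapshot_count(fake_store, "volume.stats", +1)
assert json.loads(fake_store["volume.stats"])["snapshot_count"] == 1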
Example #11
def remove_snapshot(zkhandler, pool, volume, name):
    if not verifyVolume(zkhandler, pool, volume):
        return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
            volume, pool)
    if not verifySnapshot(zkhandler, pool, volume, name):
        return (
            False,
            'ERROR: No snapshot with name "{}" is present of volume {} in pool {}.'
            .format(name, volume, pool),
        )

    # 1. Remove the snapshot
    retcode, stdout, stderr = common.run_os_command(
        "rbd snap rm {}/{}@{}".format(pool, volume, name))
    if retcode:
        return (
            False,
            'Failed to remove RBD snapshot "{}" of volume "{}" in pool "{}": {}'
            .format(name, volume, pool, stderr),
        )

    # 2. Delete snapshot from Zookeeper
    zkhandler.delete([("snapshot", f"{pool}/{volume}/{name}")])

    # 3. Update the count of snapshots on this volume
    volume_stats_raw = zkhandler.read(("volume.stats", f"{pool}/{volume}"))
    volume_stats = dict(json.loads(volume_stats_raw))
    # Decrement the snapshot count
    volume_stats["snapshot_count"] = volume_stats["snapshot_count"] - 1
    volume_stats_raw = json.dumps(volume_stats)
    zkhandler.write([(("volume.stats", f"{pool}/{volume}"), volume_stats_raw)])

    return True, 'Removed RBD snapshot "{}" of volume "{}" in pool "{}".'.format(
        name, volume, pool)
Example #12
def verify_ipmi(ipmi_hostname, ipmi_user, ipmi_password):
    ipmi_command = f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
    retcode, stdout, stderr = common.run_os_command(ipmi_command, timeout=2)
    return retcode == 0 and stdout.strip() == "Chassis Power is on"
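
run_os_command is a project wrapper around command execution with a timeout; the same power check can be sketched with the standard library alone. It assumes ipmitool is installed, and the host and credentials passed in are placeholders:

import subprocess

# Sketch of the same IPMI power check using subprocess directly.
def verify_ipmi_sketch(ipmi_hostname, ipmi_user, ipmi_password, timeout=2):
    cmd = [
        "/usr/bin/ipmitool", "-I", "lanplus",
        "-H", ipmi_hostname, "-U", ipmi_user, "-P", ipmi_password,
        "chassis", "power", "status",
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return False
    return result.returncode == 0 and result.stdout.strip() == "Chassis Power is on"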
Example #13
def remove_pool(zkhandler, name):
    if not verifyPool(zkhandler, name):
        return False, 'ERROR: No pool with name "{}" is present in the cluster.'.format(
            name)

    # 1. Remove pool volumes
    for volume in zkhandler.children(("volume", name)):
        remove_volume(zkhandler, name, volume)

    # 2. Remove the pool
    retcode, stdout, stderr = common.run_os_command(
        "ceph osd pool rm {pool} {pool} --yes-i-really-really-mean-it".format(
            pool=name))
    if retcode:
        return False, 'ERROR: Failed to remove pool "{}": {}'.format(
            name, stderr)

    # 3. Delete pool from Zookeeper
    zkhandler.delete([
        ("pool", name),
        ("volume", name),
        ("snapshot", name),
    ])

    return True, 'Removed RBD pool "{}" and all volumes.'.format(name)
Example #14
def rename_snapshot(zkhandler, pool, volume, name, new_name):
    if not verifyVolume(zkhandler, pool, volume):
        return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
            volume, pool)
    if not verifySnapshot(zkhandler, pool, volume, name):
        return (
            False,
            'ERROR: No snapshot with name "{}" is present for volume "{}" in pool "{}".'
            .format(name, volume, pool),
        )

    # 1. Rename the snapshot
    retcode, stdout, stderr = common.run_os_command(
        "rbd snap rename {pool}/{volume}@{name} {pool}/{volume}@{new_name}".
        format(pool=pool, volume=volume, name=name, new_name=new_name))
    if retcode:
        return (
            False,
            'ERROR: Failed to rename RBD snapshot "{}" to "{}" for volume "{}" in pool "{}": {}'
            .format(name, new_name, volume, pool, stderr),
        )

    # 2. Rename the snapshot in ZK
    zkhandler.rename([
        (
            ("snapshot", f"{pool}/{volume}/{name}"),
            ("snapshot", f"{pool}/{volume}/{new_name}"),
        ),
    ])

    return (
        True,
        'Renamed RBD snapshot "{}" to "{}" for volume "{}" in pool "{}".'.
        format(name, new_name, volume, pool),
    )
Example #15
def setup_sriov(logger, config):
    logger.out("Setting up SR-IOV device support", state="i")

    # Enable unsafe interrupts for the vfio_iommu_type1 kernel module
    try:
        common.run_os_command(
            "modprobe vfio_iommu_type1 allow_unsafe_interrupts=1")
        with open(
                "/sys/module/vfio_iommu_type1/parameters/allow_unsafe_interrupts",
                "w") as mfh:
            mfh.write("Y")
    except Exception:
        logger.out(
            "Failed to enable vfio_iommu_type1 kernel module; SR-IOV may fail",
            state="w",
        )

    # Loop through our SR-IOV NICs and enable the numvfs for each
    for device in config["sriov_device"]:
        logger.out(
            f'Preparing SR-IOV PF {device["phy"]} with {device["vfcount"]} VFs',
            state="i",
        )
        try:
            with open(f'/sys/class/net/{device["phy"]}/device/sriov_numvfs',
                      "r") as vfh:
                current_vf_count = vfh.read().strip()
            with open(f'/sys/class/net/{device["phy"]}/device/sriov_numvfs',
                      "w") as vfh:
                vfh.write(str(device["vfcount"]))
        except FileNotFoundError:
            logger.out(
                f'Failed to open SR-IOV configuration for PF {device["phy"]}; device may not support SR-IOV',
                state="w",
            )
        except OSError:
            logger.out(
                f'Failed to set SR-IOV VF count for PF {device["phy"]} to {device["vfcount"]}; already set to {current_vf_count}',
                state="w",
            )

        if device.get("mtu", None) is not None:
            logger.out(
                f'Setting SR-IOV PF {device["phy"]} to MTU {device["mtu"]}',
                state="i")
            common.run_os_command(
                f'ip link set {device["phy"]} mtu {device["mtu"]} up')
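
The loop above implies that each entry of config["sriov_device"] carries at least a "phy" interface name, a "vfcount", and an optional "mtu". A hypothetical configuration fragment matching that shape (interface names and counts are illustrative only, not taken from the project documentation):

# Hypothetical config fragment matching the keys read by setup_sriov.
config = {
    "sriov_device": [
        {"phy": "ens1f0", "vfcount": 8, "mtu": 9000},
        {"phy": "ens1f1", "vfcount": 4},  # "mtu" is optional
    ]
}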
Example #16
def unset_osd(zkhandler, option):
    retcode, stdout, stderr = common.run_os_command(
        "ceph osd unset {}".format(option))
    if retcode:
        return False, 'ERROR: Failed to unset property "{}": {}'.format(
            option, stderr)

    return True, 'Unset OSD property "{}".'.format(option)
Example #17
        def watch_vf_trust(data, stat, event=""):
            if event and event.type == "DELETED":
                # The key has been deleted after existing before; terminate this watcher
                # because this class instance is about to be reaped in Daemon.py
                return False

            try:
                data = data.decode("ascii")
            except AttributeError:
                data = "off"

            if data != self.trust:
                self.trust = data
                self.logger.out(
                    "Setting trust mode {}".format(boolToOnOff(self.trust)),
                    state="i",
                    prefix="SR-IOV VF {}".format(self.vf),
                )
                common.run_os_command("ip link set {} vf {} trust {}".format(
                    self.pf, self.vfid, boolToOnOff(self.trust)))
Example #18
        def watch_vf_vlan_id(data, stat, event=""):
            if event and event.type == "DELETED":
                # The key has been deleted after existing before; terminate this watcher
                # because this class instance is about to be reaped in Daemon.py
                return False

            try:
                data = data.decode("ascii")
            except AttributeError:
                data = "0"

            if data != self.vlan_id:
                self.vlan_id = data
                self.logger.out(
                    "Setting vLAN ID to {}".format(self.vlan_id),
                    state="i",
                    prefix="SR-IOV VF {}".format(self.vf),
                )
                common.run_os_command(
                    "ip link set {} vf {} vlan {} qos {}".format(
                        self.pf, self.vfid, self.vlan_id, self.vlan_qos))
Example #19
def out_osd(zkhandler, osd_id):
    if not verifyOSD(zkhandler, osd_id):
        return False, 'ERROR: No OSD with ID "{}" is present in the cluster.'.format(
            osd_id)

    retcode, stdout, stderr = common.run_os_command(
        "ceph osd out {}".format(osd_id))
    if retcode:
        return False, "ERROR: Failed to disable OSD {}: {}".format(
            osd_id, stderr)

    return True, "Set OSD {} offline.".format(osd_id)
Example #20
def remove_volume(zkhandler, pool, name):
    if not verifyVolume(zkhandler, pool, name):
        return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
            name, pool)

    # 1. Remove volume snapshots
    for snapshot in zkhandler.children(("snapshot", f"{pool}/{name}")):
        remove_snapshot(zkhandler, pool, name, snapshot)

    # 2. Remove the volume
    retcode, stdout, stderr = common.run_os_command("rbd rm {}/{}".format(
        pool, name))
    if retcode:
        return False, 'ERROR: Failed to remove RBD volume "{}" in pool "{}": {}'.format(
            name, pool, stderr)

    # 3. Delete volume from Zookeeper
    zkhandler.delete([
        ("volume", f"{pool}/{name}"),
        ("snapshot", f"{pool}/{name}"),
    ])

    return True, 'Removed RBD volume "{}" in pool "{}".'.format(name, pool)
Example #21
def unmap_volume(zkhandler, pool, name):
    if not verifyVolume(zkhandler, pool, name):
        return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
            name, pool)

    mapped_volume = "/dev/rbd/{}/{}".format(pool, name)

    # 1. Ensure the volume exists
    if not os.path.exists(mapped_volume):
        return (
            False,
            'ERROR: Mapped volume not found at expected location "{}".'.format(
                mapped_volume),
        )

    # 2. Unmap the volume
    retcode, stdout, stderr = common.run_os_command(
        "rbd unmap {}".format(mapped_volume))
    if retcode:
        return False, 'ERROR: Failed to unmap RBD volume at "{}": {}'.format(
            mapped_volume, stderr)

    return True, 'Unmapped RBD volume at "{}".'.format(mapped_volume)
Example #22
def map_volume(zkhandler, pool, name):
    if not verifyVolume(zkhandler, pool, name):
        return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
            name, pool)

    # 1. Map the volume onto the local system
    retcode, stdout, stderr = common.run_os_command("rbd map {}/{}".format(
        pool, name))
    if retcode:
        return False, 'ERROR: Failed to map RBD volume "{}" in pool "{}": {}'.format(
            name, pool, stderr)

    # 2. Calculate the absolute path to the mapped volume
    mapped_volume = "/dev/rbd/{}/{}".format(pool, name)

    # 3. Ensure the volume exists
    if not os.path.exists(mapped_volume):
        return (
            False,
            'ERROR: Mapped volume not found at expected location "{}".'.format(
                mapped_volume),
        )

    return True, mapped_volume
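
map_volume and unmap_volume follow the module's (flag, result) return convention: the second element is either the mapped device path or an error string. A hedged usage sketch built on that convention (the zkhandler setup is project-specific and assumed to exist already):

# Usage sketch of the (flag, result) convention; zkhandler is assumed to exist.
def with_mapped_volume(zkhandler, pool, name, callback):
    ok, result = map_volume(zkhandler, pool, name)
    if not ok:
        raise RuntimeError(result)  # result is the error message on failure
    try:
        return callback(result)     # result is the /dev/rbd/<pool>/<name> path
    finally:
        unmap_volume(zkhandler, pool, name)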
Example #23
def ceph_volume_upload(zkhandler, pool, volume, img_type):
    """
    Upload a raw file via HTTP post to a PVC Ceph volume
    """
    # Determine the image conversion options
    if img_type not in ["raw", "vmdk", "qcow2", "qed", "vdi", "vpc"]:
        output = {"message": "Image type '{}' is not valid.".format(img_type)}
        retcode = 400
        return output, retcode

    # Get the size of the target block device
    retcode, retdata = pvc_ceph.get_list_volume(zkhandler,
                                                pool,
                                                volume,
                                                is_fuzzy=False)
    # If there's no target, return failure
    if not retcode or len(retdata) < 1:
        output = {
            "message":
            "Target volume '{}' does not exist in pool '{}'.".format(
                volume, pool)
        }
        retcode = 400
        return output, retcode
    dev_size = retdata[0]["stats"]["size"]

    def cleanup_maps_and_volumes():
        # Unmap the target blockdev
        retflag, retdata = pvc_ceph.unmap_volume(zkhandler, pool, volume)
        # Unmap the temporary blockdev
        retflag, retdata = pvc_ceph.unmap_volume(zkhandler, pool,
                                                 "{}_tmp".format(volume))
        # Remove the temporary blockdev
        retflag, retdata = pvc_ceph.remove_volume(zkhandler, pool,
                                                  "{}_tmp".format(volume))

    # Raw images are written directly to the target blockdev
    if img_type == "raw":
        # Map the target blockdev
        retflag, retdata = pvc_ceph.map_volume(zkhandler, pool, volume)
        if not retflag:
            output = {"message": retdata.replace('"', "'")}
            retcode = 400
            cleanup_maps_and_volumes()
            return output, retcode
        dest_blockdev = retdata

        # Save the data to the blockdev directly
        try:
            # This sets up a custom stream_factory that writes directly into the
            # destination blockdev, rather than the standard stream_factory which
            # writes to a temporary file waiting on a save() call. This will break
            # if the API ever uploads multiple files, but this is an acceptable
            # workaround.
            def image_stream_factory(total_content_length,
                                     filename,
                                     content_type,
                                     content_length=None):
                return open(dest_blockdev, "wb")

            parse_form_data(flask.request.environ,
                            stream_factory=image_stream_factory)
        except Exception:
            output = {
                "message":
                "Failed to upload or write image file to temporary volume."
            }
            retcode = 400
            cleanup_maps_and_volumes()
            return output, retcode

        output = {
            "message":
            "Wrote uploaded file to volume '{}' in pool '{}'.".format(
                volume, pool)
        }
        retcode = 200
        cleanup_maps_and_volumes()
        return output, retcode

    # Non-raw images are written to a temporary blockdev, then converted to raw
    else:
        # Create a temporary blockdev
        retflag, retdata = pvc_ceph.add_volume(zkhandler, pool,
                                               "{}_tmp".format(volume),
                                               dev_size)
        if not retflag:
            output = {"message": retdata.replace('"', "'")}
            retcode = 400
            cleanup_maps_and_volumes()
            return output, retcode

        # Map the temporary target blockdev
        retflag, retdata = pvc_ceph.map_volume(zkhandler, pool,
                                               "{}_tmp".format(volume))
        if not retflag:
            output = {"message": retdata.replace('"', "'")}
            retcode = 400
            cleanup_maps_and_volumes()
            return output, retcode
        temp_blockdev = retdata

        # Map the target blockdev
        retflag, retdata = pvc_ceph.map_volume(zkhandler, pool, volume)
        if not retflag:
            output = {"message": retdata.replace('"', "'")}
            retcode = 400
            cleanup_maps_and_volumes()
            return output, retcode
        dest_blockdev = retdata

        # Save the data to the temporary blockdev directly
        try:
            # This sets up a custom stream_factory that writes directly into the
            # temporary blockdev, rather than the standard stream_factory which
            # writes to a temporary file waiting on a save() call. This will break
            # if the API ever uploads multiple files, but this is an acceptable
            # workaround.
            def image_stream_factory(total_content_length,
                                     filename,
                                     content_type,
                                     content_length=None):
                return open(temp_blockdev, "wb")

            parse_form_data(flask.request.environ,
                            stream_factory=image_stream_factory)
        except Exception:
            output = {
                "message":
                "Failed to upload or write image file to temporary volume."
            }
            retcode = 400
            cleanup_maps_and_volumes()
            return output, retcode

        # Convert from the temporary to destination format on the blockdevs
        retcode, stdout, stderr = pvc_common.run_os_command(
            "qemu-img convert -C -f {} -O raw {} {}".format(
                img_type, temp_blockdev, dest_blockdev))
        if retcode:
            output = {
                "message":
                "Failed to convert image format from '{}' to 'raw': {}".format(
                    img_type, stderr)
            }
            retcode = 400
            cleanup_maps_and_volumes()
            return output, retcode

        output = {
            "message":
            "Converted and wrote uploaded file to volume '{}' in pool '{}'.".
            format(volume, pool)
        }
        retcode = 200
        cleanup_maps_and_volumes()
        return output, retcode
Example #24
def setup_interfaces(logger, config):
    # Set up the Cluster interface
    cluster_dev = config["cluster_dev"]
    cluster_mtu = config["cluster_mtu"]
    cluster_dev_ip = config["cluster_dev_ip"]

    logger.out(
        f"Setting up Cluster network interface {cluster_dev} with MTU {cluster_mtu}",
        state="i",
    )

    common.run_os_command(f"ip link set {cluster_dev} mtu {cluster_mtu} up")

    logger.out(
        f"Setting up Cluster network bridge on interface {cluster_dev} with IP {cluster_dev_ip}",
        state="i",
    )

    common.run_os_command("brctl addbr brcluster")
    common.run_os_command(f"brctl addif brcluster {cluster_dev}")
    common.run_os_command(f"ip link set brcluster mtu {cluster_mtu} up")
    common.run_os_command(f"ip address add {cluster_dev_ip} dev brcluster")

    # Set up the Storage interface
    storage_dev = config["storage_dev"]
    storage_mtu = config["storage_mtu"]
    storage_dev_ip = config["storage_dev_ip"]

    logger.out(
        f"Setting up Storage network interface {storage_dev} with MTU {storage_mtu}",
        state="i",
    )

    common.run_os_command(f"ip link set {storage_dev} mtu {storage_mtu} up")

    if storage_dev == cluster_dev:
        if storage_dev_ip != cluster_dev_ip:
            logger.out(
                f"Setting up Storage network on Cluster network bridge with IP {storage_dev_ip}",
                state="i",
            )

            common.run_os_command(
                f"ip address add {storage_dev_ip} dev brcluster")
    else:
        logger.out(
            f"Setting up Storage network bridge on interface {storage_dev} with IP {storage_dev_ip}",
            state="i",
        )

        common.run_os_command("brctl addbr brstorage")
        common.run_os_command(f"brctl addif brstorage {storage_dev}")
        common.run_os_command(f"ip link set brstorage mtu {storage_mtu} up")
        common.run_os_command(f"ip address add {storage_dev_ip} dev brstorage")

    # Set up the Upstream interface
    upstream_dev = config["upstream_dev"]
    upstream_mtu = config["upstream_mtu"]
    upstream_dev_ip = config["upstream_dev_ip"]

    logger.out(
        f"Setting up Upstream network interface {upstream_dev} with MTU {upstream_mtu}",
        state="i",
    )

    if upstream_dev == cluster_dev:
        if upstream_dev_ip != cluster_dev_ip:
            logger.out(
                f"Setting up Upstream network on Cluster network bridge with IP {upstream_dev_ip}",
                state="i",
            )

            common.run_os_command(
                f"ip address add {upstream_dev_ip} dev brcluster")
    else:
        logger.out(
            f"Setting up Upstream network bridge on interface {upstream_dev} with IP {upstream_dev_ip}",
            state="i",
        )

        common.run_os_command("brctl addbr brupstream")
        common.run_os_command(f"brctl addif brupstream {upstream_dev}")
        common.run_os_command(f"ip link set brupstream mtu {upstream_mtu} up")
        common.run_os_command(
            f"ip address add {upstream_dev_ip} dev brupstream")

    upstream_gateway = config["upstream_gateway"]
    if upstream_gateway is not None:
        logger.out(
            f"Setting up Upstream network default gateway IP {upstream_gateway}",
            state="i",
        )
        if upstream_dev == cluster_dev:
            common.run_os_command(
                f"ip route add default via {upstream_gateway} dev brcluster")
        else:
            common.run_os_command(
                f"ip route add default via {upstream_gateway} dev brupstream")

    # Set up sysctl tweaks to optimize networking
    # Enable routing functions
    common.run_os_command("sysctl net.ipv4.ip_forward=1")
    common.run_os_command("sysctl net.ipv6.ip_forward=1")
    # Enable send redirects
    common.run_os_command("sysctl net.ipv4.conf.all.send_redirects=1")
    common.run_os_command("sysctl net.ipv4.conf.default.send_redirects=1")
    common.run_os_command("sysctl net.ipv6.conf.all.send_redirects=1")
    common.run_os_command("sysctl net.ipv6.conf.default.send_redirects=1")
    # Accept source routes
    common.run_os_command("sysctl net.ipv4.conf.all.accept_source_route=1")
    common.run_os_command("sysctl net.ipv4.conf.default.accept_source_route=1")
    common.run_os_command("sysctl net.ipv6.conf.all.accept_source_route=1")
    common.run_os_command("sysctl net.ipv6.conf.default.accept_source_route=1")
    # Disable RP filtering on Cluster and Upstream interfaces (to allow traffic pivoting)
    common.run_os_command(f"sysctl net.ipv4.conf.{cluster_dev}.rp_filter=0")
    common.run_os_command("sysctl net.ipv4.conf.brcluster.rp_filter=0")
    common.run_os_command(f"sysctl net.ipv4.conf.{upstream_dev}.rp_filter=0")
    common.run_os_command("sysctl net.ipv4.conf.brupstream.rp_filter=0")
    common.run_os_command(f"sysctl net.ipv6.conf.{cluster_dev}.rp_filter=0")
    common.run_os_command("sysctl net.ipv6.conf.brcluster.rp_filter=0")
    common.run_os_command(f"sysctl net.ipv6.conf.{upstream_dev}.rp_filter=0")
    common.run_os_command("sysctl net.ipv6.conf.brupstream.rp_filter=0")

    # Stop DNSMasq if it is running
    common.run_os_command("systemctl stop dnsmasq.service")

    logger.out("Waiting 3 seconds for networking to come up", state="s")
    sleep(3)
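
The sysctl block above issues one run_os_command call per key. The same batch can be expressed as a loop over key/value pairs; this sketch uses subprocess directly rather than the project's common.run_os_command wrapper, and lists only a few of the keys shown above:

import subprocess

# Sketch: apply a batch of sysctl settings in a loop.
def apply_sysctls(settings):
    for key, value in settings:
        subprocess.run(["sysctl", "{}={}".format(key, value)], check=False)

apply_sysctls([
    ("net.ipv4.ip_forward", 1),
    ("net.ipv4.conf.all.send_redirects", 1),
    ("net.ipv4.conf.brcluster.rp_filter", 0),
])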
Example #25
    def become_secondary(self):
        """
        Relinquish primary coordinator status to a peer node
        """
        time.sleep(0.2)  # Initial delay for the first writer to grab the lock

        # Synchronize nodes A (I am reader)
        lock = self.zkhandler.readlock("base.config.primary_node.sync_lock")
        self.logger.out("Acquiring read lock for synchronization phase A", state="i")
        lock.acquire()
        self.logger.out("Acquired read lock for synchronization phase A", state="o")
        self.logger.out("Releasing read lock for synchronization phase A", state="i")
        lock.release()
        self.logger.out("Released read lock for synchronization phase A", state="o")

        # Synchronize nodes B (I am writer)
        lock = self.zkhandler.writelock("base.config.primary_node.sync_lock")
        self.logger.out("Acquiring write lock for synchronization phase B", state="i")
        lock.acquire()
        self.logger.out("Acquired write lock for synchronization phase B", state="o")
        time.sleep(0.2)  # Time for reader to acquire the lock
        # 1. Stop DNS aggregator
        self.dns_aggregator.stop_aggregator()
        # 2. Stop DHCP servers
        for network in self.d_network:
            self.d_network[network].stopDHCPServer()
        self.logger.out("Releasing write lock for synchronization phase B", state="i")
        self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
        lock.release()
        self.logger.out("Released write lock for synchronization phase B", state="o")
        # 3. Stop client API
        if self.config["enable_api"]:
            self.logger.out("Stopping PVC API client service", state="i")
            common.run_os_command("systemctl stop pvcapid.service")
            common.run_os_command("systemctl disable pvcapid.service")
        # 4. Stop metadata API
        self.metadata_api.stop()
        time.sleep(0.1)  # Time for new writer to acquire the lock

        # Synchronize nodes C (I am reader)
        lock = self.zkhandler.readlock("base.config.primary_node.sync_lock")
        self.logger.out("Acquiring read lock for synchronization phase C", state="i")
        lock.acquire()
        self.logger.out("Acquired read lock for synchronization phase C", state="o")
        # 5. Remove Upstream floating IP
        self.logger.out(
            "Removing floating upstream IP {}/{} from interface {}".format(
                self.upstream_floatingipaddr, self.upstream_cidrnetmask, "brupstream"
            ),
            state="o",
        )
        common.removeIPAddress(
            self.upstream_floatingipaddr, self.upstream_cidrnetmask, "brupstream"
        )
        self.logger.out("Releasing read lock for synchronization phase C", state="i")
        lock.release()
        self.logger.out("Released read lock for synchronization phase C", state="o")

        # Synchronize nodes D (I am reader)
        lock = self.zkhandler.readlock("base.config.primary_node.sync_lock")
        self.logger.out("Acquiring read lock for synchronization phase D", state="i")
        lock.acquire()
        self.logger.out("Acquired read lock for synchronization phase D", state="o")
        # 6. Remove Cluster & Storage floating IP
        self.logger.out(
            "Removing floating management IP {}/{} from interface {}".format(
                self.cluster_floatingipaddr, self.cluster_cidrnetmask, "brcluster"
            ),
            state="o",
        )
        common.removeIPAddress(
            self.cluster_floatingipaddr, self.cluster_cidrnetmask, "brcluster"
        )
        self.logger.out(
            "Removing floating storage IP {}/{} from interface {}".format(
                self.storage_floatingipaddr, self.storage_cidrnetmask, "brstorage"
            ),
            state="o",
        )
        common.removeIPAddress(
            self.storage_floatingipaddr, self.storage_cidrnetmask, "brstorage"
        )
        self.logger.out("Releasing read lock for synchronization phase D", state="i")
        lock.release()
        self.logger.out("Released read lock for synchronization phase D", state="o")

        # Synchronize nodes E (I am reader)
        lock = self.zkhandler.readlock("base.config.primary_node.sync_lock")
        self.logger.out("Acquiring read lock for synchronization phase E", state="i")
        lock.acquire()
        self.logger.out("Acquired read lock for synchronization phase E", state="o")
        # 7. Remove Metadata link-local IP
        self.logger.out(
            "Removing Metadata link-local IP {}/{} from interface {}".format(
                "169.254.169.254", "32", "lo"
            ),
            state="o",
        )
        common.removeIPAddress("169.254.169.254", "32", "lo")
        self.logger.out("Releasing read lock for synchronization phase E", state="i")
        lock.release()
        self.logger.out("Released read lock for synchronization phase E", state="o")

        # Synchronize nodes F (I am reader)
        lock = self.zkhandler.readlock("base.config.primary_node.sync_lock")
        self.logger.out("Acquiring read lock for synchronization phase F", state="i")
        lock.acquire()
        self.logger.out("Acquired read lock for synchronization phase F", state="o")
        # 8. Remove gateway IPs
        for network in self.d_network:
            self.d_network[network].removeGateways()
        self.logger.out("Releasing read lock for synchronization phase F", state="i")
        lock.release()
        self.logger.out("Released read lock for synchronization phase F", state="o")

        # Synchronize nodes G (I am reader)
        lock = self.zkhandler.readlock("base.config.primary_node.sync_lock")
        self.logger.out("Acquiring read lock for synchronization phase G", state="i")
        try:
            lock.acquire(timeout=60)  # Don't wait forever and completely block us
            self.logger.out("Acquired read lock for synchronization phase G", state="o")
        except Exception:
            pass
        self.logger.out("Releasing read lock for synchronization phase G", state="i")
        lock.release()
        self.logger.out("Released read lock for synchronization phase G", state="o")

        # Wait 2 seconds for everything to stabilize before we declare all-done
        time.sleep(2)
        self.zkhandler.write([(("node.state.router", self.name), "secondary")])
        self.logger.out(
            "Node {} transitioned to secondary state".format(self.name), state="o"
        )
Example #26
    def become_primary(self):
        """
        Acquire primary coordinator status from a peer node
        """
        # Lock the primary node until transition is complete
        primary_lock = self.zkhandler.exclusivelock("base.config.primary_node")
        primary_lock.acquire()

        # Ensure our lock key is populated
        self.zkhandler.write([("base.config.primary_node.sync_lock", "")])

        # Synchronize nodes A (I am writer)
        lock = self.zkhandler.writelock("base.config.primary_node.sync_lock")
        self.logger.out("Acquiring write lock for synchronization phase A", state="i")
        lock.acquire()
        self.logger.out("Acquired write lock for synchronization phase A", state="o")
        time.sleep(1)  # Time for reader to acquire the lock
        self.logger.out("Releasing write lock for synchronization phase A", state="i")
        self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
        lock.release()
        self.logger.out("Released write lock for synchronization phase A", state="o")
        time.sleep(0.1)  # Time for new writer to acquire the lock

        # Synchronize nodes B (I am reader)
        lock = self.zkhandler.readlock("base.config.primary_node.sync_lock")
        self.logger.out("Acquiring read lock for synchronization phase B", state="i")
        lock.acquire()
        self.logger.out("Acquired read lock for synchronization phase B", state="o")
        self.logger.out("Releasing read lock for synchronization phase B", state="i")
        lock.release()
        self.logger.out("Released read lock for synchronization phase B", state="o")

        # Synchronize nodes C (I am writer)
        lock = self.zkhandler.writelock("base.config.primary_node.sync_lock")
        self.logger.out("Acquiring write lock for synchronization phase C", state="i")
        lock.acquire()
        self.logger.out("Acquired write lock for synchronization phase C", state="o")
        time.sleep(0.5)  # Time for reader to acquire the lock
        # 1. Add Upstream floating IP
        self.logger.out(
            "Creating floating upstream IP {}/{} on interface {}".format(
                self.upstream_floatingipaddr, self.upstream_cidrnetmask, "brupstream"
            ),
            state="o",
        )
        common.createIPAddress(
            self.upstream_floatingipaddr, self.upstream_cidrnetmask, "brupstream"
        )
        self.logger.out("Releasing write lock for synchronization phase C", state="i")
        self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
        lock.release()
        self.logger.out("Released write lock for synchronization phase C", state="o")

        # Synchronize nodes D (I am writer)
        lock = self.zkhandler.writelock("base.config.primary_node.sync_lock")
        self.logger.out("Acquiring write lock for synchronization phase D", state="i")
        lock.acquire()
        self.logger.out("Acquired write lock for synchronization phase D", state="o")
        time.sleep(0.2)  # Time for reader to acquire the lock
        # 2. Add Cluster & Storage floating IP
        self.logger.out(
            "Creating floating management IP {}/{} on interface {}".format(
                self.cluster_floatingipaddr, self.cluster_cidrnetmask, "brcluster"
            ),
            state="o",
        )
        common.createIPAddress(
            self.cluster_floatingipaddr, self.cluster_cidrnetmask, "brcluster"
        )
        self.logger.out(
            "Creating floating storage IP {}/{} on interface {}".format(
                self.storage_floatingipaddr, self.storage_cidrnetmask, "brstorage"
            ),
            state="o",
        )
        common.createIPAddress(
            self.storage_floatingipaddr, self.storage_cidrnetmask, "brstorage"
        )
        self.logger.out("Releasing write lock for synchronization phase D", state="i")
        self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
        lock.release()
        self.logger.out("Released write lock for synchronization phase D", state="o")

        # Synchronize nodes E (I am writer)
        lock = self.zkhandler.writelock("base.config.primary_node.sync_lock")
        self.logger.out("Acquiring write lock for synchronization phase E", state="i")
        lock.acquire()
        self.logger.out("Acquired write lock for synchronization phase E", state="o")
        time.sleep(0.2)  # Time for reader to acquire the lock
        # 3. Add Metadata link-local IP
        self.logger.out(
            "Creating Metadata link-local IP {}/{} on interface {}".format(
                "169.254.169.254", "32", "lo"
            ),
            state="o",
        )
        common.createIPAddress("169.254.169.254", "32", "lo")
        self.logger.out("Releasing write lock for synchronization phase E", state="i")
        self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
        lock.release()
        self.logger.out("Released write lock for synchronization phase E", state="o")

        # Synchronize nodes F (I am writer)
        lock = self.zkhandler.writelock("base.config.primary_node.sync_lock")
        self.logger.out("Acquiring write lock for synchronization phase F", state="i")
        lock.acquire()
        self.logger.out("Acquired write lock for synchronization phase F", state="o")
        time.sleep(0.2)  # Time for reader to acquire the lock
        # 4. Add gateway IPs
        for network in self.d_network:
            self.d_network[network].createGateways()
        self.logger.out("Releasing write lock for synchronization phase F", state="i")
        self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
        lock.release()
        self.logger.out("Released write lock for synchronization phase F", state="o")

        # Synchronize nodes G (I am writer)
        lock = self.zkhandler.writelock("base.config.primary_node.sync_lock")
        self.logger.out("Acquiring write lock for synchronization phase G", state="i")
        lock.acquire()
        self.logger.out("Acquired write lock for synchronization phase G", state="o")
        time.sleep(0.2)  # Time for reader to acquire the lock
        # 5. Transition Patroni primary
        self.logger.out("Setting Patroni leader to this node", state="i")
        tick = 1
        patroni_failed = True
        # As long as we're in takeover, keep trying to set the Patroni leader to us
        while self.router_state == "takeover":
            # Switch Patroni leader to the local instance
            retcode, stdout, stderr = common.run_os_command(
                """
                patronictl
                    -c /etc/patroni/config.yml
                    switchover
                    --candidate {}
                    --force
                    pvc
                """.format(
                    self.name
                )
            )

            # Combine the stdout and stderr and strip the output
            # Patronictl's output is pretty junky
            if stderr:
                stdout += stderr
            stdout = stdout.strip()

            # Handle our current Patroni leader being us
            if stdout and stdout.split("\n")[-1].split() == [
                "Error:",
                "Switchover",
                "target",
                "and",
                "source",
                "are",
                "the",
                "same.",
            ]:
                self.logger.out(
                    "Failed to switch Patroni leader to ourselves; this is fine\n{}".format(
                        stdout
                    ),
                    state="w",
                )
                patroni_failed = False
                break
            # Handle a failed switchover
            elif stdout and (
                stdout.split("\n")[-1].split()[:2] == ["Switchover", "failed,"]
                or stdout.strip().split("\n")[-1].split()[:1] == ["Error"]
            ):
                if tick > 4:
                    self.logger.out(
                        "Failed to switch Patroni leader after 5 tries; aborting",
                        state="e",
                    )
                    break
                else:
                    self.logger.out(
                        "Failed to switch Patroni leader; retrying [{}/5]\n{}\n".format(
                            tick, stdout
                        ),
                        state="e",
                    )
                    tick += 1
                    time.sleep(5)
            # Otherwise, we succeeded
            else:
                self.logger.out(
                    "Successfully switched Patroni leader\n{}".format(stdout), state="o"
                )
                patroni_failed = False
                time.sleep(0.2)
                break
        # 6. Start client API (and provisioner worker)
        if self.config["enable_api"]:
            self.logger.out("Starting PVC API client service", state="i")
            common.run_os_command("systemctl enable pvcapid.service")
            common.run_os_command("systemctl start pvcapid.service")
            self.logger.out("Starting PVC Provisioner Worker service", state="i")
            common.run_os_command("systemctl start pvcapid-worker.service")
        # 7. Start metadata API; just continue if we fail
        self.metadata_api.start()
        # 8. Start DHCP servers
        for network in self.d_network:
            self.d_network[network].startDHCPServer()
        # 9. Start DNS aggregator; just continue if we fail
        if not patroni_failed:
            self.dns_aggregator.start_aggregator()
        else:
            self.logger.out(
                "Not starting DNS aggregator due to Patroni failures", state="e"
            )
        self.logger.out("Releasing write lock for synchronization phase G", state="i")
        self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
        lock.release()
        self.logger.out("Released write lock for synchronization phase G", state="o")

        # Wait 2 seconds for everything to stabilize before we declare all-done
        time.sleep(2)
        primary_lock.release()
        self.zkhandler.write([(("node.state.router", self.name), "primary")])
        self.logger.out(
            "Node {} transitioned to primary state".format(self.name), state="o"
        )
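
The phase A through G handoff in become_primary and become_secondary is built on Zookeeper read and write locks exposed through the project's zkhandler wrapper, which in turn uses kazoo's lock recipes. Below is a minimal, standalone sketch of a single writer/reader phase using kazoo directly; the connection string, lock path, and identifiers are placeholders, and this only illustrates the lock choreography, not the project API:

from kazoo.client import KazooClient
from kazoo.recipe.lock import ReadLock, WriteLock

zk = KazooClient(hosts="127.0.0.1:2181")  # placeholder connection string
zk.start()

sync_path = "/config/primary_node/sync_lock"  # placeholder lock path

# Writer side (the node taking over): hold the write lock while doing this phase's work.
writer = WriteLock(zk, sync_path, "new-primary")
writer.acquire()
# ... perform this phase's work, e.g. create the floating IPs ...
writer.release()

# Reader side (the node stepping down): blocks until the writer releases, then
# performs the matching teardown for the same phase.
reader = ReadLock(zk, sync_path, "old-primary")
reader.acquire()
# ... perform the matching teardown ...
reader.release()

zk.stop()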
Example #27
def entrypoint():
    keepalive_timer = None

    # Get our configuration
    config = pvcnoded.util.config.get_configuration()
    config["pvcnoded_version"] = version

    # Set some useful booleans for later (fewer characters)
    debug = config["debug"]
    if debug:
        print("DEBUG MODE ENABLED")

    # Create and validate our directories
    pvcnoded.util.config.validate_directories(config)

    # Set up the logger instance
    logger = log.Logger(config)

    # Print our startup message
    logger.out("")
    logger.out("|----------------------------------------------------------|")
    logger.out("|                                                          |")
    logger.out("|           ███████████ ▜█▙      ▟█▛ █████ █ █ █           |")
    logger.out("|                    ██  ▜█▙    ▟█▛  ██                    |")
    logger.out("|           ███████████   ▜█▙  ▟█▛   ██                    |")
    logger.out("|           ██             ▜█▙▟█▛    ███████████           |")
    logger.out("|                                                          |")
    logger.out("|----------------------------------------------------------|")
    logger.out(
        "| Parallel Virtual Cluster node daemon v{0: <18} |".format(version))
    logger.out("| Debug: {0: <49} |".format(str(config["debug"])))
    logger.out("| FQDN: {0: <50} |".format(config["node_fqdn"]))
    logger.out("| Host: {0: <50} |".format(config["node_hostname"]))
    logger.out("| ID: {0: <52} |".format(config["node_id"]))
    logger.out("| IPMI hostname: {0: <41} |".format(config["ipmi_hostname"]))
    logger.out("| Machine details:                                         |")
    logger.out("|   CPUs: {0: <48} |".format(config["static_data"][0]))
    logger.out("|   Arch: {0: <48} |".format(config["static_data"][3]))
    logger.out("|   OS: {0: <50} |".format(config["static_data"][2]))
    logger.out("|   Kernel: {0: <46} |".format(config["static_data"][1]))
    logger.out("|----------------------------------------------------------|")
    logger.out("")
    logger.out(f'Starting pvcnoded on host {config["node_fqdn"]}', state="s")

    if config["enable_networking"]:
        if config["enable_sriov"]:
            # Set up SR-IOV devices
            pvcnoded.util.networking.setup_sriov(logger, config)

        # Set up our interfaces
        pvcnoded.util.networking.setup_interfaces(logger, config)

    # Get list of coordinator nodes
    coordinator_nodes = config["coordinators"]

    if config["node_hostname"] in coordinator_nodes:
        # We are indeed a coordinator node
        config["daemon_mode"] = "coordinator"
        logger.out(
            f"This node is a {logger.fmt_blue}coordinator{logger.fmt_end}",
            state="i")
    else:
        # We are a hypervisor node
        config["daemon_mode"] = "hypervisor"
        logger.out(
            f"This node is a {logger.fmt_cyan}hypervisor{logger.fmt_end}",
            state="i")

    pvcnoded.util.services.start_system_services(logger, config)

    # Connect to Zookeeper and return our handler and current schema version
    zkhandler, node_schema_version = pvcnoded.util.zookeeper.connect(
        logger, config)

    # Watch for a global schema update and fire
    # This will only change by the API when triggered after seeing all nodes can update
    @zkhandler.zk_conn.DataWatch(zkhandler.schema.path("base.schema.version"))
    def update_schema(new_schema_version, stat, event=""):
        nonlocal zkhandler, keepalive_timer, node_schema_version

        try:
            new_schema_version = int(new_schema_version.decode("ascii"))
        except Exception:
            new_schema_version = 0

        if new_schema_version == node_schema_version:
            return True

        logger.out("Hot update of schema version started", state="s")
        logger.out(
            f"Current version: {node_schema_version,}  New version: {new_schema_version}",
            state="s",
        )

        # Prevent any keepalive updates while this happens
        if keepalive_timer is not None:
            pvcnoded.util.keepalive.stop_keepalive_timer(
                logger, keepalive_timer)
            sleep(1)

        # Perform the migration (primary only)
        if zkhandler.read(
                "base.config.primary_node") == config["node_hostname"]:
            logger.out("Primary node acquiring exclusive lock", state="s")
            # Wait for things to settle
            sleep(0.5)
            # Acquire a write lock on the root key
            with zkhandler.exclusivelock("base.schema.version"):
                # Perform the schema migration tasks
                logger.out("Performing schema update", state="s")
                if new_schema_version > node_schema_version:
                    zkhandler.schema.migrate(zkhandler, new_schema_version)
                if new_schema_version < node_schema_version:
                    zkhandler.schema.rollback(zkhandler, new_schema_version)
        # Wait for the exclusive lock to be lifted
        else:
            logger.out("Non-primary node acquiring read lock", state="s")
            # Wait for things to settle
            sleep(1)
            # Wait for a read lock
            lock = zkhandler.readlock("base.schema.version")
            lock.acquire()
            # Wait a bit more for the primary to return to normal
            sleep(1)

        # Update the local schema version
        logger.out("Updating node target schema version", state="s")
        zkhandler.write([(("node.data.active_schema", config["node_hostname"]),
                          new_schema_version)])
        node_schema_version = new_schema_version

        # Restart the API daemons if applicable
        logger.out("Restarting services", state="s")
        common.run_os_command("systemctl restart pvcapid-worker.service")
        if zkhandler.read(
                "base.config.primary_node") == config["node_hostname"]:
            common.run_os_command("systemctl restart pvcapid.service")

        # Restart ourselves with the new schema
        logger.out("Reloading node daemon", state="s")
        try:
            zkhandler.disconnect(persistent=True)
            del zkhandler
        except Exception:
            pass
        os.execv(sys.argv[0], sys.argv)

    # Validate the schema
    pvcnoded.util.zookeeper.validate_schema(logger, zkhandler)

    # Define a cleanup function
    def cleanup(failure=False):
        nonlocal logger, zkhandler, keepalive_timer, d_domain

        logger.out("Terminating pvcnoded and cleaning up", state="s")

        # Set shutdown state in Zookeeper
        zkhandler.write([(("node.state.daemon", config["node_hostname"]),
                          "shutdown")])

        # Waiting for any flushes to complete
        logger.out("Waiting for any active flushes", state="s")
        try:
            if this_node is not None:
                while this_node.flush_thread is not None:
                    sleep(0.5)
        except Exception:
            # We really don't care here, just proceed
            pass

        # Stop console logging on all VMs
        logger.out("Stopping domain console watchers", state="s")
        try:
            if d_domain is not None:
                for domain in d_domain:
                    if d_domain[domain].getnode() == config["node_hostname"]:
                        d_domain[domain].console_log_instance.stop()
        except Exception:
            pass

        # Force into secondary coordinator state if needed
        try:
            if this_node.router_state == "primary" and len(d_node) > 1:
                zkhandler.write([("base.config.primary_node", "none")])
                logger.out("Waiting for primary migration", state="s")
                timeout = 240
                count = 0
                while this_node.router_state != "secondary" and count < timeout:
                    sleep(0.5)
                    count += 1
        except Exception:
            pass

        # Stop keepalive thread
        try:
            pvcnoded.util.keepalive.stop_keepalive_timer(
                logger, keepalive_timer)

            logger.out("Performing final keepalive update", state="s")
            pvcnoded.util.keepalive.node_keepalive(logger, config, zkhandler,
                                                   this_node)
        except Exception:
            pass

        # Set stop state in Zookeeper
        zkhandler.write([(("node.state.daemon", config["node_hostname"]),
                          "stop")])

        # Forcibly terminate dnsmasq because it gets stuck sometimes
        common.run_os_command("killall dnsmasq")

        # Close the Zookeeper connection
        try:
            zkhandler.disconnect(persistent=True)
            del zkhandler
        except Exception:
            pass

        logger.out("Terminated pvc daemon", state="s")
        logger.terminate()

        if failure:
            retcode = 1
        else:
            retcode = 0

        os._exit(retcode)

    # Termination function
    def term(signum="", frame=""):
        cleanup(failure=False)

    # Hangup (logrotate) function
    def hup(signum="", frame=""):
        if config["file_logging"]:
            logger.hup()

    # Handle signals gracefully
    signal.signal(signal.SIGTERM, term)
    signal.signal(signal.SIGINT, term)
    signal.signal(signal.SIGQUIT, term)
    signal.signal(signal.SIGHUP, hup)

    # Set up this node in Zookeeper
    pvcnoded.util.zookeeper.setup_node(logger, config, zkhandler)

    # Check that the primary node key exists and create it with us as primary if not
    try:
        current_primary = zkhandler.read("base.config.primary_node")
    except Exception:
        current_primary = "none"

    if current_primary and current_primary != "none":
        logger.out(
            f"Current primary node is {logger.fmt_blue}{current_primary}{logger.fmt_end}",
            state="i",
        )
    else:
        if config["daemon_mode"] == "coordinator":
            logger.out("No primary node found; setting us as primary",
                       state="i")
            zkhandler.write([("base.config.primary_node",
                              config["node_hostname"])])

    # Ensure that IPMI is reachable and working
    if not pvcnoded.util.fencing.verify_ipmi(config["ipmi_hostname"],
                                             config["ipmi_username"],
                                             config["ipmi_password"]):
        logger.out(
            "Our IPMI is not reachable; fencing of this node will likely fail",
            state="w",
        )

    # Validate libvirt
    if not pvcnoded.util.libvirt.validate_libvirtd(logger, config):
        cleanup(failure=True)

    # Set up NFT
    pvcnoded.util.networking.create_nft_configuration(logger, config)

    # Create our object dictionaries
    logger.out("Setting up objects", state="s")

    d_node = dict()
    node_list = list()
    d_network = dict()
    network_list = list()
    sriov_pf_list = list()
    d_sriov_vf = dict()
    sriov_vf_list = list()
    d_domain = dict()
    domain_list = list()
    d_osd = dict()
    osd_list = list()
    d_pool = dict()
    pool_list = list()
    d_volume = dict()
    volume_list = dict()

    if config["enable_networking"] and config["daemon_mode"] == "coordinator":
        # Create an instance of the DNS Aggregator and Metadata API if we're a coordinator
        dns_aggregator = DNSAggregatorInstance.DNSAggregatorInstance(
            config, logger)
        metadata_api = MetadataAPIInstance.MetadataAPIInstance(
            zkhandler, config, logger)
    else:
        dns_aggregator = None
        metadata_api = None

    #
    # Zookeeper watchers for objects
    #

    # Node objects
    @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path("base.node"))
    def set_nodes(new_node_list):
        nonlocal d_node, node_list

        # Add missing nodes to list
        for node in [node for node in new_node_list if node not in node_list]:
            d_node[node] = NodeInstance.NodeInstance(
                node,
                config["node_hostname"],
                zkhandler,
                config,
                logger,
                d_node,
                d_network,
                d_domain,
                dns_aggregator,
                metadata_api,
            )

        # Remove deleted nodes from list
        for node in [node for node in node_list if node not in new_node_list]:
            del d_node[node]

        node_list = new_node_list
        logger.out(
            f'{logger.fmt_blue}Node list:{logger.fmt_end} {" ".join(node_list)}',
            state="i",
        )

        # Update node objects lists
        for node in d_node:
            d_node[node].update_node_list(d_node)

    # Create helpful alias for this node
    this_node = d_node[config["node_hostname"]]

    # Maintenance status
    @zkhandler.zk_conn.DataWatch(
        zkhandler.schema.path("base.config.maintenance"))
    def update_maintenance(_maintenance, stat):
        try:
            maintenance = bool(strtobool(_maintenance.decode("ascii")))
        except Exception:
            maintenance = False

        this_node.maintenance = maintenance

    # Primary node
    @zkhandler.zk_conn.DataWatch(
        zkhandler.schema.path("base.config.primary_node"))
    def update_primary_node(new_primary, stat, event=""):
        try:
            new_primary = new_primary.decode("ascii")
        except AttributeError:
            new_primary = "none"
        key_version = stat.version

        # TODO: Move this to the Node structure
        if new_primary != this_node.primary_node:
            if config["daemon_mode"] == "coordinator":
                # We're a coordinator and there's no primary
                if new_primary == "none":
                    if (this_node.daemon_state == "run"
                            and this_node.router_state
                            not in ["primary", "takeover", "relinquish"]):
                        logger.out("Contending for primary coordinator state",
                                   state="i")
                        # Acquire an exclusive lock on the primary_node key
                        primary_lock = zkhandler.exclusivelock(
                            "base.config.primary_node")
                        try:
                            # This lock times out after 0.4s, which is 0.1s less than the pre-takeover
                            # timeout below. This ensures a primary takeover will not deadlock against
                            # a node which has failed the contention.
                            primary_lock.acquire(timeout=0.4)
                            # Ensure that when we get the lock the versions are still consistent and
                            # that another node hasn't already acquired the primary state (maybe we're
                            # extremely slow to respond)
                            if (key_version == zkhandler.zk_conn.get(
                                    zkhandler.schema.path(
                                        "base.config.primary_node"))[1].version
                                ):
                                # Set the primary to us
                                logger.out(
                                    "Acquiring primary coordinator state",
                                    state="o")
                                zkhandler.write([(
                                    "base.config.primary_node",
                                    config["node_hostname"],
                                )])
                            # Cleanly release the lock
                            primary_lock.release()
                        # We timed out acquiring a lock, or failed to write, which means we failed the
                        # contention and should just log that
                        except Exception:
                            logger.out(
                                "Timed out contending for primary coordinator state",
                                state="i",
                            )
                elif new_primary == config["node_hostname"]:
                    if this_node.router_state == "secondary":
                        # Wait for 0.5s to ensure other contentions time out, then take over
                        sleep(0.5)
                        zkhandler.write([(
                            ("node.state.router", config["node_hostname"]),
                            "takeover",
                        )])
                else:
                    if this_node.router_state == "primary":
                        # Wait for 0.5s to ensure other contentions time out, then relinquish
                        sleep(0.5)
                        zkhandler.write([(
                            ("node.state.router", config["node_hostname"]),
                            "relinquish",
                        )])
            else:
                zkhandler.write([(("node.state.router",
                                   config["node_hostname"]), "client")])

            # TODO: Turn this into a function like the others for clarity
            for node in d_node:
                d_node[node].primary_node = new_primary

    if config["enable_networking"]:
        # Network objects
        @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path("base.network"))
        def update_networks(new_network_list):
            nonlocal network_list, d_network

            # Add any missing networks to the list
            for network in [
                    network for network in new_network_list
                    if network not in network_list
            ]:
                d_network[network] = VXNetworkInstance.VXNetworkInstance(
                    network, zkhandler, config, logger, this_node,
                    dns_aggregator)
                # TODO: Move this to the Network structure
                if (config["daemon_mode"] == "coordinator"
                        and d_network[network].nettype == "managed"):
                    try:
                        dns_aggregator.add_network(d_network[network])
                    except Exception as e:
                        logger.out(
                            f"Failed to create DNS Aggregator for network {network}: {e}",
                            state="w",
                        )
                # Start primary functionality
                if (this_node.router_state == "primary"
                        and d_network[network].nettype == "managed"):
                    d_network[network].createGateways()
                    d_network[network].startDHCPServer()

            # Remove any missing networks from the list
            for network in [
                    network for network in network_list
                    if network not in new_network_list
            ]:
                # TODO: Move this to the Network structure
                if d_network[network].nettype == "managed":
                    # Stop primary functionality
                    if this_node.router_state == "primary":
                        d_network[network].stopDHCPServer()
                        d_network[network].removeGateways()
                        dns_aggregator.remove_network(d_network[network])
                    # Stop firewalling
                    d_network[network].removeFirewall()
                # Delete the network
                d_network[network].removeNetwork()
                del d_network[network]

            # Update the new list
            network_list = new_network_list
            logger.out(
                f'{logger.fmt_blue}Network list:{logger.fmt_end} {" ".join(network_list)}',
                state="i",
            )

            # Update node objects list
            for node in d_node:
                d_node[node].update_network_list(d_network)

        # Add the SR-IOV PFs and VFs to Zookeeper
        # These do not behave like the other objects; they are not dynamic (the API cannot change
        # them), and they exist for the lifetime of this Node instance. The entries are set here in
        # Zookeeper on a per-node basis, under the Node configuration tree.
        # MIGRATION: The schema.schema.get ensures that the current active Schema contains the required keys
        if (config["enable_sriov"]
                and zkhandler.schema.schema.get("sriov_pf", None) is not None):
            vf_list = list()
            for device in config["sriov_device"]:
                pf = device["phy"]
                vfcount = device["vfcount"]
                if device.get("mtu", None) is None:
                    mtu = 1500
                else:
                    mtu = device["mtu"]

                # Create the PF device in Zookeeper
                zkhandler.write([
                    (
                        ("node.sriov.pf", config["node_hostname"], "sriov_pf",
                         pf),
                        "",
                    ),
                    (
                        (
                            "node.sriov.pf",
                            config["node_hostname"],
                            "sriov_pf.mtu",
                            pf,
                        ),
                        mtu,
                    ),
                    (
                        (
                            "node.sriov.pf",
                            config["node_hostname"],
                            "sriov_pf.vfcount",
                            pf,
                        ),
                        vfcount,
                    ),
                ])
                # Append the device to the list of PFs
                sriov_pf_list.append(pf)

                # Get the list of VFs from `ip link show`
                vf_list = json.loads(
                    common.run_os_command(f"ip --json link show {pf}")
                    [1])[0].get("vfinfo_list", [])
                for vf in vf_list:
                    # {
                    #   'vf': 3,
                    #   'link_type': 'ether',
                    #   'address': '00:00:00:00:00:00',
                    #   'broadcast': 'ff:ff:ff:ff:ff:ff',
                    #   'vlan_list': [{'vlan': 101, 'qos': 2}],
                    #   'rate': {'max_tx': 0, 'min_tx': 0},
                    #   'spoofchk': True,
                    #   'link_state': 'auto',
                    #   'trust': False,
                    #   'query_rss_en': False
                    # }
                    vfphy = f'{pf}v{vf["vf"]}'

                    # Get the PCIe bus information
                    dev_pcie_path = None
                    try:
                        with open(f"/sys/class/net/{vfphy}/device/uevent"
                                  ) as vfh:
                            dev_uevent = vfh.readlines()
                        for line in dev_uevent:
                            if re.match(r"^PCI_SLOT_NAME=.*", line):
                                dev_pcie_path = line.rstrip().split("=")[-1]
                    except FileNotFoundError:
                        # Something must already be using the PCIe device
                        pass

                    # Add the VF to Zookeeper if it does not yet exist
                    if not zkhandler.exists(
                        ("node.sriov.vf", config["node_hostname"], "sriov_vf",
                         vfphy)):
                        if dev_pcie_path is not None:
                            pcie_domain, pcie_bus, pcie_slot, pcie_function = re.split(
                                r":|\.", dev_pcie_path)
                        else:
                            # We can't add the device - for some reason we can't get any information on its PCIe bus path,
                            # so just ignore this one, and continue.
                            # This shouldn't happen under any real circumstances, unless the admin tries to attach a non-existent
                            # VF to a VM manually, then goes ahead and adds that VF to the system with the VM running.
                            continue

                        zkhandler.write([
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf",
                                    vfphy,
                                ),
                                "",
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.pf",
                                    vfphy,
                                ),
                                pf,
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.mtu",
                                    vfphy,
                                ),
                                mtu,
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.mac",
                                    vfphy,
                                ),
                                vf["address"],
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.phy_mac",
                                    vfphy,
                                ),
                                vf["address"],
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.config",
                                    vfphy,
                                ),
                                "",
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.config.vlan_id",
                                    vfphy,
                                ),
                                vf["vlan_list"][0].get("vlan", "0"),
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.config.vlan_qos",
                                    vfphy,
                                ),
                                vf["vlan_list"][0].get("qos", "0"),
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.config.tx_rate_min",
                                    vfphy,
                                ),
                                vf["rate"]["min_tx"],
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.config.tx_rate_max",
                                    vfphy,
                                ),
                                vf["rate"]["max_tx"],
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.config.spoof_check",
                                    vfphy,
                                ),
                                vf["spoofchk"],
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.config.link_state",
                                    vfphy,
                                ),
                                vf["link_state"],
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.config.trust",
                                    vfphy,
                                ),
                                vf["trust"],
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.config.query_rss",
                                    vfphy,
                                ),
                                vf["query_rss_en"],
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.pci",
                                    vfphy,
                                ),
                                "",
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.pci.domain",
                                    vfphy,
                                ),
                                pcie_domain,
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.pci.bus",
                                    vfphy,
                                ),
                                pcie_bus,
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.pci.slot",
                                    vfphy,
                                ),
                                pcie_slot,
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.pci.function",
                                    vfphy,
                                ),
                                pcie_function,
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.used",
                                    vfphy,
                                ),
                                False,
                            ),
                            (
                                (
                                    "node.sriov.vf",
                                    config["node_hostname"],
                                    "sriov_vf.used_by",
                                    vfphy,
                                ),
                                "",
                            ),
                        ])

                    # Append the device to the list of VFs
                    sriov_vf_list.append(vfphy)

            # Remove any obsolete PFs from Zookeeper if they go away
            for pf in zkhandler.children(
                ("node.sriov.pf", config["node_hostname"])):
                if pf not in sriov_pf_list:
                    zkhandler.delete([("node.sriov.pf",
                                       config["node_hostname"], "sriov_pf", pf)
                                      ])
            # Remove any obsolete VFs from Zookeeper if their PF goes away
            for vf in zkhandler.children(
                ("node.sriov.vf", config["node_hostname"])):
                vf_pf = zkhandler.read(
                    ("node.sriov.vf", config["node_hostname"], "sriov_vf.pf",
                     vf))
                if vf_pf not in sriov_pf_list:
                    zkhandler.delete([("node.sriov.vf",
                                       config["node_hostname"], "sriov_vf", vf)
                                      ])

            # SR-IOV VF objects
            # This is a ChildrenWatch just for consistency; the list never changes at runtime
            @zkhandler.zk_conn.ChildrenWatch(
                zkhandler.schema.path("node.sriov.vf",
                                      config["node_hostname"]))
            def update_sriov_vfs(new_sriov_vf_list):
                nonlocal sriov_vf_list, d_sriov_vf

                # Add VFs to the list
                for vf in common.sortInterfaceNames(new_sriov_vf_list):
                    d_sriov_vf[vf] = SRIOVVFInstance.SRIOVVFInstance(
                        vf, zkhandler, config, logger, this_node)

                sriov_vf_list = sorted(new_sriov_vf_list)
                logger.out(
                    f'{logger.fmt_blue}SR-IOV VF list:{logger.fmt_end} {" ".join(sriov_vf_list)}',
                    state="i",
                )

    if config["enable_hypervisor"]:
        # VM command pipeline key
        @zkhandler.zk_conn.DataWatch(zkhandler.schema.path("base.cmd.domain"))
        def run_domain_command(data, stat, event=""):
            if data:
                VMInstance.vm_command(zkhandler, logger, this_node,
                                      data.decode("ascii"))

        # VM domain objects
        @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path("base.domain"))
        def update_domains(new_domain_list):
            nonlocal domain_list, d_domain

            # Add missing domains to the list
            for domain in [
                    domain for domain in new_domain_list
                    if domain not in domain_list
            ]:
                d_domain[domain] = VMInstance.VMInstance(
                    domain, zkhandler, config, logger, this_node)

            # Remove any deleted domains from the list
            for domain in [
                    domain for domain in domain_list
                    if domain not in new_domain_list
            ]:
                del d_domain[domain]

            # Update the new list
            domain_list = new_domain_list
            logger.out(
                f'{logger.fmt_blue}Domain list:{logger.fmt_end} {" ".join(domain_list)}',
                state="i",
            )

            # Update node objects' list
            for node in d_node:
                d_node[node].update_domain_list(d_domain)

    if config["enable_storage"]:
        # Ceph command pipeline key
        @zkhandler.zk_conn.DataWatch(zkhandler.schema.path("base.cmd.ceph"))
        def run_ceph_command(data, stat, event=""):
            if data:
                CephInstance.ceph_command(zkhandler, logger, this_node,
                                          data.decode("ascii"), d_osd)

        # OSD objects
        @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path("base.osd"))
        def update_osds(new_osd_list):
            nonlocal osd_list, d_osd

            # Add any missing OSDs to the list
            for osd in [osd for osd in new_osd_list if osd not in osd_list]:
                d_osd[osd] = CephInstance.CephOSDInstance(
                    zkhandler, this_node, osd)

            # Remove any deleted OSDs from the list
            for osd in [osd for osd in osd_list if osd not in new_osd_list]:
                del d_osd[osd]

            # Update the new list
            osd_list = new_osd_list
            logger.out(
                f'{logger.fmt_blue}OSD list:{logger.fmt_end} {" ".join(osd_list)}',
                state="i",
            )

        # Pool objects
        @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path("base.pool"))
        def update_pools(new_pool_list):
            nonlocal pool_list, d_pool, volume_list, d_volume

            # Add any missing pools to the list
            for pool in [
                    pool for pool in new_pool_list if pool not in pool_list
            ]:
                d_pool[pool] = CephInstance.CephPoolInstance(
                    zkhandler, this_node, pool)
                # Prepare the volume components for this pool
                volume_list[pool] = list()
                d_volume[pool] = dict()

            # Remove any deleted pools from the list
            for pool in [
                    pool for pool in pool_list if pool not in new_pool_list
            ]:
                del d_pool[pool]

            # Update the new list
            pool_list = new_pool_list
            logger.out(
                f'{logger.fmt_blue}Pool list:{logger.fmt_end} {" ".join(pool_list)}',
                state="i",
            )

            # Volume objects (in each pool)
            for pool in pool_list:

                @zkhandler.zk_conn.ChildrenWatch(
                    zkhandler.schema.path("volume", pool))
                def update_volumes(new_volume_list, pool=pool):
                    # Bind pool as a default argument so each watcher keeps its
                    # own pool, rather than the loop's final value at call time
                    nonlocal volume_list, d_volume

                    # Add any missing volumes to the list
                    for volume in [
                            volume for volume in new_volume_list
                            if volume not in volume_list[pool]
                    ]:
                        d_volume[pool][
                            volume] = CephInstance.CephVolumeInstance(
                                zkhandler, this_node, pool, volume)

                    # Remove any deleted volumes from the list
                    for volume in [
                            volume for volume in volume_list[pool]
                            if volume not in new_volume_list
                    ]:
                        del d_volume[pool][volume]

                    # Update the new list
                    volume_list[pool] = new_volume_list
                    logger.out(
                        f'{logger.fmt_blue}Volume list [{pool}]:{logger.fmt_end} {" ".join(volume_list[pool])}',
                        state="i",
                    )

    # Start keepalived thread
    keepalive_timer = pvcnoded.util.keepalive.start_keepalive_timer(
        logger, config, zkhandler, this_node)

    # Tick loop; does nothing since everything is async
    while True:
        try:
            sleep(1)
        except Exception:
            break
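
# A minimal standalone sketch (an assumption for illustration, not part of the
# pvcnoded sources) of the Kazoo watch pattern used by the daemon above:
# DataWatch callbacks fire whenever a znode's value changes and ChildrenWatch
# callbacks fire whenever its child list changes, and both re-register
# themselves automatically after each event. The ZooKeeper address and paths
# below are placeholder values.
from kazoo.client import KazooClient

zk = KazooClient(hosts="127.0.0.1:2181")  # assumed local ZooKeeper ensemble
zk.start()
zk.ensure_path("/demo/config")
zk.ensure_path("/demo/nodes")

@zk.DataWatch("/demo/config")
def watch_config(data, stat, event=None):
    # data is bytes, or None if the znode does not exist
    print("config is now:", data.decode("ascii") if data else None)

@zk.ChildrenWatch("/demo/nodes")
def watch_nodes(children):
    # children is a plain list of child znode names
    print("node list:", " ".join(children))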
Example No. 30
0
def get_configuration():
    """
    Parse the configuration of the node daemon.
    """
    pvcnoded_config_file = get_configuration_path()

    print('Loading configuration from file "{}"'.format(pvcnoded_config_file))

    with open(pvcnoded_config_file, "r") as cfgfile:
        try:
            o_config = yaml.load(cfgfile, Loader=yaml.SafeLoader)
        except Exception as e:
            print("ERROR: Failed to parse configuration file: {}".format(e))
            os._exit(1)

    node_fqdn, node_hostname, node_domain, node_id = get_hostname()

    # Create the configuration dictionary
    config = dict()

    # Get the initial base configuration
    try:
        o_base = o_config["pvc"]
        o_cluster = o_config["pvc"]["cluster"]
    except Exception as e:
        raise MalformedConfigurationError(e)

    config_general = {
        "node": o_base.get("node", node_hostname),
        "node_hostname": node_hostname,
        "node_fqdn": node_fqdn,
        "node_domain": node_domain,
        "node_id": node_id,
        "coordinators": o_cluster.get("coordinators", list()),
        "debug": o_base.get("debug", False),
    }

    config = {**config, **config_general}

    # Get the functions configuration
    try:
        o_functions = o_config["pvc"]["functions"]
    except Exception as e:
        raise MalformedConfigurationError(e)

    config_functions = {
        "enable_hypervisor": o_functions.get("enable_hypervisor", False),
        "enable_networking": o_functions.get("enable_networking", False),
        "enable_storage": o_functions.get("enable_storage", False),
        "enable_api": o_functions.get("enable_api", False),
    }

    config = {**config, **config_functions}

    # Get the directory configuration
    try:
        o_directories = o_config["pvc"]["system"]["configuration"][
            "directories"]
    except Exception as e:
        raise MalformedConfigurationError(e)

    config_directories = {
        "dynamic_directory": o_directories.get("dynamic_directory", None),
        "log_directory": o_directories.get("log_directory", None),
        "console_log_directory": o_directories.get("console_log_directory",
                                                   None),
    }

    # Define our dynamic directory schema
    config_directories["dnsmasq_dynamic_directory"] = (
        config_directories["dynamic_directory"] + "/dnsmasq")
    config_directories["pdns_dynamic_directory"] = (
        config_directories["dynamic_directory"] + "/pdns")
    config_directories["nft_dynamic_directory"] = (
        config_directories["dynamic_directory"] + "/nft")

    # Define our log directory schema
    config_directories["dnsmasq_log_directory"] = (
        config_directories["log_directory"] + "/dnsmasq")
    config_directories["pdns_log_directory"] = (
        config_directories["log_directory"] + "/pdns")
    config_directories["nft_log_directory"] = (
        config_directories["log_directory"] + "/nft")

    config = {**config, **config_directories}

    # Get the logging configuration
    try:
        o_logging = o_config["pvc"]["system"]["configuration"]["logging"]
    except Exception as e:
        raise MalformedConfigurationError(e)

    config_logging = {
        "file_logging":
        o_logging.get("file_logging", False),
        "stdout_logging":
        o_logging.get("stdout_logging", False),
        "zookeeper_logging":
        o_logging.get("zookeeper_logging", False),
        "log_colours":
        o_logging.get("log_colours", False),
        "log_dates":
        o_logging.get("log_dates", False),
        "log_keepalives":
        o_logging.get("log_keepalives", False),
        "log_keepalive_cluster_details":
        o_logging.get("log_keepalive_cluster_details", False),
        "log_keepalive_storage_details":
        o_logging.get("log_keepalive_storage_details", False),
        "console_log_lines":
        o_logging.get("console_log_lines", False),
        "node_log_lines":
        o_logging.get("node_log_lines", False),
    }

    config = {**config, **config_logging}

    # Get the interval configuration
    try:
        o_intervals = o_config["pvc"]["system"]["intervals"]
    except Exception as e:
        raise MalformedConfigurationError(e)

    config_intervals = {
        "vm_shutdown_timeout": int(o_intervals.get("vm_shutdown_timeout", 60)),
        "keepalive_interval": int(o_intervals.get("keepalive_interval", 5)),
        "fence_intervals": int(o_intervals.get("fence_intervals", 6)),
        "suicide_intervals": int(o_intervals.get("suicide_interval", 0)),
    }

    config = {**config, **config_intervals}

    # Get the fencing configuration
    try:
        o_fencing = o_config["pvc"]["system"]["fencing"]
        o_fencing_actions = o_fencing["actions"]
        o_fencing_ipmi = o_fencing["ipmi"]
    except Exception as e:
        raise MalformedConfigurationError(e)

    config_fencing = {
        "successful_fence":
        o_fencing_actions.get("successful_fence", None),
        "failed_fence":
        o_fencing_actions.get("failed_fence", None),
        "ipmi_hostname":
        o_fencing_ipmi.get("host", f"{node_hostname}-lom.{node_domain}"),
        "ipmi_username":
        o_fencing_ipmi.get("user", "null"),
        "ipmi_password":
        o_fencing_ipmi.get("pass", "null"),
    }

    config = {**config, **config_fencing}

    # Get the migration configuration
    try:
        o_migration = o_config["pvc"]["system"]["migration"]
    except Exception as e:
        raise MalformedConfigurationError(e)

    config_migration = {
        "migration_target_selector": o_migration.get("target_selector", "mem"),
    }

    config = {**config, **config_migration}

    if config["enable_networking"]:
        # Get the node networks configuration
        try:
            o_networks = o_config["pvc"]["cluster"]["networks"]
            o_network_cluster = o_networks["cluster"]
            o_network_storage = o_networks["storage"]
            o_network_upstream = o_networks["upstream"]
            o_sysnetworks = o_config["pvc"]["system"]["configuration"][
                "networking"]
            o_sysnetwork_cluster = o_sysnetworks["cluster"]
            o_sysnetwork_storage = o_sysnetworks["storage"]
            o_sysnetwork_upstream = o_sysnetworks["upstream"]
        except Exception as e:
            raise MalformedConfigurationError(e)

        config_networks = {
            "cluster_domain": o_network_cluster.get("domain", None),
            "cluster_network": o_network_cluster.get("network", None),
            "cluster_floating_ip": o_network_cluster.get("floating_ip", None),
            "cluster_dev": o_sysnetwork_cluster.get("device", None),
            "cluster_mtu": o_sysnetwork_cluster.get("mtu", None),
            "cluster_dev_ip": o_sysnetwork_cluster.get("address", None),
            "storage_domain": o_network_storage.get("domain", None),
            "storage_network": o_network_storage.get("network", None),
            "storage_floating_ip": o_network_storage.get("floating_ip", None),
            "storage_dev": o_sysnetwork_storage.get("device", None),
            "storage_mtu": o_sysnetwork_storage.get("mtu", None),
            "storage_dev_ip": o_sysnetwork_storage.get("address", None),
            "upstream_domain": o_network_upstream.get("domain", None),
            "upstream_network": o_network_upstream.get("network", None),
            "upstream_floating_ip":
            o_network_upstream.get("floating_ip", None),
            "upstream_gateway": o_network_upstream.get("gateway", None),
            "upstream_dev": o_sysnetwork_upstream.get("device", None),
            "upstream_mtu": o_sysnetwork_upstream.get("mtu", None),
            "upstream_dev_ip": o_sysnetwork_upstream.get("address", None),
            "bridge_dev": o_sysnetworks.get("bridge_device", None),
            "bridge_mtu": o_sysnetworks.get("bridge_mtu", None),
            "enable_sriov": o_sysnetworks.get("sriov_enable", False),
            "sriov_device": o_sysnetworks.get("sriov_device", list()),
        }

        if config_networks["bridge_mtu"] is None:
            # Read the current MTU of bridge_dev and set bridge_mtu to it; avoids weird resets
            retcode, stdout, stderr = common.run_os_command(
                f"ip -json link show dev {config_networks['bridge_dev']}")
            current_bridge_mtu = loads(stdout)[0]["mtu"]
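            # For illustration (assumed device and value): `ip -json link show dev bondU`
            # returns a list like [{"ifname": "bondU", "mtu": 9000, ...}], so the live
            # MTU 9000 would be adopted as bridge_mtu here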
            print(
                f"Config key bridge_mtu not explicitly set; using live MTU {current_bridge_mtu} from {config_networks['bridge_dev']}"
            )
            config_networks["bridge_mtu"] = current_bridge_mtu

        config = {**config, **config_networks}

        for network_type in ["cluster", "storage", "upstream"]:
            result, msg = validate_floating_ip(config, network_type)
            if not result:
                raise MalformedConfigurationError(msg)

            address_key = "{}_dev_ip".format(network_type)
            network_key = f"{network_type}_network"
            network = ip_network(config[network_key])
            # With autoselection of addresses, construct an IP from the relevant network
            if config[address_key] == "by-id":
                # The NodeID starts at 1, but indexes start at 0
                address_id = int(config["node_id"]) - 1
                # Grab the nth address from the network
                config[address_key] = "{}/{}".format(
                    list(network.hosts())[address_id], network.prefixlen)
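                # Worked illustration with assumed values: node_id 2 on a
                # cluster network of 10.0.0.0/24 gives address_id 1, and
                # list(network.hosts())[1] is 10.0.0.2, so the resulting
                # cluster_dev_ip would be "10.0.0.2/24"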
            # Validate the provided IP instead
            else:
                try:
                    address = ip_address(config[address_key].split("/")[0])
                    if address not in list(network.hosts()):
                        raise
                except Exception:
                    raise MalformedConfigurationError(
                        f"IP address {config[address_key]} for {address_key} is not valid"
                    )

        # Get the PowerDNS aggregator database configuration
        try:
            o_pdnsdb = o_config["pvc"]["coordinator"]["dns"]["database"]
        except Exception as e:
            raise MalformedConfigurationError(e)

        config_pdnsdb = {
            "pdns_postgresql_host": o_pdnsdb.get("host", None),
            "pdns_postgresql_port": o_pdnsdb.get("port", None),
            "pdns_postgresql_dbname": o_pdnsdb.get("name", None),
            "pdns_postgresql_user": o_pdnsdb.get("user", None),
            "pdns_postgresql_password": o_pdnsdb.get("pass", None),
        }

        config = {**config, **config_pdnsdb}

        # Get the Cloud-Init Metadata database configuration
        try:
            o_metadatadb = o_config["pvc"]["coordinator"]["metadata"][
                "database"]
        except Exception as e:
            raise MalformedConfigurationError(e)

        config_metadatadb = {
            "metadata_postgresql_host": o_metadatadb.get("host", None),
            "metadata_postgresql_port": o_metadatadb.get("port", None),
            "metadata_postgresql_dbname": o_metadatadb.get("name", None),
            "metadata_postgresql_user": o_metadatadb.get("user", None),
            "metadata_postgresql_password": o_metadatadb.get("pass", None),
        }

        config = {**config, **config_metadatadb}

    if config["enable_storage"]:
        # Get the storage configuration
        try:
            o_storage = o_config["pvc"]["system"]["configuration"]["storage"]
        except Exception as e:
            raise MalformedConfigurationError(e)

        config_storage = {
            "ceph_config_file": o_storage.get("ceph_config_file", None),
            "ceph_admin_keyring": o_storage.get("ceph_admin_keyring", None),
        }

        config = {**config, **config_storage}

        # Add our node static data to the config
        config["static_data"] = get_static_data()

    return config
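
# Minimal usage sketch (an assumed caller, not taken from the PVC sources): load
# the parsed configuration and report a few of the keys populated above.
if __name__ == "__main__":
    config = get_configuration()
    print(f"Node {config['node_hostname']} (ID {config['node_id']}) in domain {config['node_domain']}")
    print(f"Coordinators: {', '.join(config['coordinators'])}")
    print(f"Networking enabled: {config['enable_networking']}; storage enabled: {config['enable_storage']}")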