Example #1
    def migrate_volumes(self, req):
        cmd = jsonobject.loads(req[http.REQUEST_BODY])
        rsp = AgentRsp()

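        # Pre-flight: for every volume, make sure the target LV does not already exist, then create it with the source's virtual size and activate it with a shared lock.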
        for struct in cmd.migrateVolumeStructs:
            target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
            current_abs_path = translate_absolute_path_from_install_path(struct.currentInstallPath)
            with lvm.OperateLv(current_abs_path, shared=True):
                virtual_size = lvm.get_lv_size(current_abs_path)

                if lvm.lv_exists(target_abs_path):
                    target_ps_uuid = get_primary_storage_uuid_from_install_path(struct.targetInstallPath)
                    raise Exception("found %s already exists on ps %s" %
                                    (target_abs_path, target_ps_uuid))
                lvm.create_lv_from_absolute_path(target_abs_path, virtual_size,
                                                     "%s::%s::%s" % (VOLUME_TAG, cmd.hostUuid, time.time()))
                lvm.active_lv(target_abs_path, lvm.LvmlockdLockType.SHARE)

        try:
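            # Copy phase: duplicate each source LV's content into its target LV.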
            for struct in cmd.migrateVolumeStructs:
                target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
                current_abs_path = translate_absolute_path_from_install_path(struct.currentInstallPath)

                with lvm.OperateLv(current_abs_path, shared=True):
                    bash.bash_errorout("cp %s %s" % (current_abs_path, target_abs_path))

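            # Rebase phase: point each copied qcow2 at the backing file on the target primary storage, then optionally verify the copy with qemu-img compare.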
            for struct in cmd.migrateVolumeStructs:
                target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
                current_abs_path = translate_absolute_path_from_install_path(struct.currentInstallPath)
                with lvm.RecursiveOperateLv(current_abs_path, shared=True):
                    previous_ps_uuid = get_primary_storage_uuid_from_install_path(struct.currentInstallPath)
                    target_ps_uuid = get_primary_storage_uuid_from_install_path(struct.targetInstallPath)

                    current_backing_file = linux.qcow2_get_backing_file(current_abs_path)  # type: str

                    if current_backing_file is not None and current_backing_file != "":
                        # Compute the target backing file path only when the source actually has a backing file.
                        target_backing_file = current_backing_file.replace(previous_ps_uuid, target_ps_uuid)
                        lvm.do_active_lv(target_backing_file, lvm.LvmlockdLockType.SHARE, False)
                        logger.debug("rebase %s to %s" % (target_abs_path, target_backing_file))
                        linux.qcow2_rebase_no_check(target_backing_file, target_abs_path)
                    if struct.compareQcow2:
                        bash.bash_errorout("time qemu-img compare %s %s" % (current_abs_path, target_abs_path))
        except Exception as e:
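            # Roll back: delete the target LVs created above, unless source and target install paths are identical.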
            for struct in cmd.migrateVolumeStructs:
                target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
                if struct.currentInstallPath == struct.targetInstallPath:
                    logger.debug("current install path %s equals target %s, skip to delete" %
                                 (struct.currentInstallPath, struct.targetInstallPath))
                else:
                    logger.debug("error happened, delete lv %s" % target_abs_path)
                    lvm.delete_lv(target_abs_path, False)
            raise e
        finally:
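            # Always deactivate the target LVs on this host, whether the migration succeeded or not.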
            for struct in cmd.migrateVolumeStructs:
                target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
                lvm.deactive_lv(target_abs_path)

        rsp.totalCapacity, rsp.availableCapacity = lvm.get_vg_size(cmd.vgUuid)
        return jsonobject.dumps(rsp)
Example #2
 def deactive_lvs_on_vg(vgUuid):
     active_lvs = lvm.list_local_active_lvs(vgUuid)
     if len(active_lvs) == 0:
         return
     logger.warn("active lvs %s will be deactivate" % active_lvs)
     lvm.deactive_lv(vgUuid)
     active_lvs = lvm.list_local_active_lvs(vgUuid)
     if len(active_lvs) != 0:
         raise RetryException("lvs [%s] still active, retry deactive again" % active_lvs)
Example #3
 def deactive_lvs_on_vg(vgUuid):
     active_lvs = lvm.list_local_active_lvs(vgUuid)
     if len(active_lvs) == 0:
         return
     logger.warn("active lvs %s will be deactivate" % active_lvs)
     lvm.deactive_lv(vgUuid)
     active_lvs = lvm.list_local_active_lvs(vgUuid)
     if len(active_lvs) != 0:
         raise RetryException("lvs [%s] still active, retry deactive again" % active_lvs)
Example #4
 def deactive_drbd_resouces_on_vg(vgUuid):
     active_lvs = lvm.list_local_active_lvs(vgUuid)
     if len(active_lvs) == 0:
         return
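     # Destroy the DRBD resources that sit on top of the active LVs before deactivating the VG.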
     drbd_resources = [
         drbd.DrbdResource(lv.split("/")[-1]) for lv in active_lvs
     ]
     for r in drbd_resources:
         r.destroy()
     logger.warn("active lvs %s will be deactivate" % active_lvs)
     lvm.deactive_lv(vgUuid)
     active_lvs = lvm.list_local_active_lvs(vgUuid)
     if len(active_lvs) != 0:
         raise RetryException(
             "lvs [%s] still active, retry deactive again" % active_lvs)
Example #5
 def handle_lv(lockType, fpath):
     if lockType > lvm.LvmlockdLockType.NULL:
         lvm.active_lv(fpath, lockType == lvm.LvmlockdLockType.SHARE)
     else:
         try:
             lvm.deactive_lv(fpath)
         except Exception as e:
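             # killProcess is presumably defined in the enclosing scope of the original source: when set, kill QEMU processes that hold the LV but are not in the "running" state, then retry the deactivation.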
             if not killProcess:
                 return
             qemus = lvm.find_qemu_for_lv_in_use(fpath)
             if len(qemus) == 0:
                 return
             for qemu in qemus:
                 if qemu.state != "running":
                     linux.kill_process(qemu.pid)
             lvm.deactive_lv(fpath)
Example #6
    def disconnect(self, req):
        cmd = jsonobject.loads(req[http.REQUEST_BODY])
        rsp = AgentRsp()

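        # Look for the VG up to 3 times, accepting it with or without the INIT_TAG tag; give up with RetryException if it never shows up.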
        @linux.retry(times=3, sleep_time=random.uniform(0.1, 3))
        def find_vg(vgUuid):
            cmd = shell.ShellCmd("vgs %s -otags | grep %s" %
                                 (vgUuid, INIT_TAG))
            cmd(is_exception=False)
            if cmd.return_code == 0:
                return True

            logger.debug("can not find vg %s with tag %s" % (vgUuid, INIT_TAG))
            cmd = shell.ShellCmd("vgs %s" % vgUuid)
            cmd(is_exception=False)
            if cmd.return_code == 0:
                logger.warn("found vg %s without tag %s" % (vgUuid, INIT_TAG))
                return True

            raise RetryException("can not find vg %s with or without tag %s" %
                                 (vgUuid, INIT_TAG))

        try:
            find_vg(cmd.vgUuid)
        except RetryException:
            logger.debug("can not find vg %s; return success" % cmd.vgUuid)
            return jsonobject.dumps(rsp)
        except Exception as e:
            raise e

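        # Deactivate all LVs in the VG, clear this host's heartbeat tag, then stop the VG lock.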
        active_lvs = lvm.list_local_active_lvs(cmd.vgUuid)
        if len(active_lvs) != 0:
            logger.warn("active lvs %s will be deactivate" % active_lvs)
        lvm.deactive_lv(cmd.vgUuid)
        lvm.clean_vg_exists_host_tags(cmd.vgUuid, cmd.hostUuid, HEARTBEAT_TAG)
        lvm.stop_vg_lock(cmd.vgUuid)
        return jsonobject.dumps(rsp)
Example #7
        def heartbeat_on_sharedblock():
            fire = 0
            failure = 0

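            # Fencer loop: sleep for the configured interval, check VG health, and only fire after maxAttempts consecutive failures.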
            while self.run_fencer(cmd.vgUuid, created_time):
                try:
                    time.sleep(cmd.interval)
                    global last_multipath_run
                    if cmd.fail_if_no_path and time.time() - last_multipath_run > 3600:
                        last_multipath_run = time.time()
                        thread.ThreadFacade.run_in_thread(
                            linux.set_fail_if_no_path)

                    health = lvm.check_vg_status(cmd.vgUuid,
                                                 cmd.storageCheckerTimeout,
                                                 check_pv=False)
                    logger.debug(
                        "sharedblock group primary storage %s fencer run result: %s"
                        % (cmd.vgUuid, health))
                    if health[0] is True:
                        fire = 0
                        failure = 0
                        continue

                    failure += 1
                    if failure < cmd.maxAttempts:
                        continue

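                    # Rate limiting: skip firing if the fencer already fired within the last 300 * min(fire + 1, 10) seconds.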
                    if self.fencer_fire_timestamp.get(cmd.vgUuid) is not None and \
                            time.time() > self.fencer_fire_timestamp.get(cmd.vgUuid) and \
                            time.time() - self.fencer_fire_timestamp.get(cmd.vgUuid) < (300 * (fire + 1 if fire < 10 else 10)):
                        logger.warn(
                            "last fencer fire: %s, now: %s, passed: %s seconds, within %s seconds, skip fire",
                            self.fencer_fire_timestamp[cmd.vgUuid],
                            time.time(),
                            time.time() -
                            self.fencer_fire_timestamp.get(cmd.vgUuid),
                            300 * (fire + 1 if fire < 10 else 10))
                        failure = 0
                        continue

                    self.fencer_fire_timestamp[cmd.vgUuid] = time.time()
                    try:
                        logger.warn("shared block storage %s fencer fired!" %
                                    cmd.vgUuid)
                        self.report_storage_status([cmd.vgUuid],
                                                   'Disconnected', health[1])
                        fire += 1

                        if cmd.strategy == 'Permissive':
                            continue

                        # check one qcow2 per PV to determine which volumes on that PV should be killed
                        invalid_pv_uuids = lvm.get_invalid_pv_uuids(
                            cmd.vgUuid, cmd.checkIo)
                        vms = lvm.get_running_vm_root_volume_on_pv(
                            cmd.vgUuid, invalid_pv_uuids, True)
                        killed_vm_uuids = []
                        for vm in vms:
                            kill = shell.ShellCmd('kill -9 %s' % vm.pid)
                            kill(False)
                            if kill.return_code == 0:
                                logger.warn(
                                    'kill the vm[uuid:%s, pid:%s] because we lost connection to the storage.'
                                    'failed to run health check %s times' %
                                    (vm.uuid, vm.pid, cmd.maxAttempts))
                                killed_vm_uuids.append(vm.uuid)
                            else:
                                logger.warn(
                                    'failed to kill the vm[uuid:%s, pid:%s] %s'
                                    % (vm.uuid, vm.pid, kill.stderr))

                            for volume in vm.volumes:
                                used_process = linux.linux_lsof(volume)
                                if len(used_process) == 0:
                                    try:
                                        lvm.deactive_lv(volume, False)
                                    except Exception as e:
                                        logger.debug(
                                            "deactivate volume %s for vm %s failed, %s"
                                            % (volume, vm.uuid, e.message))
                                        content = traceback.format_exc()
                                        logger.warn("traceback: %s" % content)
                                else:
                                    logger.debug(
                                        "volume %s still used: %s, skip to deactivate"
                                        % (volume, used_process))

                        if len(killed_vm_uuids) != 0:
                            self.report_self_fencer_triggered(
                                [cmd.vgUuid], ','.join(killed_vm_uuids))
                            clean_network_config(killed_vm_uuids)

                        lvm.remove_partial_lv_dm(cmd.vgUuid)

                        if lvm.check_vg_status(cmd.vgUuid,
                                               cmd.storageCheckerTimeout,
                                               True)[0] is False:
                            lvm.drop_vg_lock(cmd.vgUuid)
                            lvm.remove_device_map_for_vg(cmd.vgUuid)

                    except Exception as e:
                        logger.warn("kill vm failed, %s" % e.message)
                        content = traceback.format_exc()
                        logger.warn("traceback: %s" % content)
                    finally:
                        failure = 0

                except Exception as e:
                    logger.debug(
                        'self-fencer on sharedblock primary storage %s stopped abnormally, try again soon...'
                        % cmd.vgUuid)
                    content = traceback.format_exc()
                    logger.warn(content)

            if not self.run_fencer(cmd.vgUuid, created_time):
                logger.debug(
                    'stop self-fencer on sharedblock primary storage %s for judger failed'
                    % cmd.vgUuid)
            else:
                logger.warn(
                    'stop self-fencer on sharedblock primary storage %s' %
                    cmd.vgUuid)
Example #8
 def handle_lv(lockType, fpath):
     if lockType > lvm.LvmlockdLockType.NULL:
         lvm.active_lv(fpath, lockType == lvm.LvmlockdLockType.SHARE)
     else:
         lvm.deactive_lv(fpath)
Example #9
    def migrate_volumes(self, req):
        cmd = jsonobject.loads(req[http.REQUEST_BODY])
        rsp = AgentRsp()

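        # Pre-flight, as in Example #1: create and activate each target LV before any data is copied.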
        for struct in cmd.migrateVolumeStructs:
            target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
            current_abs_path = translate_absolute_path_from_install_path(struct.currentInstallPath)
            with lvm.OperateLv(current_abs_path, shared=True):
                lv_size = lvm.get_lv_size(current_abs_path)

                if lvm.lv_exists(target_abs_path):
                    target_ps_uuid = get_primary_storage_uuid_from_install_path(struct.targetInstallPath)
                    raise Exception("found %s already exists on ps %s" %
                                    (target_abs_path, target_ps_uuid))
                lvm.create_lv_from_absolute_path(target_abs_path, lvm.getOriginalSize(lv_size),
                                                     "%s::%s::%s" % (VOLUME_TAG, cmd.hostUuid, time.time()))
                lvm.active_lv(target_abs_path, lvm.LvmlockdLockType.SHARE)

        try:
            for struct in cmd.migrateVolumeStructs:
                target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
                current_abs_path = translate_absolute_path_from_install_path(struct.currentInstallPath)

                with lvm.OperateLv(current_abs_path, shared=True):
                    bash.bash_errorout("cp %s %s" % (current_abs_path, target_abs_path))

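            # Rebase phase: this variant runs the optional qcow2 comparison before the rebase, whereas Example #1 compares after it.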
            for struct in cmd.migrateVolumeStructs:
                target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
                current_abs_path = translate_absolute_path_from_install_path(struct.currentInstallPath)
                with lvm.RecursiveOperateLv(current_abs_path, shared=True):
                    previous_ps_uuid = get_primary_storage_uuid_from_install_path(struct.currentInstallPath)
                    target_ps_uuid = get_primary_storage_uuid_from_install_path(struct.targetInstallPath)

                    current_backing_file = linux.qcow2_get_backing_file(current_abs_path)  # type: str

                    if struct.compareQcow2:
                        logger.debug("comparing qcow2 between %s and %s" % (current_abs_path, target_abs_path))
                        if not self.compare(current_abs_path, target_abs_path):
                            raise Exception("qcow2 %s and %s are not identical" % (current_abs_path, target_abs_path))
                        logger.debug("confirmed qcow2 %s and %s are identical" % (current_abs_path, target_abs_path))
                    if current_backing_file is not None and current_backing_file != "":
                        # Compute the target backing file path only when the source actually has a backing file.
                        target_backing_file = current_backing_file.replace(previous_ps_uuid, target_ps_uuid)
                        lvm.do_active_lv(target_backing_file, lvm.LvmlockdLockType.SHARE, False)
                        logger.debug("rebase %s to %s" % (target_abs_path, target_backing_file))
                        linux.qcow2_rebase_no_check(target_backing_file, target_abs_path)
        except Exception as e:
            for struct in cmd.migrateVolumeStructs:
                target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
                if struct.currentInstallPath == struct.targetInstallPath:
                    logger.debug("current install path %s equals target %s, skip to delete" %
                                 (struct.currentInstallPath, struct.targetInstallPath))
                else:
                    logger.debug("error happened, delete lv %s" % target_abs_path)
                    lvm.delete_lv(target_abs_path, False)
            raise e
        finally:
            for struct in cmd.migrateVolumeStructs:
                target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
                lvm.deactive_lv(target_abs_path)

        rsp.totalCapacity, rsp.availableCapacity = lvm.get_vg_size(cmd.vgUuid)
        return jsonobject.dumps(rsp)
Example #10
 def handle_lv(lockType, fpath):
     if lockType > lvm.LvmlockdLockType.NULL:
         lvm.active_lv(fpath, lockType == lvm.LvmlockdLockType.SHARE)
     else:
         lvm.deactive_lv(fpath)
Example #11
        def heartbeat_on_sharedblock():
            failure = 0

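            # Fencing loop keyed on a per-VG boolean flag; it fires whenever maxAttempts consecutive health checks fail, with no rate limiting.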
            while self.run_sharedblock_fencer[cmd.vgUuid] is True:
                try:
                    time.sleep(cmd.interval)

                    health = lvm.check_vg_status(cmd.vgUuid,
                                                 cmd.storageCheckerTimeout,
                                                 check_pv=False)
                    logger.debug(
                        "sharedblock group primary storage %s fencer run result: %s"
                        % (cmd.vgUuid, health))
                    if health[0] is True:
                        failure = 0
                        continue

                    failure += 1
                    if failure < cmd.maxAttempts:
                        continue

                    try:
                        logger.warn("shared block storage %s fencer fired!" %
                                    cmd.vgUuid)
                        self.report_storage_status([cmd.vgUuid],
                                                   'Disconnected', health[1])

                        # check one qcow2 per PV to determine which volumes on that PV should be killed
                        invalid_pv_uuids = lvm.get_invalid_pv_uuids(
                            cmd.vgUuid, cmd.checkIo)
                        vms = lvm.get_running_vm_root_volume_on_pv(
                            cmd.vgUuid, invalid_pv_uuids, cmd.checkIo)
                        for vm in vms:
                            kill = shell.ShellCmd('kill -9 %s' % vm.pid)
                            kill(False)
                            if kill.return_code == 0:
                                logger.warn(
                                    'kill the vm[uuid:%s, pid:%s] because we lost connection to the storage.'
                                    'failed to run health check %s times' %
                                    (vm.uuid, vm.pid, cmd.maxAttempts))

                            else:
                                logger.warn(
                                    'failed to kill the vm[uuid:%s, pid:%s] %s'
                                    % (vm.uuid, vm.pid, kill.stderr))

                            for volume in vm.volumes:
                                used_process = linux.linux_lsof(volume)
                                if len(used_process) == 0:
                                    try:
                                        lvm.deactive_lv(volume, False)
                                    except Exception as e:
                                        logger.debug(
                                            "deactivate volume %s for vm %s failed, %s"
                                            % (volume, vm.uuid, e.message))
                                        content = traceback.format_exc()
                                        logger.warn("traceback: %s" % content)
                                else:
                                    logger.debug(
                                        "volume %s still used: %s, skip to deactivate"
                                        % (volume, used_process))

                        lvm.remove_partial_lv_dm(cmd.vgUuid)

                        if lvm.check_vg_status(cmd.vgUuid,
                                               cmd.storageCheckerTimeout,
                                               True)[0] is False:
                            lvm.drop_vg_lock(cmd.vgUuid)
                            lvm.remove_device_map_for_vg(cmd.vgUuid)

                        # reset the failure count
                        failure = 0
                    except Exception as e:
                        logger.warn("kill vm failed, %s" % e.message)
                        content = traceback.format_exc()
                        logger.warn("traceback: %s" % content)

                except Exception as e:
                    logger.debug(
                        'self-fencer on sharedblock primary storage %s stopped abnormally'
                        % cmd.vgUuid)
                    content = traceback.format_exc()
                    logger.warn(content)

            logger.debug('stop self-fencer on sharedblock primary storage %s' %
                         cmd.vgUuid)
Example #12
        def heartbeat_on_sharedblock():
            failure = 0

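            # Same fencing loop in a more compact layout; it also reports the uuids of the killed VMs via report_self_fencer_triggered.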
            while self.run_fencer(cmd.vgUuid, created_time):
                try:
                    time.sleep(cmd.interval)

                    health = lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, check_pv=False)
                    logger.debug("sharedblock group primary storage %s fencer run result: %s" % (cmd.vgUuid, health))
                    if health[0] is True:
                        failure = 0
                        continue

                    failure += 1
                    if failure < cmd.maxAttempts:
                        continue

                    try:
                        logger.warn("shared block storage %s fencer fired!" % cmd.vgUuid)
                        self.report_storage_status([cmd.vgUuid], 'Disconnected', health[1])

                        # check one qcow2 per PV to determine which volumes on that PV should be killed
                        invalid_pv_uuids = lvm.get_invalid_pv_uuids(cmd.vgUuid, cmd.checkIo)
                        vms = lvm.get_running_vm_root_volume_on_pv(cmd.vgUuid, invalid_pv_uuids, True)
                        killed_vm_uuids = []
                        for vm in vms:
                            kill = shell.ShellCmd('kill -9 %s' % vm.pid)
                            kill(False)
                            if kill.return_code == 0:
                                logger.warn(
                                    'kill the vm[uuid:%s, pid:%s] because we lost connection to the storage.'
                                    'failed to run health check %s times' % (vm.uuid, vm.pid, cmd.maxAttempts))
                                killed_vm_uuids.append(vm.uuid)
                            else:
                                logger.warn(
                                    'failed to kill the vm[uuid:%s, pid:%s] %s' % (vm.uuid, vm.pid, kill.stderr))

                            for volume in vm.volumes:
                                used_process = linux.linux_lsof(volume)
                                if len(used_process) == 0:
                                    try:
                                        lvm.deactive_lv(volume, False)
                                    except Exception as e:
                                        logger.debug("deactivate volume %s for vm %s failed, %s" % (volume, vm.uuid, e.message))
                                        content = traceback.format_exc()
                                        logger.warn("traceback: %s" % content)
                                else:
                                    logger.debug("volume %s still used: %s, skip to deactivate" % (volume, used_process))

                        if len(killed_vm_uuids) != 0:
                            self.report_self_fencer_triggered([cmd.vgUuid], ','.join(killed_vm_uuids))
                        lvm.remove_partial_lv_dm(cmd.vgUuid)

                        if lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, True)[0] is False:
                            lvm.drop_vg_lock(cmd.vgUuid)
                            lvm.remove_device_map_for_vg(cmd.vgUuid)

                        # reset the failure count
                        failure = 0
                    except Exception as e:
                        logger.warn("kill vm failed, %s" % e.message)
                        content = traceback.format_exc()
                        logger.warn("traceback: %s" % content)

                except Exception as e:
                    logger.debug('self-fencer on sharedblock primary storage %s stopped abnormally, try again soon...' % cmd.vgUuid)
                    content = traceback.format_exc()
                    logger.warn(content)

            if not self.run_fencer(cmd.vgUuid, created_time):
                logger.debug('stop self-fencer on sharedblock primary storage %s for judger failed' % cmd.vgUuid)
            else:
                logger.warn('stop self-fencer on sharedblock primary storage %s' % cmd.vgUuid)