def migrate_volumes(self, req):
    cmd = jsonobject.loads(req[http.REQUEST_BODY])
    rsp = AgentRsp()

    for struct in cmd.migrateVolumeStructs:
        target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
        current_abs_path = translate_absolute_path_from_install_path(struct.currentInstallPath)
        with lvm.OperateLv(current_abs_path, shared=True):
            virtual_size = lvm.get_lv_size(current_abs_path)

            if lvm.lv_exists(target_abs_path):
                target_ps_uuid = get_primary_storage_uuid_from_install_path(struct.targetInstallPath)
                raise Exception("found %s already exists on ps %s" % (target_abs_path, target_ps_uuid))
            lvm.create_lv_from_absolute_path(target_abs_path, virtual_size,
                                             "%s::%s::%s" % (VOLUME_TAG, cmd.hostUuid, time.time()))
            lvm.active_lv(target_abs_path, lvm.LvmlockdLockType.SHARE)

    try:
        for struct in cmd.migrateVolumeStructs:
            target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
            current_abs_path = translate_absolute_path_from_install_path(struct.currentInstallPath)
            with lvm.OperateLv(current_abs_path, shared=True):
                bash.bash_errorout("cp %s %s" % (current_abs_path, target_abs_path))

        for struct in cmd.migrateVolumeStructs:
            target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
            current_abs_path = translate_absolute_path_from_install_path(struct.currentInstallPath)
            with lvm.RecursiveOperateLv(current_abs_path, shared=True):
                previous_ps_uuid = get_primary_storage_uuid_from_install_path(struct.currentInstallPath)
                target_ps_uuid = get_primary_storage_uuid_from_install_path(struct.targetInstallPath)
                current_backing_file = linux.qcow2_get_backing_file(current_abs_path)  # type: str

                if current_backing_file is not None and current_backing_file != "":
                    # only rebase when the source qcow2 actually has a backing file;
                    # the copied qcow2 is rebased onto the backing file on the target primary storage
                    target_backing_file = current_backing_file.replace(previous_ps_uuid, target_ps_uuid)
                    lvm.do_active_lv(target_backing_file, lvm.LvmlockdLockType.SHARE, False)
                    logger.debug("rebase %s to %s" % (target_abs_path, target_backing_file))
                    linux.qcow2_rebase_no_check(target_backing_file, target_abs_path)

                if struct.compareQcow2:
                    bash.bash_errorout("time qemu-img compare %s %s" % (current_abs_path, target_abs_path))
    except Exception as e:
        for struct in cmd.migrateVolumeStructs:
            target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
            if struct.currentInstallPath == struct.targetInstallPath:
                logger.debug("current install path %s equals target %s, skip to delete" %
                             (struct.currentInstallPath, struct.targetInstallPath))
            else:
                logger.debug("error happened, delete lv %s" % target_abs_path)
                lvm.delete_lv(target_abs_path, False)
        raise e
    finally:
        for struct in cmd.migrateVolumeStructs:
            target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
            lvm.deactive_lv(target_abs_path)

    rsp.totalCapacity, rsp.availableCapacity = lvm.get_vg_size(cmd.vgUuid)
    return jsonobject.dumps(rsp)
def deactive_lvs_on_vg(vgUuid):
    active_lvs = lvm.list_local_active_lvs(vgUuid)
    if len(active_lvs) == 0:
        return
    logger.warn("active lvs %s will be deactivate" % active_lvs)
    lvm.deactive_lv(vgUuid)
    active_lvs = lvm.list_local_active_lvs(vgUuid)
    if len(active_lvs) != 0:
        raise RetryException("lvs [%s] still active, retry deactive again" % active_lvs)
def deactive_drbd_resouces_on_vg(vgUuid):
    active_lvs = lvm.list_local_active_lvs(vgUuid)
    if len(active_lvs) == 0:
        return
    drbd_resources = [drbd.DrbdResource(lv.split("/")[-1]) for lv in active_lvs]
    for r in drbd_resources:
        r.destroy()
    logger.warn("active lvs %s will be deactivate" % active_lvs)
    lvm.deactive_lv(vgUuid)
    active_lvs = lvm.list_local_active_lvs(vgUuid)
    if len(active_lvs) != 0:
        raise RetryException("lvs [%s] still active, retry deactive again" % active_lvs)
def handle_lv(lockType, fpath):
    if lockType > lvm.LvmlockdLockType.NULL:
        lvm.active_lv(fpath, lockType == lvm.LvmlockdLockType.SHARE)
    else:
        try:
            lvm.deactive_lv(fpath)
        except Exception as e:
            # killProcess is defined outside this function (enclosing scope);
            # if killing processes was not requested, give up on the deactivation
            if not killProcess:
                return
            qemus = lvm.find_qemu_for_lv_in_use(fpath)
            if len(qemus) == 0:
                return
            for qemu in qemus:
                if qemu.state != "running":
                    linux.kill_process(qemu.pid)
            lvm.deactive_lv(fpath)
def disconnect(self, req):
    cmd = jsonobject.loads(req[http.REQUEST_BODY])
    rsp = AgentRsp()

    @linux.retry(times=3, sleep_time=random.uniform(0.1, 3))
    def find_vg(vgUuid):
        cmd = shell.ShellCmd("vgs %s -otags | grep %s" % (vgUuid, INIT_TAG))
        cmd(is_exception=False)
        if cmd.return_code == 0:
            return True

        logger.debug("can not find vg %s with tag %s" % (vgUuid, INIT_TAG))
        cmd = shell.ShellCmd("vgs %s" % vgUuid)
        cmd(is_exception=False)
        if cmd.return_code == 0:
            logger.warn("found vg %s without tag %s" % (vgUuid, INIT_TAG))
            return True

        raise RetryException("can not find vg %s with or without tag %s" % (vgUuid, INIT_TAG))

    try:
        find_vg(cmd.vgUuid)
    except RetryException:
        logger.debug("can not find vg %s; return success" % cmd.vgUuid)
        return jsonobject.dumps(rsp)
    except Exception as e:
        raise e

    active_lvs = lvm.list_local_active_lvs(cmd.vgUuid)
    if len(active_lvs) != 0:
        logger.warn("active lvs %s will be deactivate" % active_lvs)
        lvm.deactive_lv(cmd.vgUuid)

    lvm.clean_vg_exists_host_tags(cmd.vgUuid, cmd.hostUuid, HEARTBEAT_TAG)
    lvm.stop_vg_lock(cmd.vgUuid)
    return jsonobject.dumps(rsp)
def heartbeat_on_sharedblock():
    fire = 0
    failure = 0

    while self.run_fencer(cmd.vgUuid, created_time):
        try:
            time.sleep(cmd.interval)
            global last_multipath_run
            if cmd.fail_if_no_path and time.time() - last_multipath_run > 3600:
                last_multipath_run = time.time()
                thread.ThreadFacade.run_in_thread(linux.set_fail_if_no_path)

            health = lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, check_pv=False)
            logger.debug("sharedblock group primary storage %s fencer run result: %s" % (cmd.vgUuid, health))
            if health[0] is True:
                fire = 0
                failure = 0
                continue

            failure += 1
            if failure < cmd.maxAttempts:
                continue

            if self.fencer_fire_timestamp.get(cmd.vgUuid) is not None and \
                    time.time() > self.fencer_fire_timestamp.get(cmd.vgUuid) and \
                    time.time() - self.fencer_fire_timestamp.get(cmd.vgUuid) < (300 * (fire + 1 if fire < 10 else 10)):
                logger.warn("last fencer fire: %s, now: %s, passed: %s seconds, within %s seconds, skip fire",
                            self.fencer_fire_timestamp[cmd.vgUuid], time.time(),
                            time.time() - self.fencer_fire_timestamp.get(cmd.vgUuid),
                            300 * (fire + 1 if fire < 10 else 10))
                failure = 0
                continue

            self.fencer_fire_timestamp[cmd.vgUuid] = time.time()
            try:
                logger.warn("shared block storage %s fencer fired!" % cmd.vgUuid)
                self.report_storage_status([cmd.vgUuid], 'Disconnected', health[1])
                fire += 1

                if cmd.strategy == 'Permissive':
                    continue

                # we will check one qcow2 per pv to determine volumes on pv should be kill
                invalid_pv_uuids = lvm.get_invalid_pv_uuids(cmd.vgUuid, cmd.checkIo)
                vms = lvm.get_running_vm_root_volume_on_pv(cmd.vgUuid, invalid_pv_uuids, True)
                killed_vm_uuids = []
                for vm in vms:
                    kill = shell.ShellCmd('kill -9 %s' % vm.pid)
                    kill(False)
                    if kill.return_code == 0:
                        logger.warn('kill the vm[uuid:%s, pid:%s] because we lost connection to the storage. '
                                    'failed to run health check %s times' % (vm.uuid, vm.pid, cmd.maxAttempts))
                        killed_vm_uuids.append(vm.uuid)
                    else:
                        logger.warn('failed to kill the vm[uuid:%s, pid:%s] %s' % (vm.uuid, vm.pid, kill.stderr))

                    for volume in vm.volumes:
                        used_process = linux.linux_lsof(volume)
                        if len(used_process) == 0:
                            try:
                                lvm.deactive_lv(volume, False)
                            except Exception as e:
                                logger.debug("deactivate volume %s for vm %s failed, %s" % (volume, vm.uuid, e.message))
                                content = traceback.format_exc()
                                logger.warn("traceback: %s" % content)
                        else:
                            logger.debug("volume %s still used: %s, skip to deactivate" % (volume, used_process))

                if len(killed_vm_uuids) != 0:
                    self.report_self_fencer_triggered([cmd.vgUuid], ','.join(killed_vm_uuids))
                    clean_network_config(killed_vm_uuids)

                lvm.remove_partial_lv_dm(cmd.vgUuid)

                if lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, True)[0] is False:
                    lvm.drop_vg_lock(cmd.vgUuid)
                    lvm.remove_device_map_for_vg(cmd.vgUuid)
            except Exception as e:
                logger.warn("kill vm failed, %s" % e.message)
                content = traceback.format_exc()
                logger.warn("traceback: %s" % content)
            finally:
                failure = 0

        except Exception as e:
            logger.debug('self-fencer on sharedblock primary storage %s stopped abnormally, try again soon...' % cmd.vgUuid)
            content = traceback.format_exc()
            logger.warn(content)

    if not self.run_fencer(cmd.vgUuid, created_time):
        logger.debug('stop self-fencer on sharedblock primary storage %s for judger failed' % cmd.vgUuid)
    else:
        logger.warn('stop self-fencer on sharedblock primary storage %s' % cmd.vgUuid)
def handle_lv(lockType, fpath):
    if lockType > lvm.LvmlockdLockType.NULL:
        lvm.active_lv(fpath, lockType == lvm.LvmlockdLockType.SHARE)
    else:
        lvm.deactive_lv(fpath)
def migrate_volumes(self, req):
    cmd = jsonobject.loads(req[http.REQUEST_BODY])
    rsp = AgentRsp()

    for struct in cmd.migrateVolumeStructs:
        target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
        current_abs_path = translate_absolute_path_from_install_path(struct.currentInstallPath)
        with lvm.OperateLv(current_abs_path, shared=True):
            lv_size = lvm.get_lv_size(current_abs_path)

            if lvm.lv_exists(target_abs_path):
                target_ps_uuid = get_primary_storage_uuid_from_install_path(struct.targetInstallPath)
                raise Exception("found %s already exists on ps %s" % (target_abs_path, target_ps_uuid))
            lvm.create_lv_from_absolute_path(target_abs_path, lvm.getOriginalSize(lv_size),
                                             "%s::%s::%s" % (VOLUME_TAG, cmd.hostUuid, time.time()))
            lvm.active_lv(target_abs_path, lvm.LvmlockdLockType.SHARE)

    try:
        for struct in cmd.migrateVolumeStructs:
            target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
            current_abs_path = translate_absolute_path_from_install_path(struct.currentInstallPath)
            with lvm.OperateLv(current_abs_path, shared=True):
                bash.bash_errorout("cp %s %s" % (current_abs_path, target_abs_path))

        for struct in cmd.migrateVolumeStructs:
            target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
            current_abs_path = translate_absolute_path_from_install_path(struct.currentInstallPath)
            with lvm.RecursiveOperateLv(current_abs_path, shared=True):
                previous_ps_uuid = get_primary_storage_uuid_from_install_path(struct.currentInstallPath)
                target_ps_uuid = get_primary_storage_uuid_from_install_path(struct.targetInstallPath)
                current_backing_file = linux.qcow2_get_backing_file(current_abs_path)  # type: str

                if struct.compareQcow2:
                    logger.debug("comparing qcow2 between %s and %s" % (current_abs_path, target_abs_path))
                    if not self.compare(current_abs_path, target_abs_path):
                        raise Exception("qcow2 %s and %s are not identical" % (current_abs_path, target_abs_path))
                    logger.debug("confirmed qcow2 %s and %s are identical" % (current_abs_path, target_abs_path))

                if current_backing_file is not None and current_backing_file != "":
                    # only rebase when the source qcow2 actually has a backing file;
                    # the copied qcow2 is rebased onto the backing file on the target primary storage
                    target_backing_file = current_backing_file.replace(previous_ps_uuid, target_ps_uuid)
                    lvm.do_active_lv(target_backing_file, lvm.LvmlockdLockType.SHARE, False)
                    logger.debug("rebase %s to %s" % (target_abs_path, target_backing_file))
                    linux.qcow2_rebase_no_check(target_backing_file, target_abs_path)
    except Exception as e:
        for struct in cmd.migrateVolumeStructs:
            target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
            if struct.currentInstallPath == struct.targetInstallPath:
                logger.debug("current install path %s equals target %s, skip to delete" %
                             (struct.currentInstallPath, struct.targetInstallPath))
            else:
                logger.debug("error happened, delete lv %s" % target_abs_path)
                lvm.delete_lv(target_abs_path, False)
        raise e
    finally:
        for struct in cmd.migrateVolumeStructs:
            target_abs_path = translate_absolute_path_from_install_path(struct.targetInstallPath)
            lvm.deactive_lv(target_abs_path)

    rsp.totalCapacity, rsp.availableCapacity = lvm.get_vg_size(cmd.vgUuid)
    return jsonobject.dumps(rsp)
def heartbeat_on_sharedblock():
    failure = 0

    while self.run_sharedblock_fencer[cmd.vgUuid] is True:
        try:
            time.sleep(cmd.interval)

            health = lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, check_pv=False)
            logger.debug("sharedblock group primary storage %s fencer run result: %s" % (cmd.vgUuid, health))
            if health[0] is True:
                failure = 0
                continue

            failure += 1
            if failure < cmd.maxAttempts:
                continue

            try:
                logger.warn("shared block storage %s fencer fired!" % cmd.vgUuid)
                self.report_storage_status([cmd.vgUuid], 'Disconnected', health[1])

                # we will check one qcow2 per pv to determine volumes on pv should be kill
                invalid_pv_uuids = lvm.get_invalid_pv_uuids(cmd.vgUuid, cmd.checkIo)
                vms = lvm.get_running_vm_root_volume_on_pv(cmd.vgUuid, invalid_pv_uuids, cmd.checkIo)
                for vm in vms:
                    kill = shell.ShellCmd('kill -9 %s' % vm.pid)
                    kill(False)
                    if kill.return_code == 0:
                        logger.warn('kill the vm[uuid:%s, pid:%s] because we lost connection to the storage. '
                                    'failed to run health check %s times' % (vm.uuid, vm.pid, cmd.maxAttempts))
                    else:
                        logger.warn('failed to kill the vm[uuid:%s, pid:%s] %s' % (vm.uuid, vm.pid, kill.stderr))

                    for volume in vm.volumes:
                        used_process = linux.linux_lsof(volume)
                        if len(used_process) == 0:
                            try:
                                lvm.deactive_lv(volume, False)
                            except Exception as e:
                                logger.debug("deactivate volume %s for vm %s failed, %s" % (volume, vm.uuid, e.message))
                                content = traceback.format_exc()
                                logger.warn("traceback: %s" % content)
                        else:
                            logger.debug("volume %s still used: %s, skip to deactivate" % (volume, used_process))

                lvm.remove_partial_lv_dm(cmd.vgUuid)

                if lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, True)[0] is False:
                    lvm.drop_vg_lock(cmd.vgUuid)
                    lvm.remove_device_map_for_vg(cmd.vgUuid)

                # reset the failure count
                failure = 0
            except Exception as e:
                logger.warn("kill vm failed, %s" % e.message)
                content = traceback.format_exc()
                logger.warn("traceback: %s" % content)

        except Exception as e:
            logger.debug('self-fencer on sharedblock primary storage %s stopped abnormally' % cmd.vgUuid)
            content = traceback.format_exc()
            logger.warn(content)

    logger.debug('stop self-fencer on sharedblock primary storage %s' % cmd.vgUuid)
def heartbeat_on_sharedblock():
    failure = 0

    while self.run_fencer(cmd.vgUuid, created_time):
        try:
            time.sleep(cmd.interval)

            health = lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, check_pv=False)
            logger.debug("sharedblock group primary storage %s fencer run result: %s" % (cmd.vgUuid, health))
            if health[0] is True:
                failure = 0
                continue

            failure += 1
            if failure < cmd.maxAttempts:
                continue

            try:
                logger.warn("shared block storage %s fencer fired!" % cmd.vgUuid)
                self.report_storage_status([cmd.vgUuid], 'Disconnected', health[1])

                # we will check one qcow2 per pv to determine volumes on pv should be kill
                invalid_pv_uuids = lvm.get_invalid_pv_uuids(cmd.vgUuid, cmd.checkIo)
                vms = lvm.get_running_vm_root_volume_on_pv(cmd.vgUuid, invalid_pv_uuids, True)
                killed_vm_uuids = []
                for vm in vms:
                    kill = shell.ShellCmd('kill -9 %s' % vm.pid)
                    kill(False)
                    if kill.return_code == 0:
                        logger.warn('kill the vm[uuid:%s, pid:%s] because we lost connection to the storage. '
                                    'failed to run health check %s times' % (vm.uuid, vm.pid, cmd.maxAttempts))
                        killed_vm_uuids.append(vm.uuid)
                    else:
                        logger.warn('failed to kill the vm[uuid:%s, pid:%s] %s' % (vm.uuid, vm.pid, kill.stderr))

                    for volume in vm.volumes:
                        used_process = linux.linux_lsof(volume)
                        if len(used_process) == 0:
                            try:
                                lvm.deactive_lv(volume, False)
                            except Exception as e:
                                logger.debug("deactivate volume %s for vm %s failed, %s" % (volume, vm.uuid, e.message))
                                content = traceback.format_exc()
                                logger.warn("traceback: %s" % content)
                        else:
                            logger.debug("volume %s still used: %s, skip to deactivate" % (volume, used_process))

                if len(killed_vm_uuids) != 0:
                    self.report_self_fencer_triggered([cmd.vgUuid], ','.join(killed_vm_uuids))

                lvm.remove_partial_lv_dm(cmd.vgUuid)

                if lvm.check_vg_status(cmd.vgUuid, cmd.storageCheckerTimeout, True)[0] is False:
                    lvm.drop_vg_lock(cmd.vgUuid)
                    lvm.remove_device_map_for_vg(cmd.vgUuid)

                # reset the failure count
                failure = 0
            except Exception as e:
                logger.warn("kill vm failed, %s" % e.message)
                content = traceback.format_exc()
                logger.warn("traceback: %s" % content)

        except Exception as e:
            logger.debug('self-fencer on sharedblock primary storage %s stopped abnormally, try again soon...' % cmd.vgUuid)
            content = traceback.format_exc()
            logger.warn(content)

    if not self.run_fencer(cmd.vgUuid, created_time):
        logger.debug('stop self-fencer on sharedblock primary storage %s for judger failed' % cmd.vgUuid)
    else:
        logger.warn('stop self-fencer on sharedblock primary storage %s' % cmd.vgUuid)