def _recoverExistingVms(self):
    try:
        # Starting up libvirt might take long when the host is under
        # high load; we prefer running this code in an external thread
        # to avoid blocking the API response.
        mog = min(config.getint('vars', 'max_outgoing_migrations'),
                  caps.CpuTopology().cores())
        migration.SourceThread.setMaxOutgoingMigrations(mog)

        # Recover
        for v in getVDSMDomains():
            vmId = v.UUIDString()
            if not self._recoverVm(vmId):
                # qemu process with no recovery data
                self.log.info('loose qemu process with id: '
                              '%s found, killing it.', vmId)
                try:
                    v.destroy()
                except libvirt.libvirtError:
                    self.log.error('failed to kill loose qemu '
                                   'process with id: %s', vmId,
                                   exc_info=True)

        # We do this to safely handle VMs which disappeared from the
        # host while VDSM was down/restarting.
        recVms = self._getVDSMVmsFromRecovery()
        if recVms:
            self.log.warning('Found %i VMs from recovery files not'
                             ' reported by libvirt.'
                             ' This should not happen!'
                             ' Will try to recover them.', len(recVms))

        for vmId in recVms:
            if not self._recoverVm(vmId):
                self.log.warning('VM %s failed to recover from recovery'
                                 ' file, reported as Down', vmId)

        while (self._enabled and
               vmstatus.WAIT_FOR_LAUNCH in [v.lastStatus for v
                                            in self.vmContainer.values()]):
            time.sleep(1)

        self._cleanOldFiles()
        self._recovery = False

        # If we have VMs to restore we should wait for the pool
        # connection and then prepare all volumes. Actually, we need it
        # just to get the resources for future volume manipulations.
        while self._enabled and self.vmContainer and \
                not self.irs.getConnectedStoragePoolsList()['poollist']:
            time.sleep(5)

        for vmId, vmObj in self.vmContainer.items():
            # Let's recover as many VMs as possible.
            try:
                # Do not prepare volumes when the system goes down.
                if self._enabled:
                    vmObj.preparePaths(
                        vmObj.buildConfDevices()[vm.DISK_DEVICES])
            except Exception:
                self.log.error("Vm %s recovery failed",
                               vmId, exc_info=True)
    except Exception:
        self.log.error("Vm's recovery failed", exc_info=True)
        raise
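# For context: getVDSMDomains() used above is defined elsewhere in VDSM.
# Below is a minimal sketch of how such a helper could enumerate the
# libvirt domains the loop iterates over, assuming VDSM-owned guests are
# recognizable by a dedicated guest-agent channel in their domain XML;
# the channel name and the read-only connection are assumptions, not
# the real VDSM implementation.
import xml.etree.ElementTree as ET

import libvirt

VDSM_CHANNEL = 'com.redhat.rhevm.vdsm'  # assumed channel target name


def getVDSMDomains():
    """Sketch: return libvirt domains that look VDSM-created."""
    conn = libvirt.openReadOnly('qemu:///system')
    try:
        doms = []
        for dom in conn.listAllDomains():
            root = ET.fromstring(dom.XMLDesc(0))
            # A VDSM guest carries a virtio channel used by the host agent.
            if root.findall("./devices/channel/target[@name='%s']"
                            % VDSM_CHANNEL):
                doms.append(dom)
        return doms
    finally:
        conn.close()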
def _recoverExistingVms(self):
    try:
        # Starting up libvirt might take long when the host is under
        # high load; we prefer running this code in an external thread
        # to avoid blocking the API response.
        mog = min(config.getint('vars', 'max_outgoing_migrations'),
                  caps.CpuTopology().cores())
        migration.SourceThread.setMaxOutgoingMigrations(mog)

        # Recover
        for v in getVDSMDomains():
            vmId = v.UUIDString()
            if not self._recoverVm(vmId):
                # qemu process with no recovery data
                self.log.info('loose qemu process with id: '
                              '%s found, killing it.', vmId)
                try:
                    v.destroy()
                except libvirt.libvirtError:
                    self.log.error('failed to kill loose qemu '
                                   'process with id: %s', vmId,
                                   exc_info=True)

        # We do this to safely handle VMs which disappeared from the
        # host while VDSM was down/restarting.
        recVms = self._getVDSMVmsFromRecovery()
        if recVms:
            self.log.warning('Found %i VMs from recovery files not'
                             ' reported by libvirt.'
                             ' This should not happen!'
                             ' Will try to recover them.', len(recVms))

        for vmId in recVms:
            if not self._recoverVm(vmId):
                self.log.warning('VM %s failed to recover from recovery'
                                 ' file, reported as Down', vmId)

        while (self._enabled and
               vmstatus.WAIT_FOR_LAUNCH in [v.lastStatus for v
                                            in self.vmContainer.values()]):
            time.sleep(1)

        self._cleanOldFiles()
        self._recovery = False

        # If we have VMs to restore we should wait for the pool
        # connection and then prepare all volumes. Actually, we need it
        # just to get the resources for future volume manipulations.
        while self._enabled and self.vmContainer and \
                not self.irs.getConnectedStoragePoolsList()['poollist']:
            time.sleep(5)

        for vmId, vmObj in self.vmContainer.items():
            # Let's recover as many VMs as possible.
            try:
                # Do not prepare volumes when the system goes down.
                if self._enabled:
                    vmObj.preparePaths(
                        vmObj.devSpecMapFromConf()[hwclass.DISK])
            except Exception:
                self.log.error("Vm %s recovery failed",
                               vmId, exc_info=True)
    except Exception:
        self.log.error("Vm's recovery failed", exc_info=True)
        raise
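# The only functional change from the previous version is the disk
# lookup: devSpecMapFromConf()[hwclass.DISK] replaces
# buildConfDevices()[vm.DISK_DEVICES]. Below is a toy sketch of the
# shape such a device-spec map plausibly has; the keys and fields are
# illustrative assumptions, not the real VDSM schema.
DISK = 'disk'        # stands in for hwclass.DISK
INTERFACE = 'interface'


def devSpecMapFromConf(conf):
    """Sketch: group raw device dicts from the VM config by class."""
    dev_map = {DISK: [], INTERFACE: []}
    for dev in conf.get('devices', []):
        dev_map.setdefault(dev.get('type'), []).append(dev)
    return dev_map


# Example: the list handed to preparePaths() would look roughly like
# [{'type': 'disk', 'path': '/rhev/data-center/pool/disk1'}].
_conf = {'devices': [
    {'type': DISK, 'path': '/rhev/data-center/pool/disk1'},
    {'type': INTERFACE, 'macAddr': '00:1a:4a:16:01:51'},
]}
disk_specs = devSpecMapFromConf(_conf)[DISK]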
def _recoverExistingVms(self):
    start_time = utils.monotonic_time()
    try:
        self.log.debug('recovery: started')

        # Starting up libvirt might take long when the host is under
        # high load; we prefer running this code in an external thread
        # to avoid blocking the API response.
        mog = min(config.getint('vars', 'max_outgoing_migrations'),
                  caps.CpuTopology().cores())
        migration.SourceThread.setMaxOutgoingMigrations(mog)

        # Recover stage 1: domains from libvirt
        doms = getVDSMDomains()
        num_doms = len(doms)
        for idx, v in enumerate(doms):
            vmId = v.UUIDString()
            if self._recoverVm(vmId):
                self.log.info(
                    'recovery [1:%d/%d]: recovered domain %s from libvirt',
                    idx + 1, num_doms, vmId)
            else:
                self.log.info(
                    'recovery [1:%d/%d]: loose domain %s found,'
                    ' killing it.', idx + 1, num_doms, vmId)
                try:
                    v.destroy()
                except libvirt.libvirtError:
                    self.log.exception(
                        'recovery [1:%d/%d]: failed to kill loose'
                        ' domain %s', idx + 1, num_doms, vmId)

        # Recover stage 2: domains from recovery files
        # We do this to safely handle VMs which disappeared from the
        # host while VDSM was down/restarting.
        rec_vms = self._getVDSMVmsFromRecovery()
        num_rec_vms = len(rec_vms)
        if rec_vms:
            self.log.warning(
                'recovery: found %i VMs from recovery files not'
                ' reported by libvirt. This should not happen!'
                ' Will try to recover them.', num_rec_vms)

        for idx, vmId in enumerate(rec_vms):
            if self._recoverVm(vmId):
                self.log.info(
                    'recovery [2:%d/%d]: recovered domain %s'
                    ' from data file', idx + 1, num_rec_vms, vmId)
            else:
                self.log.warning(
                    'recovery [2:%d/%d]: VM %s failed to recover from data'
                    ' file, reported as Down', idx + 1, num_rec_vms, vmId)

        # Recover stage 3: waiting for domains to go up
        while self._enabled:
            launching = sum(
                int(v.lastStatus == vmstatus.WAIT_FOR_LAUNCH)
                for v in self.vmContainer.values())
            if not launching:
                break
            else:
                self.log.info(
                    'recovery: waiting for %d domains to go up',
                    launching)
            time.sleep(1)

        self._cleanOldFiles()
        self._recovery = False

        # If we have VMs to restore we should wait for the pool
        # connection and then prepare all volumes. Actually, we need it
        # just to get the resources for future volume manipulations.
        while self._enabled and self.vmContainer and \
                not self.irs.getConnectedStoragePoolsList()['poollist']:
            self.log.info('recovery: waiting for storage pool to go up')
            time.sleep(5)

        vm_objects = self.vmContainer.values()
        num_vm_objects = len(vm_objects)
        for idx, vm_obj in enumerate(vm_objects):
            # Let's recover as many VMs as possible.
            try:
                # Do not prepare volumes when the system goes down.
                if self._enabled:
                    self.log.info(
                        'recovery [%d/%d]: preparing paths for'
                        ' domain %s', idx + 1, num_vm_objects, vm_obj.id)
                    vm_obj.preparePaths(
                        vm_obj.devSpecMapFromConf()[hwclass.DISK])
            except Exception:
                self.log.exception("recovery [%d/%d]: failed for vm %s",
                                   idx + 1, num_vm_objects, vm_obj.id)

        self.log.info('recovery: completed in %is',
                      utils.monotonic_time() - start_time)
    except Exception:
        self.log.exception("recovery: failed")
        raise
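# This version adds elapsed-time bookkeeping via utils.monotonic_time().
# A minimal stand-in, assuming all that is required is a clock immune to
# wall-clock jumps; time.monotonic() (Python 3.3+) fits, while older
# code would fall back to a platform-specific clock.
import time


def monotonic_time():
    """Sketch: seconds from an arbitrary fixed point, never going back."""
    return time.monotonic()


# Usage mirroring the function above:
# start_time = monotonic_time()
# ... do recovery work ...
# log.info('recovery: completed in %is', monotonic_time() - start_time)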
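# Minimal sketch of how a recovery pass like the one above can be kicked
# off in a background thread, per the comment about not blocking API
# responses while libvirt starts up. The clientIF scaffolding below is
# hypothetical; only the threading pattern is the point.
import threading


class clientIF(object):

    def __init__(self):
        self._enabled = True
        self._recovery = True

    def start(self):
        recovery_thread = threading.Thread(
            target=self._recoverExistingVms, name='vmrecovery')
        recovery_thread.daemon = True
        recovery_thread.start()

    def _recoverExistingVms(self):
        pass  # the routine shown above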