# Standard-library and libvirt imports needed by the code below; the other
# names used here (config, concurrent, conv, response, utils, sslutils,
# jsonrpcvdscli, xmlutils, hooks, MonitorThread, ...) are assumed to be
# imported or defined elsewhere in the full module.
import io
import threading
import time

import libvirt

from vdsm.common.compat import pickle
from vdsm.common.define import NORMAL
from vdsm.common.network.address import normalize_literal_addr
from vdsm.common.units import MiB
from vdsm.virt.utils import DynamicBoundedSemaphore
from vdsm.virt import virdomain
from vdsm.virt import vmexitreason
from vdsm.virt import vmstatus


MODE_REMOTE = 'remote'
MODE_FILE = 'file'

METHOD_ONLINE = 'online'

incomingMigrations = DynamicBoundedSemaphore(
    max(1, config.getint('vars', 'max_incoming_migrations')))

CONVERGENCE_SCHEDULE_SET_DOWNTIME = "setDowntime"
CONVERGENCE_SCHEDULE_POST_COPY = "postcopy"
CONVERGENCE_SCHEDULE_SET_ABORT = "abort"

ADDRESS = '0'
PORT = 54321


class MigrationDestinationSetupError(RuntimeError):
    """
    Failed to create migration destination VM.
    """
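

# Illustrative sketch of the convergence schedule structure consumed by
# SourceThread below; this is an assumed example derived from
# _legacy_convergence_schedule(), not a fixed API contract.  'init' actions
# are applied when the migration starts, 'stalling' actions are applied as
# progress stalls, in the order given by 'limit':
#
#     {'init': [{'name': CONVERGENCE_SCHEDULE_SET_DOWNTIME,
#                'params': ['100']}],
#      'stalling': [{'limit': 1,
#                    'action': {'name': CONVERGENCE_SCHEDULE_SET_DOWNTIME,
#                               'params': ['200']}},
#                   {'limit': -1,
#                    'action': {'name': CONVERGENCE_SCHEDULE_SET_ABORT,
#                               'params': []}}]}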


class SourceThread(object):
    """
    A thread that takes care of migration on the source vdsm.
    """
    _RECOVERY_LOOP_PAUSE = 10

    ongoingMigrations = DynamicBoundedSemaphore(1)

    def __init__(self, vm, dst='', dstparams='',
                 mode=MODE_REMOTE, method=METHOD_ONLINE,
                 tunneled=False, dstqemu='', abortOnError=False,
                 consoleAddress=None, compressed=False,
                 autoConverge=False, recovery=False, encrypted=False,
                 **kwargs):
        self.log = vm.log
        self._vm = vm
        self._dst = dst
        self._mode = mode
        self._dstparams = dstparams
        self._enableGuestEvents = kwargs.get('enableGuestEvents', False)
        # TODO: conv.tobool shouldn't be used in this constructor, the
        # conversions should be handled properly in the API layer
        self._consoleAddress = consoleAddress
        self._dstqemu = dstqemu
        self._encrypted = encrypted
        self._maxBandwidth = int(
            kwargs.get('maxBandwidth') or
            config.getint('vars', 'migration_max_bandwidth'))
        self._incomingLimit = kwargs.get('incomingLimit')
        self._outgoingLimit = kwargs.get('outgoingLimit')
        self.status = {
            'status': {
                'code': 0,
                'message': 'Migration in progress'
            }
        }
        # we need to guard against concurrent updates only
        self._lock = threading.Lock()
        self._progress = 0
        self._thread = concurrent.thread(
            self.run, name='migsrc/' + self._vm.id[:8])
        self._preparingMigrationEvt = True
        self._migrationCanceledEvt = threading.Event()
        self._monitorThread = None
        self._destServer = None
        self._legacy_payload_path = None

        if 'convergenceSchedule' in kwargs:
            self._convergence_schedule = kwargs['convergenceSchedule']
        else:
            # Needed for Engine < 4.3 or when legacy migration is used
            # as a supposedly rare fallback in Engine >= 4.3.
            self._convergence_schedule = \
                self._legacy_convergence_schedule(kwargs.get('downtime'))
            self.log.info('using a computed convergence schedule for '
                          'a legacy migration: %s',
                          self._convergence_schedule)
        self.log.debug('convergence schedule set to: %s',
                       str(self._convergence_schedule))

        self._started = False
        self._failed = False
        self._recovery = recovery
        tunneled = conv.tobool(tunneled)
        abortOnError = conv.tobool(abortOnError)
        compressed = conv.tobool(compressed)
        autoConverge = conv.tobool(autoConverge)
        self._migration_flags = self._calculate_migration_flags(
            tunneled, abortOnError, compressed, autoConverge, encrypted)

    def start(self):
        self._thread.start()

    def is_alive(self):
        return self._thread.is_alive()

    def migrating(self):
        """
        Return whether the thread currently manages a migration.

        That can be a migration directly supervised by the source thread and
        other threads (such as the downtime thread) or just an indirectly
        managed migration (detected on Vdsm recovery) without the threads
        actually running.
        """
        return ((self.is_alive() and not self._failed) or
                (self._recovery and
                 self._vm.lastStatus == vmstatus.MIGRATION_SOURCE))

    @property
    def started(self):
        return self._started

    @property
    def hibernating(self):
        return self._mode == MODE_FILE

    def _update_progress(self):
        if self._monitorThread is None:
            return

        # fetch migration status from the monitor thread
        if self._monitorThread.progress is not None:
            progress = self._monitorThread.progress.percentage
        else:
            progress = 0

        with self._lock:
            old_progress = self._progress
            if progress >= old_progress:
                self._progress = progress

        if progress < old_progress:
            self.log.info(
                'new computed progress %d < than old value %d, discarded',
                progress, old_progress)

    def getStat(self):
        """
        Get the status of the migration.
""" self._update_progress() self.status['progress'] = self._progress return self.status def _createClient(self, port): sslctx = sslutils.create_ssl_context() def is_ipv6_address(a): return (':' in a) and a.startswith('[') and a.endswith(']') if is_ipv6_address(self.remoteHost): host = self.remoteHost[1:-1] else: host = self.remoteHost client_socket = utils.create_connected_socket(host, int(port), sslctx) return self._vm.cif.createStompClient(client_socket) def _setupVdsConnection(self): if self.hibernating: return hostPort = _cannonize_host_port( self._dst, config.getint('addresses', 'management_port')) self.remoteHost, port = hostPort.rsplit(':', 1) client = self._createClient(port) requestQueues = config.get('addresses', 'request_queues') requestQueue = requestQueues.split(",")[0] self._destServer = jsonrpcvdscli.connect(requestQueue, client) self.log.debug('Initiating connection with destination') self._destServer.ping() self.log.debug('Destination server is: ' + hostPort) def _setupRemoteMachineParams(self): machineParams = self._vm.migration_parameters() machineParams['enableGuestEvents'] = self._enableGuestEvents if not self.hibernating: machineParams['migrationDest'] = 'libvirt' return machineParams def _prepareGuest(self): if self.hibernating: self.log.debug("Save State begins") if self._vm.guestAgent.isResponsive(): lockTimeout = 30 else: lockTimeout = 0 self._vm.guestAgent.desktopLock() # wait for lock or timeout while lockTimeout: if self._vm.getStats()['session'] in ["Locked", "LoggedOff"]: break time.sleep(1) lockTimeout -= 1 if lockTimeout == 0: self.log.warning('Agent ' + self._vm.id + ' unresponsive. Hiberanting without ' 'desktopLock.') break self._vm.pause(vmstatus.SAVING_STATE) else: self.log.debug("Migration started") self._vm.lastStatus = vmstatus.MIGRATION_SOURCE def _recover(self, message): if not response.is_error(self.status): self.status = response.error('migrateErr') self.log.error(message) if not self.hibernating and self._destServer is not None: if self._vm.post_copy == PostCopyPhase.RUNNING: # We can't recover a VM after a failed post-copy migration. # And the destination takes care of the situation itself. self._vm.handle_failed_post_copy(clean_vm=True) return try: self._destServer.destroy(self._vm.id) except Exception: self.log.exception("Failed to destroy remote VM") # if the guest was stopped before migration, we need to cont it if self.hibernating: self._vm.cont(ignoreStatus=True) if self._enableGuestEvents: self._vm.guestAgent.events.after_hibernation_failure() elif self._enableGuestEvents: self._vm.guestAgent.events.after_migration_failure() # either way, migration has finished self._failed = True if self._recovery: self._vm.set_last_status(vmstatus.UP, vmstatus.MIGRATION_SOURCE) self._recovery = False else: self._vm.lastStatus = vmstatus.UP self._started = False self._vm.send_status_event() def _finishSuccessfully(self, machineParams): with self._lock: self._progress = 100 if not self.hibernating: # TODO: We could use a timeout on the wait to be more robust # against "impossible" failures. But we don't have a good value to # use here now. 
            self._vm.stopped_migrated_event_processed.wait()
            self._vm.setDownStatus(NORMAL, vmexitreason.MIGRATION_SUCCEEDED)
            self.status['status']['message'] = 'Migration done'
            if self._vm.post_copy == PostCopyPhase.RUNNING:
                self._vm.destroy()
        else:
            # don't pickle transient params
            for ignoreParam in ('displayIp', 'display', 'pid'):
                if ignoreParam in machineParams:
                    del machineParams[ignoreParam]

            fname = self._vm.cif.prepareVolumePath(self._dstparams)
            try:
                # Use r+ to avoid truncating the file, see BZ#1282239
                with io.open(fname, "r+b") as f:
                    # protocol=2 is needed for clusters < 4.4
                    # (for Python 2 host compatibility)
                    pickle.dump(machineParams, f, protocol=2)
            finally:
                self._vm.cif.teardownVolumePath(self._dstparams)

            self._vm.setDownStatus(NORMAL, vmexitreason.SAVE_STATE_SUCCEEDED)
            self.status['status']['message'] = 'SaveState done'

    @staticmethod
    def _raiseAbortError():
        e = libvirt.libvirtError(defmsg='')
        # we have to override the value to get what we want
        # err might be None
        e.err = (
            libvirt.VIR_ERR_OPERATION_ABORTED,  # error code
            libvirt.VIR_FROM_QEMU,  # error domain
            'operation aborted',  # error message
            libvirt.VIR_ERR_WARNING,  # error level
            '', '', '',  # str1, str2, str3,
            -1, -1)  # int1, int2
        raise e

    def _update_outgoing_limit(self):
        if self._outgoingLimit:
            self.log.debug('Setting outgoing migration limit to %s',
                           self._outgoingLimit)
            SourceThread.ongoingMigrations.bound = self._outgoingLimit

    @property
    def recovery(self):
        """
        Return whether the source thread handles a recovered migration.

        This is when we detect the VM is migrating in Vdsm recovery and the
        source thread is not actually running.

        This serves to handle a possible already running migration detected
        during Vdsm recovery, for which no regular source thread exists.  We
        don't try to touch such a migration, but we still must ensure at
        least basic sanity:

        - Indication that the migration is running.
        - Canceling the migration.
        - Putting the VM into proper status after migration failure (in case
          the migration succeeds, we rely on the fact that the VM disappears
          and Vdsm detects that sooner or later).

        .. note::

           Just setting this flag doesn't mean that any migration is actually
           running, it just means that if a migration is running then the
           migration was started by another Vdsm instance.  When this flag is
           set then the VM may be actually migrating only if its status is
           `vmstatus.MIGRATION_SOURCE` or `vmstatus.WAIT_FOR_LAUNCH` (the
           latter is mostly irrelevant since we prevent most actions in that
           status).
        """
        return self._recovery

    def run(self):
        if self.recovery:
            self._recovery_run()
        else:
            self._regular_run()

    def _regular_run(self):
        self.log.debug("Starting migration source thread")
        self._recovery = False
        self._update_outgoing_limit()
        try:
            startTime = time.time()
            # Guest agent API version must be updated before _srcDomXML
            # is created to have the version in _srcDomXML metadata.
            self._vm.update_guest_agent_api_version()
            machineParams = self._setupRemoteMachineParams()
            self._setupVdsConnection()
            self._prepareGuest()

            while not self._started:
                try:
                    self.log.info("Migration semaphore: acquiring")
                    with SourceThread.ongoingMigrations:
                        self.log.info("Migration semaphore: acquired")
                        timeout = config.getint(
                            'vars', 'guest_lifecycle_event_reply_timeout')
                        if self.hibernating:
                            self._vm.guestAgent.events.before_hibernation(
                                wait_timeout=timeout)
                        elif self._enableGuestEvents:
                            self._vm.guestAgent.events.before_migration(
                                wait_timeout=timeout)
                        if self._migrationCanceledEvt.is_set():
                            self._raiseAbortError()
                        self.log.debug(
                            "migration semaphore acquired "
                            "after %d seconds", time.time() - startTime)
                        self._startUnderlyingMigration(
                            time.time(), machineParams)
                        self._finishSuccessfully(machineParams)
                except libvirt.libvirtError as e:
                    if e.get_error_code() == \
                            libvirt.VIR_ERR_OPERATION_ABORTED:
                        self.status = response.error(
                            'migCancelErr', message='Migration canceled')
                    raise
                except MigrationLimitExceeded:
                    retry_timeout = config.getint(
                        'vars', 'migration_retry_timeout')
                    self.log.debug("Migration destination busy. Initiating "
                                   "retry in %d seconds.", retry_timeout)
                    self._migrationCanceledEvt.wait(retry_timeout)
        except MigrationDestinationSetupError as e:
            self._recover(str(e))
            # we know what happened, no need to dump hollow stack trace
        except Exception as e:
            self._recover(str(e))
            self.log.exception("Failed to migrate")

    def _startUnderlyingMigration(self, startTime, machineParams):
        if self.hibernating:
            self._started = True
            self._vm.hibernate(self._dst)
        else:
            self._vm.prepare_migration()

            # Do not measure the time spent for creating the VM on the
            # destination.  In some cases some expensive operations can cause
            # the migration to get cancelled right after the transfer
            # started.
            destCreateStartTime = time.time()
            result = self._destServer.migrationCreate(machineParams,
                                                      self._incomingLimit)
            destCreationTime = time.time() - destCreateStartTime
            startTime += destCreationTime
            self.log.info('Creation of destination VM took: %d seconds',
                          destCreationTime)

            if response.is_error(result):
                self.status = result
                if response.is_error(result, 'migrateLimit'):
                    raise MigrationLimitExceeded()
                else:
                    raise MigrationDestinationSetupError(
                        'migration destination error: ' +
                        result['status']['message'])

            self._started = True

            # REQUIRED_FOR: destination Vdsm < 4.3
            if not self._vm.min_cluster_version(4, 3):
                payload_drives = self._vm.payload_drives()
                if payload_drives:
                    # Currently, only a single payload device may be present
                    payload_alias = payload_drives[0].alias
                    result = self._destServer.fullList(
                        vmList=(self._vm.id,))
                    vm_list = result.get('items')
                    remote_devices = vm_list[0].get('devices')
                    if remote_devices is not None:
                        payload_path = next(
                            (d['path'] for d in remote_devices
                             if d.get('alias') == payload_alias),
                            None)
                        if payload_path is not None:
                            self._legacy_payload_path = \
                                (payload_alias, payload_path)

            if config.getboolean('vars', 'ssl'):
                transport = 'tls'
            else:
                transport = 'tcp'

            duri = 'qemu+{}://{}/system'.format(
                transport, normalize_literal_addr(self.remoteHost))

            if self._encrypted:
                # TODO: Stop using host names here and set the host
                # name based certificate verification parameter once
                # the corresponding functionality is available in
                # libvirt, see https://bugzilla.redhat.com/1754533
                #
                # When an encrypted migration is requested, we must
                # use the host name (stored in 'dst') rather than the
                # IP address (stored in 'dstqemu') in order to match
                # the target certificate.
                # That means that encrypted migrations are incompatible
                # with setups that require an IP address to identify the
                # host properly, such as when a separate migration network
                # should be used or when using IPv4/IPv6 dual stack
                # configurations.
                dstqemu = self.remoteHost
            else:
                dstqemu = self._dstqemu
            if dstqemu:
                muri = 'tcp://{}'.format(normalize_literal_addr(dstqemu))
            else:
                muri = 'tcp://{}'.format(
                    normalize_literal_addr(self.remoteHost))
            self._vm.log.info('starting migration to %s '
                              'with miguri %s', duri, muri)
            self._monitorThread = MonitorThread(self._vm, startTime,
                                                self._convergence_schedule)
            self._perform_with_conv_schedule(duri, muri)
            self.log.info("migration took %d seconds to complete",
                          (time.time() - startTime) + destCreationTime)

    def _perform_migration(self, duri, muri):
        if self._vm.hasSpice and self._vm.conf.get('clientIp'):
            SPICE_MIGRATION_HANDOVER_TIME = 120
            self._vm._reviveTicket(SPICE_MIGRATION_HANDOVER_TIME)

        # FIXME: there is still a race here with libvirt,
        # if we call stop() and libvirt migrateToURI3 didn't start
        # we may return migration stop but it will start at libvirt
        # side
        self._preparingMigrationEvt = False
        if not self._migrationCanceledEvt.is_set():
            self._vm._dom.migrateToURI3(duri,
                                        self._migration_params(muri),
                                        self._migration_flags)
        else:
            self._raiseAbortError()

    def _migration_params(self, muri):
        params = {libvirt.VIR_MIGRATE_PARAM_BANDWIDTH: self._maxBandwidth}
        if not self.tunneled:
            params[libvirt.VIR_MIGRATE_PARAM_URI] = str(muri)
        if self._consoleAddress:
            graphics = 'spice' if self._vm.hasSpice else 'vnc'
            params[libvirt.VIR_MIGRATE_PARAM_GRAPHICS_URI] = str(
                '%s://%s' % (graphics, self._consoleAddress))
        # REQUIRED_FOR: destination Vdsm < 4.3
        if self._legacy_payload_path is not None:
            alias, path = self._legacy_payload_path
            dom = xmlutils.fromstring(self._vm.migratable_domain_xml())
            source = dom.find(".//alias[@name='%s']/../source" % (alias,))
            source.set('file', path)
            xml = xmlutils.tostring(dom)
            self._vm.log.debug("Migrating domain XML: %s", xml)
            params[libvirt.VIR_MIGRATE_PARAM_DEST_XML] = xml
        return params

    @property
    def tunneled(self):
        return self.migration_flags & libvirt.VIR_MIGRATE_TUNNELLED

    @property
    def migration_flags(self):
        return self._migration_flags

    def _calculate_migration_flags(self, tunneled, abort_on_error,
                                   compressed, auto_converge, encrypted):
        flags = libvirt.VIR_MIGRATE_LIVE | libvirt.VIR_MIGRATE_PEER2PEER
        if tunneled:
            flags |= libvirt.VIR_MIGRATE_TUNNELLED
        if abort_on_error:
            flags |= libvirt.VIR_MIGRATE_ABORT_ON_ERROR
        if compressed:
            flags |= libvirt.VIR_MIGRATE_COMPRESSED
        if auto_converge:
            flags |= libvirt.VIR_MIGRATE_AUTO_CONVERGE
        if encrypted:
            flags |= libvirt.VIR_MIGRATE_TLS
        if self._vm.min_cluster_version(4, 2):
            flags |= libvirt.VIR_MIGRATE_PERSIST_DEST
        # Migration may fail immediately when VIR_MIGRATE_POSTCOPY flag is
        # present in the following situations:
        # - The transport is not capable of full bidirectional
        #   connectivity: RDMA, tunnelled, pipe.
        # - Huge pages are used (doesn't apply to transparent huge pages).
        # - QEMU uses a file as a backing for memory.
        # - Perhaps non-shared block storage may cause some trouble.
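        #
        # The loop below enables post-copy only when the supplied schedule
        # explicitly requests it, i.e. when it contains a stalling entry of
        # (illustrative, assumed values) the form:
        #     {'limit': 3,
        #      'action': {'name': CONVERGENCE_SCHEDULE_POST_COPY,
        #                 'params': []}}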
        for stalling in self._convergence_schedule.get('stalling', []):
            action = stalling.get('action', {}).get('name')
            if action == CONVERGENCE_SCHEDULE_POST_COPY:
                flags |= libvirt.VIR_MIGRATE_POSTCOPY
                break

        return flags

    def _perform_with_conv_schedule(self, duri, muri):
        self._vm.log.debug('performing migration with conv schedule')
        with utils.running(self._monitorThread):
            self._perform_migration(duri, muri)
        self._monitorThread.join()

    def _legacy_convergence_schedule(self, max_downtime):
        # Simplified emulation of legacy non-scheduled migrations.
        if max_downtime is None:
            max_downtime = config.get('vars', 'migration_downtime')
        max_downtime = int(max_downtime)
        max_steps = config.getint('vars', 'migration_downtime_steps')
        downtimes = exponential_downtime(max_downtime, max_steps)

        def downtime_action(downtime):
            return {'params': [str(downtime)], 'name': 'setDowntime'}

        init = [downtime_action(next(downtimes))]
        stalling = []
        limit = 1
        for d in downtimes:
            stalling.append({'action': downtime_action(d), 'limit': limit})
            limit += 1
        stalling.append({'action': downtime_action(d), 'limit': 42})
        stalling.append({'action': {'params': [], 'name': 'abort'},
                         'limit': -1})
        return {'init': init, 'stalling': stalling}

    def set_max_bandwidth(self, bandwidth):
        self._vm.log.debug('setting migration max bandwidth to %d', bandwidth)
        self._maxBandwidth = bandwidth
        self._vm._dom.migrateSetMaxSpeed(bandwidth)

    def stop(self):
        # if it's locked we are before the migrateToURI3()
        # call so no need to abortJob()
        try:
            self._migrationCanceledEvt.set()
            self._vm._dom.abortJob()
        except libvirt.libvirtError:
            if not self._preparingMigrationEvt:
                raise
        if self._recovery:
            self._recover("Migration stopped")

    def _recovery_run(self):
        self.log.debug("Starting migration recovery thread")
        while True:
            job_stats = self._vm.job_stats()
            if not ongoing(job_stats):
                break
            time.sleep(self._RECOVERY_LOOP_PAUSE)
        self.log.debug("Recovered migration finished")
        # Successful migration is handled in VM.onJobCompleted, here we need
        # just to ensure that migration failures are detected and handled.
        if self._vm._dom.state(0)[0] == libvirt.VIR_DOMAIN_RUNNING:
            self.recovery_cleanup()

    def recovery_cleanup(self):
        """
        Finish and cleanup recovery migration if necessary.

        This is to handle the situation when we detect a failed migration
        outside the source thread.  The source thread usually handles failed
        migrations itself.  But the thread is not running after recovery so
        in such a case the source thread must be notified about the failed
        migration.  This is what this method serves for.
        """
        if self._recovery and \
           self._vm.lastStatus == vmstatus.MIGRATION_SOURCE:
            self._recover("Migration failed")
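

# Minimal usage sketch of SourceThread, under assumptions not shown in this
# excerpt: 'vm' is a fully wired Vm object (log, cif, libvirt domain) and
# 'dst.example.com' / '192.0.2.10' are placeholder destination addresses.
# It only illustrates the lifecycle; real callers live in the API layer.
#
#     mig = SourceThread(vm, dst='dst.example.com', dstqemu='192.0.2.10',
#                        autoConverge=True, maxBandwidth=32)
#     mig.start()
#     while mig.migrating():
#         vm.log.info('migration progress: %s%%',
#                     mig.getStat().get('progress', 0))
#         time.sleep(10)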


class SourceThread(object):
    """
    A thread that takes care of migration on the source vdsm.
    """
    ongoingMigrations = DynamicBoundedSemaphore(1)

    def __init__(self, vm, dst='', dstparams='',
                 mode=MODE_REMOTE, method=METHOD_ONLINE,
                 tunneled=False, dstqemu='', abortOnError=False,
                 consoleAddress=None, compressed=False,
                 autoConverge=False, **kwargs):
        self.log = vm.log
        self._vm = vm
        self._dst = dst
        self._mode = mode
        if method != METHOD_ONLINE:
            self.log.warning(
                'migration method %s is deprecated, forced to "online"',
                method)
        self._dstparams = dstparams
        self._enableGuestEvents = kwargs.get('enableGuestEvents', False)
        self._machineParams = {}
        # TODO: utils.tobool shouldn't be used in this constructor, the
        # conversions should be handled properly in the API layer
        self._tunneled = utils.tobool(tunneled)
        self._abortOnError = utils.tobool(abortOnError)
        self._consoleAddress = consoleAddress
        self._dstqemu = dstqemu
        self._downtime = kwargs.get('downtime') or \
            config.get('vars', 'migration_downtime')
        self._maxBandwidth = int(
            kwargs.get('maxBandwidth') or
            config.getint('vars', 'migration_max_bandwidth'))
        self._autoConverge = utils.tobool(autoConverge)
        self._compressed = utils.tobool(compressed)
        self._incomingLimit = kwargs.get('incomingLimit')
        self._outgoingLimit = kwargs.get('outgoingLimit')
        self.status = {
            'status': {
                'code': 0,
                'message': 'Migration in progress'
            }
        }
        self._progress = 0
        self._thread = concurrent.thread(self.run)
        self._preparingMigrationEvt = True
        self._migrationCanceledEvt = threading.Event()
        self._monitorThread = None
        self._destServer = None

        self._convergence_schedule = {'init': [], 'stalling': []}
        self._use_convergence_schedule = False
        if 'convergenceSchedule' in kwargs:
            self._convergence_schedule = kwargs.get('convergenceSchedule')
            self._use_convergence_schedule = True
            self.log.debug('convergence schedule set to: %s',
                           str(self._convergence_schedule))

    def start(self):
        self._thread.start()

    def is_alive(self):
        return self._thread.is_alive()

    @property
    def hibernating(self):
        return self._mode == MODE_FILE

    def getStat(self):
        """
        Get the status of the migration.
""" if self._monitorThread is not None: # fetch migration status from the monitor thread if self._monitorThread.progress is not None: self._progress = self._monitorThread.progress.percentage else: self._progress = 0 self.status['progress'] = self._progress stat = self._vm._dom.jobStats(libvirt.VIR_DOMAIN_JOB_STATS_COMPLETED) if 'downtime_net' in stat: self.status['downtime'] = stat['downtime_net'] return self.status def _createClient(self, port): sslctx = sslutils.create_ssl_context() def is_ipv6_address(a): return (':' in a) and a.startswith('[') and a.endswith(']') if is_ipv6_address(self.remoteHost): host = self.remoteHost[1:-1] else: host = self.remoteHost client_socket = utils.create_connected_socket(host, int(port), sslctx) return self._vm.cif.createStompClient(client_socket) def _setupVdsConnection(self): if self.hibernating: return hostPort = vdscli.cannonizeHostPort( self._dst, config.getint('addresses', 'management_port')) self.remoteHost, port = hostPort.rsplit(':', 1) try: client = self._createClient(port) requestQueues = config.get('addresses', 'request_queues') requestQueue = requestQueues.split(",")[0] self._destServer = jsonrpcvdscli.connect(requestQueue, client) self.log.debug('Initiating connection with destination') self._destServer.ping() except (JsonRpcBindingsError, JsonRpcNoResponseError): if config.getboolean('vars', 'ssl'): self._destServer = vdscli.connect( hostPort, useSSL=True, TransportClass=kaxmlrpclib.TcpkeepSafeTransport) else: self._destServer = kaxmlrpclib.Server('http://' + hostPort) self.log.debug('Destination server is: ' + hostPort) def _setupRemoteMachineParams(self): self._machineParams.update(self._vm.status()) # patch VM config for targets < 3.1 self._patchConfigForLegacy() self._machineParams['elapsedTimeOffset'] = \ time.time() - self._vm._startTime vmStats = self._vm.getStats() if 'username' in vmStats: self._machineParams['username'] = vmStats['username'] if 'guestIPs' in vmStats: self._machineParams['guestIPs'] = vmStats['guestIPs'] if 'guestFQDN' in vmStats: self._machineParams['guestFQDN'] = vmStats['guestFQDN'] self._machineParams['guestAgentAPIVersion'] = \ self._vm.guestAgent.effectiveApiVersion for k in ('_migrationParams', 'pid'): if k in self._machineParams: del self._machineParams[k] if not self.hibernating: self._machineParams['migrationDest'] = 'libvirt' self._machineParams['_srcDomXML'] = self._vm._dom.XMLDesc(0) self._machineParams['enableGuestEvents'] = self._enableGuestEvents def _prepareGuest(self): if self.hibernating: self.log.debug("Save State begins") if self._vm.guestAgent.isResponsive(): lockTimeout = 30 else: lockTimeout = 0 self._vm.guestAgent.desktopLock() # wait for lock or timeout while lockTimeout: if self._vm.getStats()['session'] in ["Locked", "LoggedOff"]: break time.sleep(1) lockTimeout -= 1 if lockTimeout == 0: self.log.warning('Agent ' + self._vm.id + ' unresponsive. 
                                     ' Hibernating without '
                                     'desktopLock.')
                    break
            self._vm.pause(vmstatus.SAVING_STATE)
        else:
            self.log.debug("Migration started")
            self._vm.lastStatus = vmstatus.MIGRATION_SOURCE

    def _recover(self, message):
        if not response.is_error(self.status):
            self.status = response.error('migrateErr')
        self.log.error(message)
        if not self.hibernating and self._destServer is not None:
            try:
                self._destServer.destroy(self._vm.id)
            except Exception:
                self.log.exception("Failed to destroy remote VM")
        # if the guest was stopped before migration, we need to cont it
        if self.hibernating:
            self._vm.cont(ignoreStatus=True)
            if self._enableGuestEvents:
                self._vm.guestAgent.events.after_hibernation_failure()
        elif self._enableGuestEvents:
            self._vm.guestAgent.events.after_migration_failure()
        # either way, migration has finished
        self._vm.lastStatus = vmstatus.UP
        self._vm.send_status_event()

    def _finishSuccessfully(self):
        self._progress = 100
        if not self.hibernating:
            self._vm.setDownStatus(NORMAL, vmexitreason.MIGRATION_SUCCEEDED)
            self.status['status']['message'] = 'Migration done'
        else:
            # don't pickle transient params
            for ignoreParam in ('displayIp', 'display', 'pid'):
                if ignoreParam in self._machineParams:
                    del self._machineParams[ignoreParam]

            fname = self._vm.cif.prepareVolumePath(self._dstparams)
            try:
                # Use r+ to avoid truncating the file, see BZ#1282239
                with open(fname, "r+") as f:
                    pickle.dump(self._machineParams, f)
            finally:
                self._vm.cif.teardownVolumePath(self._dstparams)

            self._vm.setDownStatus(NORMAL, vmexitreason.SAVE_STATE_SUCCEEDED)
            self.status['status']['message'] = 'SaveState done'

    def _patchConfigForLegacy(self):
        """
        Remove "cdrom" and "floppy" items from the VM config drives list
        and set them up as full paths.
        """
        # care only about the "drives" list, since
        # "devices" doesn't cause errors
        if 'drives' in self._machineParams:
            for item in ("cdrom", "floppy"):
                new_drives = []
                for drive in self._machineParams['drives']:
                    if drive['device'] == item:
                        self._machineParams[item] = drive['path']
                    else:
                        new_drives.append(drive)
                self._machineParams['drives'] = new_drives

        # vdsm < 4.13 expects this to exist
        self._machineParams['afterMigrationStatus'] = ''

    @staticmethod
    def _raiseAbortError():
        e = libvirt.libvirtError(defmsg='')
        # we have to override the value to get what we want
        # err might be None
        e.err = (
            libvirt.VIR_ERR_OPERATION_ABORTED,  # error code
            libvirt.VIR_FROM_QEMU,  # error domain
            'operation aborted',  # error message
            libvirt.VIR_ERR_WARNING,  # error level
            '', '', '',  # str1, str2, str3,
            -1, -1)  # int1, int2
        raise e

    def _update_outgoing_limit(self):
        if self._outgoingLimit:
            self.log.debug('Setting outgoing migration limit to %s',
                           self._outgoingLimit)
            SourceThread.ongoingMigrations.bound = self._outgoingLimit

    def run(self):
        self._update_outgoing_limit()
        try:
            startTime = time.time()
            self._setupVdsConnection()
            self._setupRemoteMachineParams()
            self._prepareGuest()

            while self._progress < 100:
                try:
                    with SourceThread.ongoingMigrations:
                        timeout = config.getint(
                            'vars', 'guest_lifecycle_event_reply_timeout')
                        if self.hibernating:
                            self._vm.guestAgent.events.before_hibernation(
                                wait_timeout=timeout)
                        elif self._enableGuestEvents:
                            self._vm.guestAgent.events.before_migration(
                                wait_timeout=timeout)
                        if self._migrationCanceledEvt.is_set():
                            self._raiseAbortError()
                        self.log.debug(
                            "migration semaphore acquired "
                            "after %d seconds", time.time() - startTime)
                        params = {
                            'dst': self._dst,
                            'mode': self._mode,
                            'method': METHOD_ONLINE,
                            'dstparams': self._dstparams,
                            'dstqemu': self._dstqemu,
                        }
                        with self._vm.migration_parameters(params):
                            self._vm.saveState()
                            self._startUnderlyingMigration(time.time())
                            self._finishSuccessfully()
                except libvirt.libvirtError as e:
                    if e.get_error_code() == \
                            libvirt.VIR_ERR_OPERATION_ABORTED:
                        self.status = response.error(
                            'migCancelErr', message='Migration canceled')
                    raise
                except MigrationLimitExceeded:
                    retry_timeout = config.getint(
                        'vars', 'migration_retry_timeout')
                    self.log.debug("Migration destination busy. Initiating "
                                   "retry in %d seconds.", retry_timeout)
                    self._migrationCanceledEvt.wait(retry_timeout)
        except MigrationDestinationSetupError as e:
            self._recover(str(e))
            # we know what happened, no need to dump hollow stack trace
        except Exception as e:
            self._recover(str(e))
            self.log.exception("Failed to migrate")

    def _startUnderlyingMigration(self, startTime):
        if self.hibernating:
            hooks.before_vm_hibernate(self._vm._dom.XMLDesc(0), self._vm.conf)
            fname = self._vm.cif.prepareVolumePath(self._dst)
            try:
                self._vm._dom.save(fname)
            finally:
                self._vm.cif.teardownVolumePath(self._dst)
        else:
            for dev in self._vm._customDevices():
                hooks.before_device_migrate_source(
                    dev._deviceXML, self._vm.conf, dev.custom)
            hooks.before_vm_migrate_source(self._vm._dom.XMLDesc(0),
                                           self._vm.conf)

            # Do not measure the time spent for creating the VM on the
            # destination.  In some cases some expensive operations can cause
            # the migration to get cancelled right after the transfer
            # started.
            destCreateStartTime = time.time()
            result = self._destServer.migrationCreate(self._machineParams,
                                                      self._incomingLimit)
            destCreationTime = time.time() - destCreateStartTime
            startTime += destCreationTime
            self.log.info('Creation of destination VM took: %d seconds',
                          destCreationTime)

            if response.is_error(result):
                self.status = result
                if response.is_error(result, 'migrateLimit'):
                    raise MigrationLimitExceeded()
                else:
                    raise MigrationDestinationSetupError(
                        'migration destination error: ' +
                        result['status']['message'])

            if config.getboolean('vars', 'ssl'):
                transport = 'tls'
            else:
                transport = 'tcp'

            duri = 'qemu+%s://%s/system' % (transport, self.remoteHost)
            if self._vm.conf['_migrationParams']['dstqemu']:
                muri = 'tcp://%s' % \
                    self._vm.conf['_migrationParams']['dstqemu']
            else:
                muri = 'tcp://%s' % self.remoteHost
            self._vm.log.info('starting migration to %s '
                              'with miguri %s', duri, muri)

            self._monitorThread = MonitorThread(
                self._vm, startTime, self._convergence_schedule,
                self._use_convergence_schedule)

            if self._use_convergence_schedule:
                self._perform_with_conv_schedule(duri, muri)
            else:
                self._perform_with_downtime_thread(duri, muri)

            self.log.info("migration took %d seconds to complete",
                          (time.time() - startTime) + destCreationTime)

    def _perform_migration(self, duri, muri):
        if self._vm.hasSpice and self._vm.conf.get('clientIp'):
            SPICE_MIGRATION_HANDOVER_TIME = 120
            self._vm._reviveTicket(SPICE_MIGRATION_HANDOVER_TIME)

        # FIXME: there is still a race here with libvirt,
        # if we call stop() and libvirt migrateToURI3 didn't start
        # we may return migration stop but it will start at libvirt
        # side
        self._preparingMigrationEvt = False
        if not self._migrationCanceledEvt.is_set():
            # TODO: use libvirt constants when bz#1222795 is fixed
            params = {VIR_MIGRATE_PARAM_URI: str(muri),
                      VIR_MIGRATE_PARAM_BANDWIDTH: self._maxBandwidth}
            if self._consoleAddress:
                if self._vm.hasSpice:
                    graphics = 'spice'
                else:
                    graphics = 'vnc'
                params[VIR_MIGRATE_PARAM_GRAPHICS_URI] = str(
                    '%s://%s' % (graphics, self._consoleAddress))

            flags = (libvirt.VIR_MIGRATE_LIVE |
                     libvirt.VIR_MIGRATE_PEER2PEER |
                     (libvirt.VIR_MIGRATE_TUNNELLED
                      if self._tunneled else 0) |
                     (libvirt.VIR_MIGRATE_ABORT_ON_ERROR
                      if self._abortOnError else
                      0) |
                     (libvirt.VIR_MIGRATE_COMPRESSED
                      if self._compressed else 0) |
                     (libvirt.VIR_MIGRATE_AUTO_CONVERGE
                      if self._autoConverge else 0))

            self._vm._dom.migrateToURI3(duri, params, flags)
        else:
            self._raiseAbortError()

    def _perform_with_downtime_thread(self, duri, muri):
        self._vm.log.debug('performing migration with downtime thread')
        self._monitorThread.downtime_thread = DowntimeThread(
            self._vm,
            int(self._downtime),
            config.getint('vars', 'migration_downtime_steps'))

        with utils.running(self._monitorThread):
            self._perform_migration(duri, muri)

        self._monitorThread.join()

    def _perform_with_conv_schedule(self, duri, muri):
        self._vm.log.debug('performing migration with conv schedule')
        with utils.running(self._monitorThread):
            self._perform_migration(duri, muri)
        self._monitorThread.join()

    def set_max_bandwidth(self, bandwidth):
        self._vm.log.debug('setting migration max bandwidth to %d', bandwidth)
        self._maxBandwidth = bandwidth
        self._vm._dom.migrateSetMaxSpeed(bandwidth)

    def stop(self):
        # if it's locked we are before the migrateToURI3()
        # call so no need to abortJob()
        try:
            self._migrationCanceledEvt.set()
            self._vm._dom.abortJob()
        except libvirt.libvirtError:
            if not self._preparingMigrationEvt:
                raise
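

# For reference, a sketch of what the legacy _perform_migration() above ends
# up passing to migrateToURI3() (illustrative values only; the bare
# VIR_MIGRATE_PARAM_* names are assumed to be module-level stand-ins for the
# libvirt constants, as hinted by the bz#1222795 TODO):
#
#     params = {VIR_MIGRATE_PARAM_URI: 'tcp://192.0.2.10',
#               VIR_MIGRATE_PARAM_BANDWIDTH: 32}
#     flags = (libvirt.VIR_MIGRATE_LIVE |
#              libvirt.VIR_MIGRATE_PEER2PEER |
#              libvirt.VIR_MIGRATE_AUTO_CONVERGE)
#     vm._dom.migrateToURI3('qemu+tls://dst.example.com/system',
#                           params, flags)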