def CheckListening(self):
    """Reports whether the daemon has started listening.

    The first call that sees a listening port records the current time in
    C{_ts_listening}, logs the event and invokes the C{ReportListening}
    callback. Every later call returns C{True} without further side effects.

    @rtype: bool
    @return: C{True} once the daemon is listening, C{False} while it is not
        (and the listen timeout has not expired yet)
    @raise _ImportExportError: if no port was reported and the listen timeout
        counted from C{_ts_begin} has expired

    """
    assert self._daemon, "Daemon status missing"

    # Already seen listening on an earlier call; nothing left to do
    if self._ts_listening is not None:
        return True

    listen_port = self._daemon.listen_port

    if listen_port is None:
        # Still waiting for the daemon; give up once the timeout has passed
        if utils.TimeoutExpired(self._ts_begin, self._timeouts.listen):
            raise _ImportExportError("Not listening after %s seconds" %
                                     self._timeouts.listen)
        return False

    # First time a port is seen: remember when and notify the callbacks
    self._ts_listening = time.time()
    logging.debug("Import '%s' on %s is now listening on port %s",
                  self._daemon_name, self.node_uuid, listen_port)
    self._cbs.ReportListening(self, self._private, self._component)

    return True
def CheckConnected(self):
    """Reports whether the daemon has an established connection.

    The first call that sees the daemon connected records the current time in
    C{_ts_connected}, logs the event and invokes the C{ReportConnected}
    callback. Every later call returns C{True} without further side effects.

    @rtype: bool
    @return: C{True} once the daemon is connected, C{False} while it is not
        (and the connect timeout has not expired yet)
    @raise _ImportExportError: if the daemon is not connected and the connect
        timeout, counted from the epoch returned by
        C{_GetConnectedCheckEpoch}, has expired

    """
    assert self._daemon, "Daemon status missing"

    # Already seen connected on an earlier call; nothing left to do
    if self._ts_connected is not None:
        return True

    if not self._daemon.connected:
        # Still waiting for the peer; give up once the timeout has passed
        if utils.TimeoutExpired(self._GetConnectedCheckEpoch(),
                                self._timeouts.connect):
            raise _ImportExportError("Not connected after %s seconds" %
                                     self._timeouts.connect)
        return False

    # First time the connection is seen: remember when and notify callbacks
    self._ts_connected = time.time()

    # TODO: Log remote peer
    logging.debug("%s '%s' on %s is now connected",
                  self.MODE_TEXT, self._daemon_name, self.node_uuid)
    self._cbs.ReportConnected(self, self._private)

    return True
def _CheckProgress(self): """Checks whether a progress update should be reported. """ if ((self._ts_last_progress is None or utils.TimeoutExpired(self._ts_last_progress, self._timeouts.progress)) and self._daemon and self._daemon.progress_mbytes is not None and self._daemon.progress_throughput is not None): self._cbs.ReportProgress(self, self._private) self._ts_last_progress = time.time()
def _SetDaemonData(self, data): """Internal function for updating status daemon data. @type data: L{objects.ImportExportStatus} @param data: Daemon status data """ assert self._ts_begin is not None if not data: if utils.TimeoutExpired(self._ts_begin, self._timeouts.ready): raise _ImportExportError("Didn't become ready after %s seconds" % self._timeouts.ready) return False self._daemon = data return True
def SetDaemonData(self, success, data):
    """Updates daemon status data, keeping track of fetch failures.

    On the first failure of a series the current time is stored in
    C{_ts_last_error}; a successful fetch resets it to C{None} and hands the
    data to L{_SetDaemonData}.

    @type success: bool
    @param success: Whether fetching data was successful or not
    @type data: L{objects.ImportExportStatus}
    @param data: Daemon status data
    @rtype: bool
    @return: C{False} if fetching failed, otherwise the result of
        L{_SetDaemonData}
    @raise _ImportExportError: if fetching failed and the error timeout
        counted from the first failure of the series has expired

    """
    if success:
        # A good fetch ends any ongoing series of errors
        self._ts_last_error = None
        return self._SetDaemonData(data)

    first_failure = self._ts_last_error

    if first_failure is None:
        # Start of a new series of errors
        self._ts_last_error = time.time()
        return False

    if utils.TimeoutExpired(first_failure, self._timeouts.error):
        raise _ImportExportError("Too many errors while updating data")

    return False
def __acquire_unlocked(self, shared, timeout, priority):
    """Acquire the lock in shared or exclusive mode.

    If the lock is free and nobody is queued, it is taken immediately.
    Otherwise a wait condition is queued under the given priority (shared
    acquires of the same priority re-use one condition) and the caller waits
    until that condition is at the top of the queue and the lock can be
    acquired, or until the timeout expires.

    NOTE(review): the name suggests the caller must already hold the
    internal lock protecting this object's state -- confirm against callers.

    @param shared: whether to acquire in shared mode; by default an
        exclusive lock will be acquired
    @param timeout: maximum waiting time before giving up; C{None} means
        wait until the lock has been acquired
    @type priority: integer
    @param priority: Priority for acquiring lock
    @rtype: bool
    @return: C{True} if the lock was acquired; C{False} if it could not be
        acquired immediately and the timeout is shorter than
        C{_LOCK_ACQUIRE_MIN_TIMEOUT}, or if the timeout expired while waiting

    """
    # Presumably raises if the lock has been deleted -- see __check_deleted
    self.__check_deleted()

    # We cannot acquire the lock if we already have it
    assert not self.__is_owned(), ("double acquire() on a non-recursive lock"
                                   " %s" % self.name)

    # Remove empty entries from queue
    self.__find_first_pending_queue()

    # Check whether someone else holds the lock or there are pending acquires.
    if not self.__pending and self.__can_acquire(shared):
        # Apparently not, can acquire lock directly.
        self.__do_acquire(shared)
        return True

    # The lock couldn't be acquired right away, so if a timeout is given and is
    # considered too short, return right away as scheduling a pending
    # acquisition is quite expensive
    if timeout is not None and timeout < _LOCK_ACQUIRE_MIN_TIMEOUT:
        return False

    # Queue of wait conditions for this priority (None if there is none yet)
    prioqueue = self.__pending_by_prio.get(priority, None)

    if shared:
        # Try to re-use condition for shared acquire
        wait_condition = self.__pending_shared.get(priority, None)
        assert (wait_condition is None or
                (wait_condition.shared and wait_condition in prioqueue))
    else:
        wait_condition = None

    if wait_condition is None:
        if prioqueue is None:
            assert priority not in self.__pending_by_prio

            # First waiter at this priority: create its queue and register it
            # both in the heap of pending queues and in the per-priority index
            prioqueue = []
            heapq.heappush(self.__pending, (priority, prioqueue))
            self.__pending_by_prio[priority] = prioqueue

        wait_condition = self.__condition_class(self.__lock, shared)
        prioqueue.append(wait_condition)

        if shared:
            # Keep reference for further shared acquires on same priority. This
            # is better than trying to find it in the list of pending acquires.
            assert priority not in self.__pending_shared
            self.__pending_shared[priority] = wait_condition

    wait_start = self.__time_fn()
    acquired = False

    try:
        # Wait until we become the topmost acquire in the queue or the timeout
        # expires.
        while True:
            if self.__is_on_top(wait_condition) and self.__can_acquire(shared):
                self.__do_acquire(shared)
                acquired = True
                break

            # A lot of code assumes blocking acquires always succeed, therefore
            # we can never return False for a blocking acquire
            if (timeout is not None and
                    utils.TimeoutExpired(wait_start, timeout,
                                         _time_fn=self.__time_fn)):
                break

            # Wait for notification
            # NOTE(review): the full timeout is passed on every iteration, not
            # the time remaining since wait_start, so a notification that does
            # not lead to acquisition may extend the total wait -- confirm
            # whether this is intended.
            wait_condition.wait(timeout)
            self.__check_deleted()
    finally:
        # Remove condition from queue if there are no more waiters
        if not wait_condition.has_waiting():
            prioqueue.remove(wait_condition)
            if wait_condition.shared:
                # Remove from list of shared acquires if it wasn't while
                # releasing (e.g. on lock deletion)
                self.__pending_shared.pop(priority, None)

    return acquired
def testTimeoutExpired(self):
    """Checks C{utils.TimeoutExpired} against an injected clock.

    With an epoch of 100 and a timeout of 300, a clock reading of 500 must be
    reported as expired, while readings of 0, 100 and exactly 400 (epoch plus
    timeout) must not.

    """
    # assertTrue replaces the deprecated assert_ alias, which was removed
    # from unittest.TestCase in Python 3.12
    self.assertTrue(utils.TimeoutExpired(100, 300, _time_fn=lambda: 500))
    self.assertFalse(utils.TimeoutExpired(100, 300, _time_fn=lambda: 0))
    self.assertFalse(utils.TimeoutExpired(100, 300, _time_fn=lambda: 100))
    self.assertFalse(utils.TimeoutExpired(100, 300, _time_fn=lambda: 400))
def _ExecMigration(self):
    """Migrate an instance.

    The migrate is done by:
      - change the disks into dual-master mode
      - wait until disks are fully synchronized again
      - migrate the instance
      - change disks on the new secondary node (the old primary) to secondary
      - wait until disks are fully synchronized
      - change disks into single-master mode

    Before touching the disks, the hypervisor versions of source and target
    node are compared, disk consistency on the target is checked and, if the
    instance uses more memory than is free on the target, its runtime memory
    is reduced (when runtime changes are allowed). After the memory transfer
    both nodes are asked to finalize, and only then is the instance's primary
    node updated in the configuration.

    @raise errors.OpExecError: if the hypervisor versions are considered
        unsafe for migration and C{ignore_hvversions} is not set, a disk is
        degraded on the target node, memory does not fit and ballooning is
        not allowed, fetching migration information fails, the target cannot
        accept the instance, the migration itself fails, or finalization
        fails on either node

    """
    # Check for hypervisor version mismatch and warn the user.
    hvspecs = [(self.instance.hypervisor,
                self.cfg.GetClusterInfo().hvparams[self.instance.hypervisor])]
    nodeinfo = self.rpc.call_node_info(
        [self.source_node_uuid, self.target_node_uuid], None, hvspecs)
    for ninfo in nodeinfo.values():
        ninfo.Raise("Unable to retrieve node information from node '%s'" %
                    ninfo.node)
    # Only the third payload element is used here; it is expected to hold
    # exactly one entry (for the single hypervisor requested in hvspecs)
    (_, _, (src_info, )) = nodeinfo[self.source_node_uuid].payload
    (_, _, (dst_info, )) = nodeinfo[self.target_node_uuid].payload

    # Versions can only be compared if both nodes reported one
    if ((constants.HV_NODEINFO_KEY_VERSION in src_info) and
            (constants.HV_NODEINFO_KEY_VERSION in dst_info)):
        src_version = src_info[constants.HV_NODEINFO_KEY_VERSION]
        dst_version = dst_info[constants.HV_NODEINFO_KEY_VERSION]
        if src_version != dst_version:
            self.feedback_fn("* warning: hypervisor version mismatch between"
                             " source (%s) and target (%s) node" %
                             (src_version, dst_version))
            # Let the hypervisor implementation decide whether this particular
            # combination of versions is acceptable
            hv = hypervisor.GetHypervisorClass(self.instance.hypervisor)
            if hv.VersionsSafeForMigration(src_version, dst_version):
                self.feedback_fn(" migrating from hypervisor version %s to %s should"
                                 " be safe" % (src_version, dst_version))
            else:
                self.feedback_fn(" migrating from hypervisor version %s to %s is"
                                 " likely unsupported" % (src_version, dst_version))
                if self.ignore_hvversions:
                    self.feedback_fn(" continuing anyway (told to ignore version"
                                     " mismatch)")
                else:
                    raise errors.OpExecError("Unsupported migration between hypervisor"
                                             " versions (%s to %s)" %
                                             (src_version, dst_version))

    self.feedback_fn("* checking disk consistency between source and target")
    for (idx, dev) in enumerate(self.cfg.GetInstanceDisks(self.instance.uuid)):
        if not CheckDiskConsistency(self.lu, self.instance, dev,
                                    self.target_node_uuid,
                                    False):
            raise errors.OpExecError("Disk %s is degraded or not fully"
                                     " synchronized on target node,"
                                     " aborting migration" % idx)

    # The instance currently uses more memory than is free on the target:
    # either shrink it to what is available or refuse to migrate
    if self.current_mem > self.tgt_free_mem:
        if not self.allow_runtime_changes:
            raise errors.OpExecError("Memory ballooning not allowed and not enough"
                                     " free memory to fit instance %s on target"
                                     " node %s (have %dMB, need %dMB)" %
                                     (self.instance.name,
                                      self.cfg.GetNodeName(self.target_node_uuid),
                                      self.tgt_free_mem, self.current_mem))
        self.feedback_fn("* setting instance memory to %s" % self.tgt_free_mem)
        rpcres = self.rpc.call_instance_balloon_memory(self.instance.primary_node,
                                                       self.instance,
                                                       self.tgt_free_mem)
        rpcres.Raise("Cannot modify instance runtime memory")

    # First get the migration information from the remote node
    result = self.rpc.call_migration_info(self.source_node_uuid, self.instance)
    msg = result.fail_msg
    if msg:
        log_err = ("Failed fetching source migration information from %s: %s" %
                   (self.cfg.GetNodeName(self.source_node_uuid), msg))
        logging.error(log_err)
        raise errors.OpExecError(log_err)

    # Kept on self as well as in a local; the local is passed to the accept
    # and finalize RPCs below
    self.migration_info = migration_info = result.payload

    disks = self.cfg.GetInstanceDisks(self.instance.uuid)

    self._CloseInstanceDisks(self.target_node_uuid)

    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
        # Then switch the disks to master/master mode
        self._GoStandalone()
        self._GoReconnect(True)
        self._WaitUntilSync()

    self._OpenInstanceDisks(self.source_node_uuid, False)
    self._OpenInstanceDisks(self.target_node_uuid, False)

    self.feedback_fn("* preparing %s to accept the instance" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    result = self.rpc.call_accept_instance(self.target_node_uuid,
                                           self.instance,
                                           migration_info,
                                           self.nodes_ip[self.target_node_uuid])

    msg = result.fail_msg
    if msg:
        # From here on a failure requires aborting the migration and
        # reverting the disk status before reporting the error
        logging.error("Instance pre-migration failed, trying to revert"
                      " disk status: %s", msg)
        self.feedback_fn("Pre-migration failed, aborting")
        self._AbortMigration()
        self._RevertDiskStatus()
        raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
                                 (self.instance.name, msg))

    self.feedback_fn("* migrating instance to %s" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    cluster = self.cfg.GetClusterInfo()
    result = self.rpc.call_instance_migrate(
        self.source_node_uuid, cluster.cluster_name, self.instance,
        self.nodes_ip[self.target_node_uuid], self.live)
    msg = result.fail_msg
    if msg:
        logging.error("Instance migration failed, trying to revert"
                      " disk status: %s", msg)
        self.feedback_fn("Migration failed, aborting")
        self._AbortMigration()
        self._RevertDiskStatus()
        raise errors.OpExecError("Could not migrate instance %s: %s" %
                                 (self.instance.name, msg))

    self.feedback_fn("* starting memory transfer")
    last_feedback = time.time()

    # Instance-level migration capabilities take precedence; the cluster-wide
    # "kvm" hypervisor parameters provide the fallback value
    cluster_migration_caps = \
        cluster.hvparams.get("kvm", {}).get(constants.HV_KVM_MIGRATION_CAPS, "")
    migration_caps = \
        self.instance.hvparams.get(constants.HV_KVM_MIGRATION_CAPS,
                                   cluster_migration_caps)
    # migration_caps is a ':' delimited string, so checking
    # if 'postcopy-ram' is a substring also covers using
    # x-postcopy-ram for QEMU 2.5
    postcopy_enabled = "postcopy-ram" in migration_caps

    # Poll the source node until the memory transfer is no longer active
    while True:
        result = self.rpc.call_instance_get_migration_status(
            self.source_node_uuid, self.instance)
        msg = result.fail_msg
        ms = result.payload   # MigrationStatus instance
        if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
            logging.error("Instance migration failed, trying to revert"
                          " disk status: %s", msg)
            self.feedback_fn("Migration failed, aborting")
            self._AbortMigration()
            self._RevertDiskStatus()
            # The RPC itself succeeded but the hypervisor reported a failed
            # status, so there is no message to pass on
            if not msg:
                msg = "hypervisor returned failure"
            raise errors.OpExecError("Could not migrate instance %s: %s" %
                                     (self.instance.name, msg))

        # Switch to postcopy once the dirty sync count has reached the
        # configured threshold while the migration is still active
        if (postcopy_enabled
                and ms.status == constants.HV_MIGRATION_ACTIVE
                and int(ms.dirty_sync_count) >= self._POSTCOPY_SYNC_COUNT_THRESHOLD):
            self.feedback_fn("* finishing memory transfer with postcopy")
            self.rpc.call_instance_start_postcopy(self.source_node_uuid,
                                                  self.instance)

        # KVM has its own set of statuses that count as "still migrating"
        if self.instance.hypervisor == 'kvm':
            migration_active = \
                ms.status in constants.HV_KVM_MIGRATION_ACTIVE_STATUSES
        else:
            migration_active = \
                ms.status == constants.HV_MIGRATION_ACTIVE
        if not migration_active:
            self.feedback_fn("* memory transfer complete")
            break

        # Report progress at most once per feedback interval, and only if the
        # hypervisor provides transfer figures
        if (utils.TimeoutExpired(last_feedback,
                                 self._MIGRATION_FEEDBACK_INTERVAL) and
                ms.transferred_ram is not None):
            mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
            self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
            last_feedback = time.time()

        time.sleep(self._MIGRATION_POLL_INTERVAL)

    # Always call finalize on both source and target, they should compose
    # a single operation, consisting of (potentially) parallel steps, that
    # should be always attempted/retried together (like in _AbortMigration)
    # without setting any expectations in what order they execute.
    result_src = self.rpc.call_instance_finalize_migration_src(
        self.source_node_uuid, self.instance, True, self.live)

    result_dst = self.rpc.call_instance_finalize_migration_dst(
        self.target_node_uuid, self.instance, migration_info, True)

    # Collect failures from both nodes so a single error reports all of them
    err_msg = []
    if result_src.fail_msg:
        logging.error("Instance migration succeeded, but finalization failed"
                      " on the source node: %s", result_src.fail_msg)
        err_msg.append(self.cfg.GetNodeName(self.source_node_uuid) + ': '
                       + result_src.fail_msg)

    if result_dst.fail_msg:
        logging.error("Instance migration succeeded, but finalization failed"
                      " on the target node: %s", result_dst.fail_msg)
        err_msg.append(self.cfg.GetNodeName(self.target_node_uuid) + ': '
                       + result_dst.fail_msg)

    if err_msg:
        raise errors.OpExecError(
            "Could not finalize instance migration: %s" % ' '.join(err_msg))

    # Update instance location only after finalize completed. This way, if
    # either finalize fails, the config still stores the old primary location,
    # so we can know which instance to delete if we need to (manually) clean up.
    self.cfg.SetInstancePrimaryNode(self.instance.uuid, self.target_node_uuid)
    # Re-read the instance so the local object reflects the new primary node
    self.instance = self.cfg.GetInstanceInfo(self.instance_uuid)

    self._CloseInstanceDisks(self.source_node_uuid)
    disks = self.cfg.GetInstanceDisks(self.instance_uuid)
    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
        self._WaitUntilSync()
        self._GoStandalone()
        self._GoReconnect(False)
        self._WaitUntilSync()
    elif utils.AnyDiskOfType(disks, constants.DTS_EXT_MIRROR):
        self._OpenInstanceDisks(self.target_node_uuid, True)

    # If the instance's disk template is `rbd' or `ext' and there was a
    # successful migration, unmap the device from the source node.
    unmap_types = (constants.DT_RBD, constants.DT_EXT)

    if utils.AnyDiskOfType(disks, unmap_types):
        unmap_disks = [d for d in disks if d.dev_type in unmap_types]
        disks = ExpandCheckDisks(unmap_disks, unmap_disks)
        self.feedback_fn("* unmapping instance's disks %s from %s" %
                         (utils.CommaJoin(d.name for d in unmap_disks),
                          self.cfg.GetNodeName(self.source_node_uuid)))
        for disk in disks:
            result = self.rpc.call_blockdev_shutdown(self.source_node_uuid,
                                                     (disk, self.instance))
            msg = result.fail_msg
            if msg:
                # The migration already succeeded, so a failed unmap is only
                # logged and left for manual cleanup
                logging.error("Migration was successful, but couldn't unmap the"
                              " block device %s on source node %s: %s",
                              disk.iv_name,
                              self.cfg.GetNodeName(self.source_node_uuid), msg)
                logging.error("You need to unmap the device %s manually on %s",
                              disk.iv_name,
                              self.cfg.GetNodeName(self.source_node_uuid))

    self.feedback_fn("* done")
def _ExecMigration(self):
    """Migrate an instance.

    The migrate is done by:
      - change the disks into dual-master mode
      - wait until disks are fully synchronized again
      - migrate the instance
      - change disks on the new secondary node (the old primary) to secondary
      - wait until disks are fully synchronized
      - change disks into single-master mode

    Before touching the disks, the hypervisor versions of source and target
    node are compared (a mismatch only produces a warning), disk consistency
    on the target is checked and, if the instance uses more memory than is
    free on the target, its runtime memory is reduced (when runtime changes
    are allowed).

    @raise errors.OpExecError: if a disk is degraded on the target node,
        memory does not fit and ballooning is not allowed, fetching migration
        information fails, the target cannot accept the instance, the
        migration itself fails, or finalization fails on either node

    """
    # Check for hypervisor version mismatch and warn the user.
    hvspecs = [(self.instance.hypervisor,
                self.cfg.GetClusterInfo().hvparams[self.instance.hypervisor])]
    nodeinfo = self.rpc.call_node_info(
        [self.source_node_uuid, self.target_node_uuid], None, hvspecs)
    for ninfo in nodeinfo.values():
        ninfo.Raise("Unable to retrieve node information from node '%s'" %
                    ninfo.node)
    # Only the third payload element is used here; it is expected to hold
    # exactly one entry (for the single hypervisor requested in hvspecs)
    (_, _, (src_info, )) = nodeinfo[self.source_node_uuid].payload
    (_, _, (dst_info, )) = nodeinfo[self.target_node_uuid].payload

    # Versions can only be compared if both nodes reported one
    if ((constants.HV_NODEINFO_KEY_VERSION in src_info) and
            (constants.HV_NODEINFO_KEY_VERSION in dst_info)):
        src_version = src_info[constants.HV_NODEINFO_KEY_VERSION]
        dst_version = dst_info[constants.HV_NODEINFO_KEY_VERSION]
        if src_version != dst_version:
            self.feedback_fn("* warning: hypervisor version mismatch between"
                             " source (%s) and target (%s) node" %
                             (src_version, dst_version))

    self.feedback_fn("* checking disk consistency between source and target")
    for (idx, dev) in enumerate(self.instance.disks):
        if not CheckDiskConsistency(self.lu, self.instance, dev,
                                    self.target_node_uuid,
                                    False):
            raise errors.OpExecError("Disk %s is degraded or not fully"
                                     " synchronized on target node,"
                                     " aborting migration" % idx)

    # The instance currently uses more memory than is free on the target:
    # either shrink it to what is available or refuse to migrate
    if self.current_mem > self.tgt_free_mem:
        if not self.allow_runtime_changes:
            raise errors.OpExecError("Memory ballooning not allowed and not enough"
                                     " free memory to fit instance %s on target"
                                     " node %s (have %dMB, need %dMB)" %
                                     (self.instance.name,
                                      self.cfg.GetNodeName(self.target_node_uuid),
                                      self.tgt_free_mem, self.current_mem))
        self.feedback_fn("* setting instance memory to %s" % self.tgt_free_mem)
        rpcres = self.rpc.call_instance_balloon_memory(self.instance.primary_node,
                                                       self.instance,
                                                       self.tgt_free_mem)
        rpcres.Raise("Cannot modify instance runtime memory")

    # First get the migration information from the remote node
    result = self.rpc.call_migration_info(self.source_node_uuid, self.instance)
    msg = result.fail_msg
    if msg:
        log_err = ("Failed fetching source migration information from %s: %s" %
                   (self.cfg.GetNodeName(self.source_node_uuid), msg))
        logging.error(log_err)
        raise errors.OpExecError(log_err)

    # Kept on self as well as in a local; the local is passed to the accept
    # and finalize RPCs below
    self.migration_info = migration_info = result.payload

    if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
        # Then switch the disks to master/master mode
        self._EnsureSecondary(self.target_node_uuid)
        self._GoStandalone()
        self._GoReconnect(True)
        self._WaitUntilSync()

    self.feedback_fn("* preparing %s to accept the instance" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    result = self.rpc.call_accept_instance(self.target_node_uuid,
                                           self.instance,
                                           migration_info,
                                           self.nodes_ip[self.target_node_uuid])

    msg = result.fail_msg
    if msg:
        # From here on a failure requires aborting the migration and
        # reverting the disk status before reporting the error
        logging.error("Instance pre-migration failed, trying to revert"
                      " disk status: %s", msg)
        self.feedback_fn("Pre-migration failed, aborting")
        self._AbortMigration()
        self._RevertDiskStatus()
        raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
                                 (self.instance.name, msg))

    self.feedback_fn("* migrating instance to %s" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    cluster = self.cfg.GetClusterInfo()
    result = self.rpc.call_instance_migrate(
        self.source_node_uuid, cluster.cluster_name, self.instance,
        self.nodes_ip[self.target_node_uuid], self.live)
    msg = result.fail_msg
    if msg:
        logging.error("Instance migration failed, trying to revert"
                      " disk status: %s", msg)
        self.feedback_fn("Migration failed, aborting")
        self._AbortMigration()
        self._RevertDiskStatus()
        raise errors.OpExecError("Could not migrate instance %s: %s" %
                                 (self.instance.name, msg))

    self.feedback_fn("* starting memory transfer")
    last_feedback = time.time()

    # Poll the source node until the memory transfer is no longer active
    while True:
        result = self.rpc.call_instance_get_migration_status(
            self.source_node_uuid, self.instance)
        msg = result.fail_msg
        ms = result.payload   # MigrationStatus instance
        if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
            logging.error("Instance migration failed, trying to revert"
                          " disk status: %s", msg)
            self.feedback_fn("Migration failed, aborting")
            self._AbortMigration()
            self._RevertDiskStatus()
            # The RPC itself succeeded but the hypervisor reported a failed
            # status, so there is no message to pass on
            if not msg:
                msg = "hypervisor returned failure"
            raise errors.OpExecError("Could not migrate instance %s: %s" %
                                     (self.instance.name, msg))

        # Any status other than "active" (failures were handled above) ends
        # the polling loop
        if result.payload.status != constants.HV_MIGRATION_ACTIVE:
            self.feedback_fn("* memory transfer complete")
            break

        # Report progress at most once per feedback interval, and only if the
        # hypervisor provides transfer figures
        if (utils.TimeoutExpired(last_feedback,
                                 self._MIGRATION_FEEDBACK_INTERVAL) and
                ms.transferred_ram is not None):
            mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
            self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
            last_feedback = time.time()

        time.sleep(self._MIGRATION_POLL_INTERVAL)

    # Finalize on the source node first; a failure here aborts before the
    # configuration is changed
    result = self.rpc.call_instance_finalize_migration_src(
        self.source_node_uuid, self.instance, True, self.live)
    msg = result.fail_msg
    if msg:
        logging.error("Instance migration succeeded, but finalization failed"
                      " on the source node: %s", msg)
        raise errors.OpExecError("Could not finalize instance migration: %s" %
                                 msg)

    # The target node becomes the instance's primary node
    self.instance.primary_node = self.target_node_uuid

    # distribute new instance config to the other nodes
    self.cfg.Update(self.instance, self.feedback_fn)

    # NOTE(review): the configuration already points at the target node when
    # the target-side finalization runs, so a failure below leaves the new
    # primary recorded.
    result = self.rpc.call_instance_finalize_migration_dst(
        self.target_node_uuid, self.instance, migration_info, True)
    msg = result.fail_msg
    if msg:
        logging.error("Instance migration succeeded, but finalization failed"
                      " on the target node: %s", msg)
        raise errors.OpExecError("Could not finalize instance migration: %s" %
                                 msg)

    if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
        # Demote the old primary and reconnect the disks in single-master mode
        self._EnsureSecondary(self.source_node_uuid)
        self._WaitUntilSync()
        self._GoStandalone()
        self._GoReconnect(False)
        self._WaitUntilSync()

    # If the instance's disk template is `rbd' or `ext' and there was a
    # successful migration, unmap the device from the source node.
    if self.instance.disk_template in (constants.DT_RBD, constants.DT_EXT):
        disks = ExpandCheckDisks(self.instance, self.instance.disks)
        self.feedback_fn("* unmapping instance's disks from %s" %
                         self.cfg.GetNodeName(self.source_node_uuid))
        for disk in disks:
            result = self.rpc.call_blockdev_shutdown(self.source_node_uuid,
                                                     (disk, self.instance))
            msg = result.fail_msg
            if msg:
                # The migration already succeeded, so a failed unmap is only
                # logged and left for manual cleanup
                logging.error("Migration was successful, but couldn't unmap the"
                              " block device %s on source node %s: %s",
                              disk.iv_name,
                              self.cfg.GetNodeName(self.source_node_uuid), msg)
                logging.error("You need to unmap the device %s manually on %s",
                              disk.iv_name,
                              self.cfg.GetNodeName(self.source_node_uuid))

    self.feedback_fn("* done")