Example #1
  def CheckListening(self):
    """Checks whether the daemon is listening.

    @rtype: bool
    @return: Whether the daemon is listening

    """
    assert self._daemon, "Daemon status missing"

    if self._ts_listening is not None:
      return True

    port = self._daemon.listen_port
    if port is not None:
      self._ts_listening = time.time()

      logging.debug("Import '%s' on %s is now listening on port %s",
                    self._daemon_name, self.node_uuid, port)

      self._cbs.ReportListening(self, self._private, self._component)

      return True

    if utils.TimeoutExpired(self._ts_begin, self._timeouts.listen):
      raise _ImportExportError("Not listening after %s seconds" %
                               self._timeouts.listen)

    return False
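
CheckListening above (and CheckConnected in the next example) follows a poll-until-ready pattern: it returns True once the awaited state is reached, returns False while still waiting, and raises _ImportExportError once the timeout budget is exhausted. A caller might drive such a method roughly as follows; the driver loop, the wait_until name, and the poll interval are assumptions for illustration, not Ganeti code.

import time

def wait_until(check_fn, poll_interval=1.0):
  """Polls check_fn() until it returns True.

  check_fn is expected to return False while waiting and to raise once its
  internal timeout expires, so this loop terminates either way.
  """
  while not check_fn():
    time.sleep(poll_interval)

# Hypothetical usage with a status object like the one in these examples:
#   wait_until(status.CheckListening)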
Example #2
  def CheckConnected(self):
    """Checks whether the daemon is connected.

    @rtype: bool
    @return: Whether the daemon is connected

    """
    assert self._daemon, "Daemon status missing"

    if self._ts_connected is not None:
      return True

    if self._daemon.connected:
      self._ts_connected = time.time()

      # TODO: Log remote peer
      logging.debug("%s '%s' on %s is now connected",
                    self.MODE_TEXT, self._daemon_name, self.node_uuid)

      self._cbs.ReportConnected(self, self._private)

      return True

    if utils.TimeoutExpired(self._GetConnectedCheckEpoch(),
                            self._timeouts.connect):
      raise _ImportExportError("Not connected after %s seconds" %
                               self._timeouts.connect)

    return False
Example #3
  def _CheckProgress(self):
    """Checks whether a progress update should be reported.

    """
    if ((self._ts_last_progress is None or
        utils.TimeoutExpired(self._ts_last_progress,
                             self._timeouts.progress)) and
        self._daemon and
        self._daemon.progress_mbytes is not None and
        self._daemon.progress_throughput is not None):
      self._cbs.ReportProgress(self, self._private)
      self._ts_last_progress = time.time()
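
_CheckProgress combines two guards: enough time must have passed since the last report (or there has been no report yet), and the daemon must actually have progress figures. The time-based half is a general rate-limiting idiom; here is a standalone sketch of it, with the same strict comparison as utils.TimeoutExpired (the RateLimiter class is illustrative, not part of Ganeti).

import time

class RateLimiter:
  """Allows an action at most once per `interval` seconds."""

  def __init__(self, interval):
    self._interval = interval
    self._last = None

  def should_fire(self, _time_fn=time.time):
    # Fire on the first call, then again only once strictly more than
    # `interval` seconds have elapsed since the last firing
    now = _time_fn()
    if self._last is None or now > self._last + self._interval:
      self._last = now
      return True
    return False

# Hypothetical usage:
#   progress_limiter = RateLimiter(60)
#   if progress_limiter.should_fire():
#     report_progress()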
Example #4
  def _SetDaemonData(self, data):
    """Internal function for updating status daemon data.

    @type data: L{objects.ImportExportStatus}
    @param data: Daemon status data

    """
    assert self._ts_begin is not None

    if not data:
      if utils.TimeoutExpired(self._ts_begin, self._timeouts.ready):
        raise _ImportExportError("Didn't become ready after %s seconds" %
                                 self._timeouts.ready)

      return False

    self._daemon = data

    return True
Example #5
  def SetDaemonData(self, success, data):
    """Updates daemon status data.

    @type success: bool
    @param success: Whether fetching data was successful or not
    @type data: L{objects.ImportExportStatus}
    @param data: Daemon status data

    """
    if not success:
      if self._ts_last_error is None:
        self._ts_last_error = time.time()

      elif utils.TimeoutExpired(self._ts_last_error, self._timeouts.error):
        raise _ImportExportError("Too many errors while updating data")

      return False

    self._ts_last_error = None

    return self._SetDaemonData(data)
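
Together with _SetDaemonData in the previous example, SetDaemonData implements tolerance for transient failures: the first failed fetch merely starts an error clock, the operation aborts only if failures keep occurring past self._timeouts.error, and any success resets the clock. The same pattern in isolation (the ErrorBudget name and RuntimeError are illustrative assumptions, not Ganeti code):

import time

class ErrorBudget:
  """Raises only once errors have persisted for `max_error_age` seconds."""

  def __init__(self, max_error_age):
    self._max_error_age = max_error_age
    self._first_error = None

  def record(self, success):
    if success:
      # Any success resets the clock
      self._first_error = None
      return
    now = time.time()
    if self._first_error is None:
      # First failure: start the clock, but don't raise yet
      self._first_error = now
    elif now > self._first_error + self._max_error_age:
      raise RuntimeError("Too many errors while updating data")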
Example #6
    def __acquire_unlocked(self, shared, timeout, priority):
        """Acquire a shared lock.

    @param shared: whether to acquire in shared mode; by default an
        exclusive lock will be acquired
    @param timeout: maximum waiting time before giving up
    @type priority: integer
    @param priority: Priority for acquiring lock

    """
        self.__check_deleted()

        # We cannot acquire the lock if we already have it
        assert not self.__is_owned(), (
            "double acquire() on a non-recursive lock"
            " %s" % self.name)

        # Remove empty entries from queue
        self.__find_first_pending_queue()

        # Check whether someone else holds the lock or there are pending acquires.
        if not self.__pending and self.__can_acquire(shared):
            # Apparently not, can acquire lock directly.
            self.__do_acquire(shared)
            return True

        # The lock couldn't be acquired right away. If the given timeout is
        # too short to be worth the (quite expensive) cost of scheduling a
        # pending acquisition, return right away.
        if timeout is not None and timeout < _LOCK_ACQUIRE_MIN_TIMEOUT:
            return False

        prioqueue = self.__pending_by_prio.get(priority, None)

        if shared:
            # Try to re-use condition for shared acquire
            wait_condition = self.__pending_shared.get(priority, None)
            assert (wait_condition is None
                    or (wait_condition.shared and wait_condition in prioqueue))
        else:
            wait_condition = None

        if wait_condition is None:
            if prioqueue is None:
                assert priority not in self.__pending_by_prio

                prioqueue = []
                heapq.heappush(self.__pending, (priority, prioqueue))
                self.__pending_by_prio[priority] = prioqueue

            wait_condition = self.__condition_class(self.__lock, shared)
            prioqueue.append(wait_condition)

            if shared:
                # Keep reference for further shared acquires on same priority. This is
                # better than trying to find it in the list of pending acquires.
                assert priority not in self.__pending_shared
                self.__pending_shared[priority] = wait_condition

        wait_start = self.__time_fn()
        acquired = False

        try:
            # Wait until we become the topmost acquire in the queue or the timeout
            # expires.
            while True:
                if self.__is_on_top(wait_condition) and self.__can_acquire(
                        shared):
                    self.__do_acquire(shared)
                    acquired = True
                    break

                # A lot of code assumes blocking acquires always succeed, therefore we
                # can never return False for a blocking acquire
                if (timeout is not None and utils.TimeoutExpired(
                        wait_start, timeout, _time_fn=self.__time_fn)):
                    break

                # Wait for notification
                wait_condition.wait(timeout)
                self.__check_deleted()
        finally:
            # Remove condition from queue if there are no more waiters
            if not wait_condition.has_waiting():
                prioqueue.remove(wait_condition)
                if wait_condition.shared:
                    # Remove from the list of shared acquires if it wasn't
                    # already removed while releasing (e.g. on lock deletion)
                    self.__pending_shared.pop(priority, None)

        return acquired
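
The pending-acquire bookkeeping above maintains a heap of (priority, prioqueue) pairs plus a dict for O(1) lookup of an existing queue by priority: the heap yields the numerically smallest (i.e. highest) priority first, while each prioqueue keeps waiters of equal priority in FIFO order. A standalone toy version of that structure (illustrative only, not Ganeti code):

import heapq

pending = []          # heap of (priority, prioqueue) pairs
pending_by_prio = {}  # priority -> prioqueue, mirrors __pending_by_prio

def enqueue(priority, waiter):
  prioqueue = pending_by_prio.get(priority)
  if prioqueue is None:
    # First waiter at this priority: create the queue and push it once
    prioqueue = []
    heapq.heappush(pending, (priority, prioqueue))
    pending_by_prio[priority] = prioqueue
  prioqueue.append(waiter)

enqueue(1, "w1")
enqueue(0, "w2")
enqueue(1, "w3")

# The numerically smallest priority is served first; waiters of equal
# priority remain in arrival order.
print(pending[0])  # (0, ['w2'])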
Example #7
  def testTimeoutExpired(self):
    self.assert_(utils.TimeoutExpired(100, 300, _time_fn=lambda: 500))
    self.assertFalse(utils.TimeoutExpired(100, 300, _time_fn=lambda: 0))
    self.assertFalse(utils.TimeoutExpired(100, 300, _time_fn=lambda: 100))
    self.assertFalse(utils.TimeoutExpired(100, 300, _time_fn=lambda: 400))
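
These tests pin down the helper's contract: with epoch=100 and timeout=300, expiry is reported only when the clock reads strictly more than 400. A minimal implementation consistent with the assertions above would be the following sketch (inferred from the tests; see Ganeti's utils module for the authoritative definition):

import time

def TimeoutExpired(epoch, timeout, _time_fn=time.time):
  """Checks whether `timeout` seconds have passed since `epoch`.

  The comparison is strict: at exactly epoch + timeout (400 in the tests
  above) the timeout has not yet expired.
  """
  return _time_fn() > (epoch + timeout)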
Example #8
  def _ExecMigration(self):
    """Migrate an instance.

    The migration is done in the following steps:
      - change the disks into dual-master mode
      - wait until disks are fully synchronized again
      - migrate the instance
      - change disks on the new secondary node (the old primary) to secondary
      - wait until disks are fully synchronized
      - change disks into single-master mode

    """
    # Check for hypervisor version mismatch and warn the user.
    hvspecs = [(self.instance.hypervisor,
                self.cfg.GetClusterInfo().hvparams[self.instance.hypervisor])]
    nodeinfo = self.rpc.call_node_info(
                 [self.source_node_uuid, self.target_node_uuid], None, hvspecs)
    for ninfo in nodeinfo.values():
      ninfo.Raise("Unable to retrieve node information from node '%s'" %
                  ninfo.node)
    (_, _, (src_info, )) = nodeinfo[self.source_node_uuid].payload
    (_, _, (dst_info, )) = nodeinfo[self.target_node_uuid].payload

    if ((constants.HV_NODEINFO_KEY_VERSION in src_info) and
        (constants.HV_NODEINFO_KEY_VERSION in dst_info)):
      src_version = src_info[constants.HV_NODEINFO_KEY_VERSION]
      dst_version = dst_info[constants.HV_NODEINFO_KEY_VERSION]
      if src_version != dst_version:
        self.feedback_fn("* warning: hypervisor version mismatch between"
                         " source (%s) and target (%s) node" %
                         (src_version, dst_version))
        hv = hypervisor.GetHypervisorClass(self.instance.hypervisor)
        if hv.VersionsSafeForMigration(src_version, dst_version):
          self.feedback_fn("  migrating from hypervisor version %s to %s should"
                           " be safe" % (src_version, dst_version))
        else:
          self.feedback_fn("  migrating from hypervisor version %s to %s is"
                           " likely unsupported" % (src_version, dst_version))
          if self.ignore_hvversions:
            self.feedback_fn("  continuing anyway (told to ignore version"
                             " mismatch)")
          else:
            raise errors.OpExecError("Unsupported migration between hypervisor"
                                     " versions (%s to %s)" %
                                     (src_version, dst_version))

    self.feedback_fn("* checking disk consistency between source and target")
    for (idx, dev) in enumerate(self.cfg.GetInstanceDisks(self.instance.uuid)):
      if not CheckDiskConsistency(self.lu, self.instance, dev,
                                  self.target_node_uuid,
                                  False):
        raise errors.OpExecError("Disk %s is degraded or not fully"
                                 " synchronized on target node,"
                                 " aborting migration" % idx)

    if self.current_mem > self.tgt_free_mem:
      if not self.allow_runtime_changes:
        raise errors.OpExecError("Memory ballooning not allowed and not enough"
                                 " free memory to fit instance %s on target"
                                 " node %s (have %dMB, need %dMB)" %
                                 (self.instance.name,
                                  self.cfg.GetNodeName(self.target_node_uuid),
                                  self.tgt_free_mem, self.current_mem))
      self.feedback_fn("* setting instance memory to %s" % self.tgt_free_mem)
      rpcres = self.rpc.call_instance_balloon_memory(self.instance.primary_node,
                                                     self.instance,
                                                     self.tgt_free_mem)
      rpcres.Raise("Cannot modify instance runtime memory")

    # First get the migration information from the remote node
    result = self.rpc.call_migration_info(self.source_node_uuid, self.instance)
    msg = result.fail_msg
    if msg:
      log_err = ("Failed fetching source migration information from %s: %s" %
                 (self.cfg.GetNodeName(self.source_node_uuid), msg))
      logging.error(log_err)
      raise errors.OpExecError(log_err)

    self.migration_info = migration_info = result.payload

    disks = self.cfg.GetInstanceDisks(self.instance.uuid)

    self._CloseInstanceDisks(self.target_node_uuid)

    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
      # Then switch the disks to master/master mode
      self._GoStandalone()
      self._GoReconnect(True)
      self._WaitUntilSync()

    self._OpenInstanceDisks(self.source_node_uuid, False)
    self._OpenInstanceDisks(self.target_node_uuid, False)

    self.feedback_fn("* preparing %s to accept the instance" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    result = self.rpc.call_accept_instance(self.target_node_uuid,
                                           self.instance,
                                           migration_info,
                                           self.nodes_ip[self.target_node_uuid])

    msg = result.fail_msg
    if msg:
      logging.error("Instance pre-migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Pre-migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
                               (self.instance.name, msg))

    self.feedback_fn("* migrating instance to %s" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    cluster = self.cfg.GetClusterInfo()
    result = self.rpc.call_instance_migrate(
        self.source_node_uuid, cluster.cluster_name, self.instance,
        self.nodes_ip[self.target_node_uuid], self.live)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not migrate instance %s: %s" %
                               (self.instance.name, msg))

    self.feedback_fn("* starting memory transfer")
    last_feedback = time.time()

    cluster_migration_caps = \
      cluster.hvparams.get("kvm", {}).get(constants.HV_KVM_MIGRATION_CAPS, "")
    migration_caps = \
      self.instance.hvparams.get(constants.HV_KVM_MIGRATION_CAPS,
                                 cluster_migration_caps)
    # migration_caps is a ':' delimited string, so checking
    # if 'postcopy-ram' is a substring also covers using
    # x-postcopy-ram for QEMU 2.5
    postcopy_enabled = "postcopy-ram" in migration_caps
    while True:
      result = self.rpc.call_instance_get_migration_status(
                 self.source_node_uuid, self.instance)
      msg = result.fail_msg
      ms = result.payload   # MigrationStatus instance
      if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
        logging.error("Instance migration failed, trying to revert"
                      " disk status: %s", msg)
        self.feedback_fn("Migration failed, aborting")
        self._AbortMigration()
        self._RevertDiskStatus()
        if not msg:
          msg = "hypervisor returned failure"
        raise errors.OpExecError("Could not migrate instance %s: %s" %
                                 (self.instance.name, msg))

      if (postcopy_enabled
          and ms.status == constants.HV_MIGRATION_ACTIVE
          and int(ms.dirty_sync_count) >= self._POSTCOPY_SYNC_COUNT_THRESHOLD):
        self.feedback_fn("* finishing memory transfer with postcopy")
        self.rpc.call_instance_start_postcopy(self.source_node_uuid,
                                              self.instance)

      if self.instance.hypervisor == 'kvm':
        migration_active = \
          ms.status in constants.HV_KVM_MIGRATION_ACTIVE_STATUSES
      else:
        migration_active = \
          ms.status == constants.HV_MIGRATION_ACTIVE
      if not migration_active:
        self.feedback_fn("* memory transfer complete")
        break

      if (utils.TimeoutExpired(last_feedback,
                               self._MIGRATION_FEEDBACK_INTERVAL) and
          ms.transferred_ram is not None):
        mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
        self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
        last_feedback = time.time()

      time.sleep(self._MIGRATION_POLL_INTERVAL)

    # Always call finalize on both source and target; they compose a single
    # operation, consisting of (potentially) parallel steps, that should
    # always be attempted/retried together (like in _AbortMigration) without
    # setting any expectations about the order in which they execute.
    result_src = self.rpc.call_instance_finalize_migration_src(
        self.source_node_uuid, self.instance, True, self.live)

    result_dst = self.rpc.call_instance_finalize_migration_dst(
        self.target_node_uuid, self.instance, migration_info, True)

    err_msg = []
    if result_src.fail_msg:
      logging.error("Instance migration succeeded, but finalization failed"
                    " on the source node: %s", result_src.fail_msg)
      err_msg.append(self.cfg.GetNodeName(self.source_node_uuid) + ': '
                     + result_src.fail_msg)

    if result_dst.fail_msg:
      logging.error("Instance migration succeeded, but finalization failed"
                    " on the target node: %s", result_dst.fail_msg)
      err_msg.append(self.cfg.GetNodeName(self.target_node_uuid) + ': '
                     + result_dst.fail_msg)

    if err_msg:
      raise errors.OpExecError(
          "Could not finalize instance migration: %s" % ' '.join(err_msg))

    # Update instance location only after finalize completed. This way, if
    # either finalize fails, the config still stores the old primary location,
    # so we can know which instance to delete if we need to (manually) clean up.
    self.cfg.SetInstancePrimaryNode(self.instance.uuid, self.target_node_uuid)
    self.instance = self.cfg.GetInstanceInfo(self.instance_uuid)

    self._CloseInstanceDisks(self.source_node_uuid)
    disks = self.cfg.GetInstanceDisks(self.instance_uuid)
    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
      self._WaitUntilSync()
      self._GoStandalone()
      self._GoReconnect(False)
      self._WaitUntilSync()
    elif utils.AnyDiskOfType(disks, constants.DTS_EXT_MIRROR):
      self._OpenInstanceDisks(self.target_node_uuid, True)

    # If the instance's disk template is `rbd' or `ext' and there was a
    # successful migration, unmap the device from the source node.
    unmap_types = (constants.DT_RBD, constants.DT_EXT)

    if utils.AnyDiskOfType(disks, unmap_types):
      unmap_disks = [d for d in disks if d.dev_type in unmap_types]
      disks = ExpandCheckDisks(unmap_disks, unmap_disks)
      self.feedback_fn("* unmapping instance's disks %s from %s" %
                       (utils.CommaJoin(d.name for d in unmap_disks),
                        self.cfg.GetNodeName(self.source_node_uuid)))
      for disk in disks:
        result = self.rpc.call_blockdev_shutdown(self.source_node_uuid,
                                                 (disk, self.instance))
        msg = result.fail_msg
        if msg:
          logging.error("Migration was successful, but couldn't unmap the"
                        " block device %s on source node %s: %s",
                        disk.iv_name,
                        self.cfg.GetNodeName(self.source_node_uuid), msg)
          logging.error("You need to unmap the device %s manually on %s",
                        disk.iv_name,
                        self.cfg.GetNodeName(self.source_node_uuid))

    self.feedback_fn("* done")
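
Stripped of the hypervisor-specific branches, the memory-transfer loop above reduces to a poll/feedback skeleton: query the migration status, abort on failure, report completion, throttle progress messages via TimeoutExpired, and sleep between polls. Roughly (the function and parameter names and the intervals below are hypothetical, not Ganeti API):

import time

FEEDBACK_INTERVAL = 10  # seconds between progress messages (assumed value)
POLL_INTERVAL = 2       # seconds between status polls (assumed value)

def poll_migration(get_status, is_done, is_failed, feedback_fn,
                   timeout_expired):
  last_feedback = time.time()
  while True:
    status = get_status()
    if is_failed(status):
      raise RuntimeError("migration failed: %s" % status)
    if is_done(status):
      feedback_fn("* memory transfer complete")
      return status
    # Throttle user feedback independently of the poll rate
    if timeout_expired(last_feedback, FEEDBACK_INTERVAL):
      feedback_fn("* still transferring: %s" % status)
      last_feedback = time.time()
    time.sleep(POLL_INTERVAL)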
Example #9
  def _ExecMigration(self):
    """Migrate an instance.

    The migration is done in the following steps:
      - change the disks into dual-master mode
      - wait until disks are fully synchronized again
      - migrate the instance
      - change disks on the new secondary node (the old primary) to secondary
      - wait until disks are fully synchronized
      - change disks into single-master mode

    """
    # Check for hypervisor version mismatch and warn the user.
    hvspecs = [(self.instance.hypervisor,
                self.cfg.GetClusterInfo().hvparams[self.instance.hypervisor])]
    nodeinfo = self.rpc.call_node_info(
                 [self.source_node_uuid, self.target_node_uuid], None, hvspecs)
    for ninfo in nodeinfo.values():
      ninfo.Raise("Unable to retrieve node information from node '%s'" %
                  ninfo.node)
    (_, _, (src_info, )) = nodeinfo[self.source_node_uuid].payload
    (_, _, (dst_info, )) = nodeinfo[self.target_node_uuid].payload

    if ((constants.HV_NODEINFO_KEY_VERSION in src_info) and
        (constants.HV_NODEINFO_KEY_VERSION in dst_info)):
      src_version = src_info[constants.HV_NODEINFO_KEY_VERSION]
      dst_version = dst_info[constants.HV_NODEINFO_KEY_VERSION]
      if src_version != dst_version:
        self.feedback_fn("* warning: hypervisor version mismatch between"
                         " source (%s) and target (%s) node" %
                         (src_version, dst_version))

    self.feedback_fn("* checking disk consistency between source and target")
    for (idx, dev) in enumerate(self.instance.disks):
      if not CheckDiskConsistency(self.lu, self.instance, dev,
                                  self.target_node_uuid,
                                  False):
        raise errors.OpExecError("Disk %s is degraded or not fully"
                                 " synchronized on target node,"
                                 " aborting migration" % idx)

    if self.current_mem > self.tgt_free_mem:
      if not self.allow_runtime_changes:
        raise errors.OpExecError("Memory ballooning not allowed and not enough"
                                 " free memory to fit instance %s on target"
                                 " node %s (have %dMB, need %dMB)" %
                                 (self.instance.name,
                                  self.cfg.GetNodeName(self.target_node_uuid),
                                  self.tgt_free_mem, self.current_mem))
      self.feedback_fn("* setting instance memory to %s" % self.tgt_free_mem)
      rpcres = self.rpc.call_instance_balloon_memory(self.instance.primary_node,
                                                     self.instance,
                                                     self.tgt_free_mem)
      rpcres.Raise("Cannot modify instance runtime memory")

    # First get the migration information from the remote node
    result = self.rpc.call_migration_info(self.source_node_uuid, self.instance)
    msg = result.fail_msg
    if msg:
      log_err = ("Failed fetching source migration information from %s: %s" %
                 (self.cfg.GetNodeName(self.source_node_uuid), msg))
      logging.error(log_err)
      raise errors.OpExecError(log_err)

    self.migration_info = migration_info = result.payload

    if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
      # Then switch the disks to master/master mode
      self._EnsureSecondary(self.target_node_uuid)
      self._GoStandalone()
      self._GoReconnect(True)
      self._WaitUntilSync()

    self.feedback_fn("* preparing %s to accept the instance" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    result = self.rpc.call_accept_instance(self.target_node_uuid,
                                           self.instance,
                                           migration_info,
                                           self.nodes_ip[self.target_node_uuid])

    msg = result.fail_msg
    if msg:
      logging.error("Instance pre-migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Pre-migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
                               (self.instance.name, msg))

    self.feedback_fn("* migrating instance to %s" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    cluster = self.cfg.GetClusterInfo()
    result = self.rpc.call_instance_migrate(
        self.source_node_uuid, cluster.cluster_name, self.instance,
        self.nodes_ip[self.target_node_uuid], self.live)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not migrate instance %s: %s" %
                               (self.instance.name, msg))

    self.feedback_fn("* starting memory transfer")
    last_feedback = time.time()
    while True:
      result = self.rpc.call_instance_get_migration_status(
                 self.source_node_uuid, self.instance)
      msg = result.fail_msg
      ms = result.payload   # MigrationStatus instance
      if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
        logging.error("Instance migration failed, trying to revert"
                      " disk status: %s", msg)
        self.feedback_fn("Migration failed, aborting")
        self._AbortMigration()
        self._RevertDiskStatus()
        if not msg:
          msg = "hypervisor returned failure"
        raise errors.OpExecError("Could not migrate instance %s: %s" %
                                 (self.instance.name, msg))

      if result.payload.status != constants.HV_MIGRATION_ACTIVE:
        self.feedback_fn("* memory transfer complete")
        break

      if (utils.TimeoutExpired(last_feedback,
                               self._MIGRATION_FEEDBACK_INTERVAL) and
          ms.transferred_ram is not None):
        mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
        self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
        last_feedback = time.time()

      time.sleep(self._MIGRATION_POLL_INTERVAL)

    result = self.rpc.call_instance_finalize_migration_src(
               self.source_node_uuid, self.instance, True, self.live)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration succeeded, but finalization failed"
                    " on the source node: %s", msg)
      raise errors.OpExecError("Could not finalize instance migration: %s" %
                               msg)

    self.instance.primary_node = self.target_node_uuid

    # distribute new instance config to the other nodes
    self.cfg.Update(self.instance, self.feedback_fn)

    result = self.rpc.call_instance_finalize_migration_dst(
               self.target_node_uuid, self.instance, migration_info, True)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration succeeded, but finalization failed"
                    " on the target node: %s", msg)
      raise errors.OpExecError("Could not finalize instance migration: %s" %
                               msg)

    if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
      self._EnsureSecondary(self.source_node_uuid)
      self._WaitUntilSync()
      self._GoStandalone()
      self._GoReconnect(False)
      self._WaitUntilSync()

    # If the instance's disk template is `rbd' or `ext' and there was a
    # successful migration, unmap the device from the source node.
    if self.instance.disk_template in (constants.DT_RBD, constants.DT_EXT):
      disks = ExpandCheckDisks(self.instance, self.instance.disks)
      self.feedback_fn("* unmapping instance's disks from %s" %
                       self.cfg.GetNodeName(self.source_node_uuid))
      for disk in disks:
        result = self.rpc.call_blockdev_shutdown(self.source_node_uuid,
                                                 (disk, self.instance))
        msg = result.fail_msg
        if msg:
          logging.error("Migration was successful, but couldn't unmap the"
                        " block device %s on source node %s: %s",
                        disk.iv_name,
                        self.cfg.GetNodeName(self.source_node_uuid), msg)
          logging.error("You need to unmap the device %s manually on %s",
                        disk.iv_name,
                        self.cfg.GetNodeName(self.source_node_uuid))

    self.feedback_fn("* done")