Пример #1
0
  def _ExecFailover(self):
    """Failover an instance.

    The failover is done by shutting it down on its present node and
    starting it on the secondary.

    """
    if self.instance.forthcoming:
      self.feedback_fn("Instance is forthcoming, just updating the"
                       "  configuration")
      self.cfg.SetInstancePrimaryNode(self.instance.uuid,
                                      self.target_node_uuid)
      return

    primary_node = self.cfg.GetNodeInfo(self.instance.primary_node)

    source_node_uuid = self.instance.primary_node

    if self.instance.disks_active:
      self.feedback_fn("* checking disk consistency between source and target")
      inst_disks = self.cfg.GetInstanceDisks(self.instance.uuid)
      for (idx, dev) in enumerate(inst_disks):
        # for drbd, these are drbd over lvm
        if not CheckDiskConsistency(self.lu, self.instance, dev,
                                    self.target_node_uuid, False):
          if primary_node.offline:
            self.feedback_fn("Node %s is offline, ignoring degraded disk %s on"
                             " target node %s" %
                             (primary_node.name, idx,
                              self.cfg.GetNodeName(self.target_node_uuid)))
          elif not self.ignore_consistency:
            raise errors.OpExecError("Disk %s is degraded on target node,"
                                     " aborting failover" % idx)
    else:
      self.feedback_fn("* not checking disk consistency as instance is not"
                       " running")

    self.feedback_fn("* shutting down instance on source node")
    logging.info("Shutting down instance %s on node %s",
                 self.instance.name, self.cfg.GetNodeName(source_node_uuid))

    result = self.rpc.call_instance_shutdown(source_node_uuid, self.instance,
                                             self.shutdown_timeout,
                                             self.lu.op.reason)
    msg = result.fail_msg
    if msg:
      if self.ignore_consistency or primary_node.offline:
        self.lu.LogWarning("Could not shutdown instance %s on node %s,"
                           " proceeding anyway; please make sure node"
                           " %s is down; error details: %s",
                           self.instance.name,
                           self.cfg.GetNodeName(source_node_uuid),
                           self.cfg.GetNodeName(source_node_uuid), msg)
      else:
        raise errors.OpExecError("Could not shutdown instance %s on"
                                 " node %s: %s" %
                                 (self.instance.name,
                                  self.cfg.GetNodeName(source_node_uuid), msg))

    disk_template = self.cfg.GetInstanceDiskTemplate(self.instance.uuid)
    if disk_template in constants.DTS_EXT_MIRROR:
      self._CloseInstanceDisks(source_node_uuid)

    self.feedback_fn("* deactivating the instance's disks on source node")
    if not ShutdownInstanceDisks(self.lu, self.instance, ignore_primary=True):
      raise errors.OpExecError("Can't shut down the instance's disks")

    self.cfg.SetInstancePrimaryNode(self.instance.uuid, self.target_node_uuid)
    self.instance = self.cfg.GetInstanceInfo(self.instance_uuid)

    # Only start the instance if it's marked as up
    if self.instance.admin_state == constants.ADMINST_UP:
      self.feedback_fn("* activating the instance's disks on target node %s" %
                       self.cfg.GetNodeName(self.target_node_uuid))
      logging.info("Starting instance %s on node %s", self.instance.name,
                   self.cfg.GetNodeName(self.target_node_uuid))

      disks_ok, _, _ = AssembleInstanceDisks(self.lu, self.instance,
                                             ignore_secondaries=True)
      if not disks_ok:
        ShutdownInstanceDisks(self.lu, self.instance)
        raise errors.OpExecError("Can't activate the instance's disks")

      self.feedback_fn("* starting the instance on the target node %s" %
                       self.cfg.GetNodeName(self.target_node_uuid))
      result = self.rpc.call_instance_start(self.target_node_uuid,
                                            (self.instance, None, None), False,
                                            self.lu.op.reason)
      msg = result.fail_msg
      if msg:
        ShutdownInstanceDisks(self.lu, self.instance)
        raise errors.OpExecError("Could not start instance %s on node %s: %s" %
                                 (self.instance.name,
                                  self.cfg.GetNodeName(self.target_node_uuid),
                                  msg))
Пример #2
0
  def _ExecMigration(self):
    """Migrate an instance.

    The migrate is done by:
      - change the disks into dual-master mode
      - wait until disks are fully synchronized again
      - migrate the instance
      - change disks on the new secondary node (the old primary) to secondary
      - wait until disks are fully synchronized
      - change disks into single-master mode

    """
    # Check for hypervisor version mismatch and warn the user.
    hvspecs = [(self.instance.hypervisor,
                self.cfg.GetClusterInfo().hvparams[self.instance.hypervisor])]
    nodeinfo = self.rpc.call_node_info(
                 [self.source_node_uuid, self.target_node_uuid], None, hvspecs)
    for ninfo in nodeinfo.values():
      ninfo.Raise("Unable to retrieve node information from node '%s'" %
                  ninfo.node)
    (_, _, (src_info, )) = nodeinfo[self.source_node_uuid].payload
    (_, _, (dst_info, )) = nodeinfo[self.target_node_uuid].payload

    if ((constants.HV_NODEINFO_KEY_VERSION in src_info) and
        (constants.HV_NODEINFO_KEY_VERSION in dst_info)):
      src_version = src_info[constants.HV_NODEINFO_KEY_VERSION]
      dst_version = dst_info[constants.HV_NODEINFO_KEY_VERSION]
      if src_version != dst_version:
        self.feedback_fn("* warning: hypervisor version mismatch between"
                         " source (%s) and target (%s) node" %
                         (src_version, dst_version))
        hv = hypervisor.GetHypervisorClass(self.instance.hypervisor)
        if hv.VersionsSafeForMigration(src_version, dst_version):
          self.feedback_fn("  migrating from hypervisor version %s to %s should"
                           " be safe" % (src_version, dst_version))
        else:
          self.feedback_fn("  migrating from hypervisor version %s to %s is"
                           " likely unsupported" % (src_version, dst_version))
          if self.ignore_hvversions:
            self.feedback_fn("  continuing anyway (told to ignore version"
                             " mismatch)")
          else:
            raise errors.OpExecError("Unsupported migration between hypervisor"
                                     " versions (%s to %s)" %
                                     (src_version, dst_version))

    self.feedback_fn("* checking disk consistency between source and target")
    for (idx, dev) in enumerate(self.cfg.GetInstanceDisks(self.instance.uuid)):
      if not CheckDiskConsistency(self.lu, self.instance, dev,
                                  self.target_node_uuid,
                                  False):
        raise errors.OpExecError("Disk %s is degraded or not fully"
                                 " synchronized on target node,"
                                 " aborting migration" % idx)

    if self.current_mem > self.tgt_free_mem:
      if not self.allow_runtime_changes:
        raise errors.OpExecError("Memory ballooning not allowed and not enough"
                                 " free memory to fit instance %s on target"
                                 " node %s (have %dMB, need %dMB)" %
                                 (self.instance.name,
                                  self.cfg.GetNodeName(self.target_node_uuid),
                                  self.tgt_free_mem, self.current_mem))
      self.feedback_fn("* setting instance memory to %s" % self.tgt_free_mem)
      rpcres = self.rpc.call_instance_balloon_memory(self.instance.primary_node,
                                                     self.instance,
                                                     self.tgt_free_mem)
      rpcres.Raise("Cannot modify instance runtime memory")

    # First get the migration information from the remote node
    result = self.rpc.call_migration_info(self.source_node_uuid, self.instance)
    msg = result.fail_msg
    if msg:
      log_err = ("Failed fetching source migration information from %s: %s" %
                 (self.cfg.GetNodeName(self.source_node_uuid), msg))
      logging.error(log_err)
      raise errors.OpExecError(log_err)

    self.migration_info = migration_info = result.payload

    disks = self.cfg.GetInstanceDisks(self.instance.uuid)

    self._CloseInstanceDisks(self.target_node_uuid)

    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
      # Then switch the disks to master/master mode
      self._GoStandalone()
      self._GoReconnect(True)
      self._WaitUntilSync()

    self._OpenInstanceDisks(self.source_node_uuid, False)
    self._OpenInstanceDisks(self.target_node_uuid, False)

    self.feedback_fn("* preparing %s to accept the instance" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    result = self.rpc.call_accept_instance(self.target_node_uuid,
                                           self.instance,
                                           migration_info,
                                           self.nodes_ip[self.target_node_uuid])

    msg = result.fail_msg
    if msg:
      logging.error("Instance pre-migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Pre-migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
                               (self.instance.name, msg))

    self.feedback_fn("* migrating instance to %s" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    cluster = self.cfg.GetClusterInfo()
    result = self.rpc.call_instance_migrate(
        self.source_node_uuid, cluster.cluster_name, self.instance,
        self.nodes_ip[self.target_node_uuid], self.live)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not migrate instance %s: %s" %
                               (self.instance.name, msg))

    self.feedback_fn("* starting memory transfer")
    last_feedback = time.time()

    cluster_migration_caps = \
      cluster.hvparams.get("kvm", {}).get(constants.HV_KVM_MIGRATION_CAPS, "")
    migration_caps = \
      self.instance.hvparams.get(constants.HV_KVM_MIGRATION_CAPS,
                                 cluster_migration_caps)
    # migration_caps is a ':' delimited string, so checking
    # if 'postcopy-ram' is a substring also covers using
    # x-postcopy-ram for QEMU 2.5
    postcopy_enabled = "postcopy-ram" in migration_caps
    while True:
      result = self.rpc.call_instance_get_migration_status(
                 self.source_node_uuid, self.instance)
      msg = result.fail_msg
      ms = result.payload   # MigrationStatus instance
      if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
        logging.error("Instance migration failed, trying to revert"
                      " disk status: %s", msg)
        self.feedback_fn("Migration failed, aborting")
        self._AbortMigration()
        self._RevertDiskStatus()
        if not msg:
          msg = "hypervisor returned failure"
        raise errors.OpExecError("Could not migrate instance %s: %s" %
                                 (self.instance.name, msg))

      if (postcopy_enabled
          and ms.status == constants.HV_MIGRATION_ACTIVE
          and int(ms.dirty_sync_count) >= self._POSTCOPY_SYNC_COUNT_THRESHOLD):
        self.feedback_fn("* finishing memory transfer with postcopy")
        self.rpc.call_instance_start_postcopy(self.source_node_uuid,
                                              self.instance)

      if self.instance.hypervisor == 'kvm':
        migration_active = \
          ms.status in constants.HV_KVM_MIGRATION_ACTIVE_STATUSES
      else:
        migration_active = \
          ms.status == constants.HV_MIGRATION_ACTIVE
      if not migration_active:
        self.feedback_fn("* memory transfer complete")
        break

      if (utils.TimeoutExpired(last_feedback,
                               self._MIGRATION_FEEDBACK_INTERVAL) and
          ms.transferred_ram is not None):
        mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
        self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
        last_feedback = time.time()

      time.sleep(self._MIGRATION_POLL_INTERVAL)

    # Always call finalize on both source and target, they should compose
    # a single operation, consisting of (potentially) parallel steps, that
    # should be always attempted/retried together (like in _AbortMigration)
    # without setting any expecetations in what order they execute.
    result_src = self.rpc.call_instance_finalize_migration_src(
        self.source_node_uuid, self.instance, True, self.live)

    result_dst = self.rpc.call_instance_finalize_migration_dst(
        self.target_node_uuid, self.instance, migration_info, True)

    err_msg = []
    if result_src.fail_msg:
      logging.error("Instance migration succeeded, but finalization failed"
                    " on the source node: %s", result_src.fail_msg)
      err_msg.append(self.cfg.GetNodeName(self.source_node_uuid) + ': '
                     + result_src.fail_msg)

    if result_dst.fail_msg:
      logging.error("Instance migration succeeded, but finalization failed"
                    " on the target node: %s", result_dst.fail_msg)
      err_msg.append(self.cfg.GetNodeName(self.target_node_uuid) + ': '
                     + result_dst.fail_msg)

    if err_msg:
      raise errors.OpExecError(
          "Could not finalize instance migration: %s" % ' '.join(err_msg))

    # Update instance location only after finalize completed. This way, if
    # either finalize fails, the config still stores the old primary location,
    # so we can know which instance to delete if we need to (manually) clean up.
    self.cfg.SetInstancePrimaryNode(self.instance.uuid, self.target_node_uuid)
    self.instance = self.cfg.GetInstanceInfo(self.instance_uuid)

    self._CloseInstanceDisks(self.source_node_uuid)
    disks = self.cfg.GetInstanceDisks(self.instance_uuid)
    if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
      self._WaitUntilSync()
      self._GoStandalone()
      self._GoReconnect(False)
      self._WaitUntilSync()
    elif utils.AnyDiskOfType(disks, constants.DTS_EXT_MIRROR):
      self._OpenInstanceDisks(self.target_node_uuid, True)

    # If the instance's disk template is `rbd' or `ext' and there was a
    # successful migration, unmap the device from the source node.
    unmap_types = (constants.DT_RBD, constants.DT_EXT)

    if utils.AnyDiskOfType(disks, unmap_types):
      unmap_disks = [d for d in disks if d.dev_type in unmap_types]
      disks = ExpandCheckDisks(unmap_disks, unmap_disks)
      self.feedback_fn("* unmapping instance's disks %s from %s" %
                       (utils.CommaJoin(d.name for d in unmap_disks),
                        self.cfg.GetNodeName(self.source_node_uuid)))
      for disk in disks:
        result = self.rpc.call_blockdev_shutdown(self.source_node_uuid,
                                                 (disk, self.instance))
        msg = result.fail_msg
        if msg:
          logging.error("Migration was successful, but couldn't unmap the"
                        " block device %s on source node %s: %s",
                        disk.iv_name,
                        self.cfg.GetNodeName(self.source_node_uuid), msg)
          logging.error("You need to unmap the device %s manually on %s",
                        disk.iv_name,
                        self.cfg.GetNodeName(self.source_node_uuid))

    self.feedback_fn("* done")
Пример #3
0
  def _ExecMigration(self):
    """Migrate an instance.

    The migrate is done by:
      - change the disks into dual-master mode
      - wait until disks are fully synchronized again
      - migrate the instance
      - change disks on the new secondary node (the old primary) to secondary
      - wait until disks are fully synchronized
      - change disks into single-master mode

    """
    # Check for hypervisor version mismatch and warn the user.
    hvspecs = [(self.instance.hypervisor,
                self.cfg.GetClusterInfo().hvparams[self.instance.hypervisor])]
    nodeinfo = self.rpc.call_node_info(
                 [self.source_node_uuid, self.target_node_uuid], None, hvspecs)
    for ninfo in nodeinfo.values():
      ninfo.Raise("Unable to retrieve node information from node '%s'" %
                  ninfo.node)
    (_, _, (src_info, )) = nodeinfo[self.source_node_uuid].payload
    (_, _, (dst_info, )) = nodeinfo[self.target_node_uuid].payload

    if ((constants.HV_NODEINFO_KEY_VERSION in src_info) and
        (constants.HV_NODEINFO_KEY_VERSION in dst_info)):
      src_version = src_info[constants.HV_NODEINFO_KEY_VERSION]
      dst_version = dst_info[constants.HV_NODEINFO_KEY_VERSION]
      if src_version != dst_version:
        self.feedback_fn("* warning: hypervisor version mismatch between"
                         " source (%s) and target (%s) node" %
                         (src_version, dst_version))

    self.feedback_fn("* checking disk consistency between source and target")
    for (idx, dev) in enumerate(self.instance.disks):
      if not CheckDiskConsistency(self.lu, self.instance, dev,
                                  self.target_node_uuid,
                                  False):
        raise errors.OpExecError("Disk %s is degraded or not fully"
                                 " synchronized on target node,"
                                 " aborting migration" % idx)

    if self.current_mem > self.tgt_free_mem:
      if not self.allow_runtime_changes:
        raise errors.OpExecError("Memory ballooning not allowed and not enough"
                                 " free memory to fit instance %s on target"
                                 " node %s (have %dMB, need %dMB)" %
                                 (self.instance.name,
                                  self.cfg.GetNodeName(self.target_node_uuid),
                                  self.tgt_free_mem, self.current_mem))
      self.feedback_fn("* setting instance memory to %s" % self.tgt_free_mem)
      rpcres = self.rpc.call_instance_balloon_memory(self.instance.primary_node,
                                                     self.instance,
                                                     self.tgt_free_mem)
      rpcres.Raise("Cannot modify instance runtime memory")

    # First get the migration information from the remote node
    result = self.rpc.call_migration_info(self.source_node_uuid, self.instance)
    msg = result.fail_msg
    if msg:
      log_err = ("Failed fetching source migration information from %s: %s" %
                 (self.cfg.GetNodeName(self.source_node_uuid), msg))
      logging.error(log_err)
      raise errors.OpExecError(log_err)

    self.migration_info = migration_info = result.payload

    if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
      # Then switch the disks to master/master mode
      self._EnsureSecondary(self.target_node_uuid)
      self._GoStandalone()
      self._GoReconnect(True)
      self._WaitUntilSync()

    self.feedback_fn("* preparing %s to accept the instance" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    result = self.rpc.call_accept_instance(self.target_node_uuid,
                                           self.instance,
                                           migration_info,
                                           self.nodes_ip[self.target_node_uuid])

    msg = result.fail_msg
    if msg:
      logging.error("Instance pre-migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Pre-migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not pre-migrate instance %s: %s" %
                               (self.instance.name, msg))

    self.feedback_fn("* migrating instance to %s" %
                     self.cfg.GetNodeName(self.target_node_uuid))
    cluster = self.cfg.GetClusterInfo()
    result = self.rpc.call_instance_migrate(
        self.source_node_uuid, cluster.cluster_name, self.instance,
        self.nodes_ip[self.target_node_uuid], self.live)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration failed, trying to revert"
                    " disk status: %s", msg)
      self.feedback_fn("Migration failed, aborting")
      self._AbortMigration()
      self._RevertDiskStatus()
      raise errors.OpExecError("Could not migrate instance %s: %s" %
                               (self.instance.name, msg))

    self.feedback_fn("* starting memory transfer")
    last_feedback = time.time()
    while True:
      result = self.rpc.call_instance_get_migration_status(
                 self.source_node_uuid, self.instance)
      msg = result.fail_msg
      ms = result.payload   # MigrationStatus instance
      if msg or (ms.status in constants.HV_MIGRATION_FAILED_STATUSES):
        logging.error("Instance migration failed, trying to revert"
                      " disk status: %s", msg)
        self.feedback_fn("Migration failed, aborting")
        self._AbortMigration()
        self._RevertDiskStatus()
        if not msg:
          msg = "hypervisor returned failure"
        raise errors.OpExecError("Could not migrate instance %s: %s" %
                                 (self.instance.name, msg))

      if result.payload.status != constants.HV_MIGRATION_ACTIVE:
        self.feedback_fn("* memory transfer complete")
        break

      if (utils.TimeoutExpired(last_feedback,
                               self._MIGRATION_FEEDBACK_INTERVAL) and
          ms.transferred_ram is not None):
        mem_progress = 100 * float(ms.transferred_ram) / float(ms.total_ram)
        self.feedback_fn("* memory transfer progress: %.2f %%" % mem_progress)
        last_feedback = time.time()

      time.sleep(self._MIGRATION_POLL_INTERVAL)

    result = self.rpc.call_instance_finalize_migration_src(
               self.source_node_uuid, self.instance, True, self.live)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration succeeded, but finalization failed"
                    " on the source node: %s", msg)
      raise errors.OpExecError("Could not finalize instance migration: %s" %
                               msg)

    self.instance.primary_node = self.target_node_uuid

    # distribute new instance config to the other nodes
    self.cfg.Update(self.instance, self.feedback_fn)

    result = self.rpc.call_instance_finalize_migration_dst(
               self.target_node_uuid, self.instance, migration_info, True)
    msg = result.fail_msg
    if msg:
      logging.error("Instance migration succeeded, but finalization failed"
                    " on the target node: %s", msg)
      raise errors.OpExecError("Could not finalize instance migration: %s" %
                               msg)

    if self.instance.disk_template not in constants.DTS_EXT_MIRROR:
      self._EnsureSecondary(self.source_node_uuid)
      self._WaitUntilSync()
      self._GoStandalone()
      self._GoReconnect(False)
      self._WaitUntilSync()

    # If the instance's disk template is `rbd' or `ext' and there was a
    # successful migration, unmap the device from the source node.
    if self.instance.disk_template in (constants.DT_RBD, constants.DT_EXT):
      disks = ExpandCheckDisks(self.instance, self.instance.disks)
      self.feedback_fn("* unmapping instance's disks from %s" %
                       self.cfg.GetNodeName(self.source_node_uuid))
      for disk in disks:
        result = self.rpc.call_blockdev_shutdown(self.source_node_uuid,
                                                 (disk, self.instance))
        msg = result.fail_msg
        if msg:
          logging.error("Migration was successful, but couldn't unmap the"
                        " block device %s on source node %s: %s",
                        disk.iv_name,
                        self.cfg.GetNodeName(self.source_node_uuid), msg)
          logging.error("You need to unmap the device %s manually on %s",
                        disk.iv_name,
                        self.cfg.GetNodeName(self.source_node_uuid))

    self.feedback_fn("* done")