Example #1
    def prereq_runtime_cluster_is_ready_fix(cluster):
        """
    Attempts to bring the cluster to a ready state.

    Raises:
      CurieTestException on failure.
    """
        if not ScenarioUtil.prereq_dependency_has_oob_data(cluster.metadata()):
            raise CurieTestException(
                "Cannot attempt fix without OoB data for cluster")

        unready_nodes = cluster.get_unready_nodes()
        power_on_nodes = []
        for node in unready_nodes:
            if not node.is_powered_on():
                log.info("Powering on node '%s'", node.node_id())
                node.power_on()
                power_on_nodes.append(node)

        if power_on_nodes and not CurieUtil.wait_for(
                lambda: not cluster.get_powered_off_nodes(power_on_nodes,
                                                          sync_with_oob=True),
                "all nodes to be powered on",
                timeout_secs=1200,
                poll_secs=5):
            raise CurieTestException("Timed out waiting for nodes to power on")
        log.info("All nodes are now powered on")

        to_power_on_cvms = []
        for cvm in [vm for vm in cluster.vms() if vm.is_cvm()]:
            if not cvm.is_powered_on():
                to_power_on_cvms.append(cvm)
        log.info("Powering on CVMs: %s",
                 ", ".join([cvm.vm_name() for cvm in to_power_on_cvms]))
        try:
            cluster.power_on_vms(to_power_on_cvms)
        except Exception as exc:
            raise CurieTestException("Failed to power on CVMs: %s" % exc)
        log.info("Powered on all CVMs")

        if not CurieUtil.wait_for(partial(
                ScenarioUtil.prereq_runtime_cluster_is_ready, cluster),
                                  "all nodes to be ready",
                                  timeout_secs=1200,
                                  poll_secs=5):
            raise CurieTestException(
                "Timed out waiting for cluster to be ready")
        log.info("Cluster is now in a ready state")
Example #2
    def power_off(self, sync_management_state=True):
        """
    Powers off the node using the out-of-band management interface specified
    in the cluster's metadata.

    Args:
      sync_management_state (bool): If True, wait until the management
        software detects that the power state is off. This is True by default
        to prevent other management server methods that require power to be
        on from failing unexpectedly.

    Raises:
      CurieTestException if no suitable metadata exists, CurieException on
      all other errors.
    """
        log.debug("Powering off node '%s'", self._node_id)
        if not self.power_management_util.power_off():
            raise CurieException(
                CurieError.kInternalError,
                "Failed to power off node '%s'" % self._node_id)
        # If 'sync_management_state', wait until the management server state is
        # synced with the hardware's state.
        if sync_management_state:
            timeout_secs = 40 * 60
            powered_off = CurieUtil.wait_for(
                lambda: not self.is_powered_on_soft(sync_with_oob=True),
                "management server power state to sync to off for node: %s" %
                self.node_id(),
                timeout_secs,
                poll_secs=5)
            if not powered_off:
                raise CurieException(
                    CurieError.kInternalError,
                    "Failed to sync management server power state after 300s")
Example #3
 def wait_for_vms_accessible(vms, timeout_secs):
     """
 Wait until all the specified VMs are accessible to run guest OS commands.
 Raises CurieTestException if this takes longer than 'timeout_secs'.
 """
     t1 = time.time()
     t2 = -1
     for xx, vm in enumerate(vms):
         log.info("Waiting for VM %d/%d (%s) to become accessible", xx + 1,
                  len(vms), vm.vm_name())
         if t2 >= 0:
             wait_for_timeout_secs = timeout_secs - (t2 - t1)
         else:
             wait_for_timeout_secs = timeout_secs
         if wait_for_timeout_secs < 0:
             error_msg = "Timeout waiting for VMs to become accessible"
             raise CurieTestException(error_msg)
         if vm.vm_ip() is None:
             error_msg = "IP address not set on Vm object %s" % vm.vm_name()
             raise CurieTestException(error_msg)
         if not CurieUtil.is_ipv4_address(vm.vm_ip()):
             error_msg = "Non-IPv4 address %s on VM %s" % (vm.vm_ip(),
                                                           vm.vm_name())
             raise CurieTestException(error_msg)
         msg = "waiting for VM %s (%s) to become accessible" % \
           (vm.vm_name(), vm.vm_ip())
         if not CurieUtil.wait_for(vm.is_accessible, msg,
                                   wait_for_timeout_secs):
             error_msg = "Timeout waiting for VMs to become accessible"
             raise CurieTestException(error_msg)
         log.info("VM %d/%d (%s) is accessible", xx + 1, len(vms),
                  vm.vm_name())
         t2 = time.time()
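
Note that all VMs share a single time budget: after each VM, the elapsed time is subtracted from timeout_secs, so the total wall-clock wait stays bounded regardless of how many VMs are passed in. The same idea can be written against an absolute deadline; the snippet below is an illustrative rewrite using the same objects, not the original helper.

    # Deadline-based rewrite (illustrative sketch only).
    deadline = time.time() + timeout_secs
    for vm in vms:
        remaining = deadline - time.time()
        msg = "VM %s (%s) to become accessible" % (vm.vm_name(), vm.vm_ip())
        if remaining <= 0 or not CurieUtil.wait_for(vm.is_accessible, msg,
                                                    remaining):
            raise CurieTestException(
                "Timeout waiting for VMs to become accessible")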
Example #4
    def prereq_runtime_node_power_fix(cluster):
        """
    Attempt to boot any powered-off nodes in 'cluster'. Block until all nodes
    are powered on.

    NB: Being powered on does not guarantee that a node is ready. Further
    vendor-specific prereqs verify required services, etc.

    Raises:
      CurieTestException on error or timeout.
    """
        to_power_on_nodes = [
            node for node in cluster.nodes() if not node.is_powered_on()
        ]

        log.info("Attempting to power on the following nodes: %s",
                 ", ".join([node.node_id() for node in to_power_on_nodes]))
        for node in to_power_on_nodes:
            node.power_on()

        def nodes_powered_on():
            for node in to_power_on_nodes:
                if not node.is_powered_on():
                    return False
            return True

        if not CurieUtil.wait_for(
                nodes_powered_on, "nodes to be powered on", 600, poll_secs=5):
            raise CurieTestException("Unable to power on nodes prior to test")
Example #5
    def sync_power_state_for_nodes(self, nodes, timeout_secs=(40 * 60)):
        """See 'Cluster.sync_power_state_for_nodes' for documentation."""
        log.info("Synchronizing power state for nodes '%s'",
                 ", ".join([node.node_id() for node in nodes]))
        with self._open_vcenter_connection() as vcenter:
            vim_cluster = self._lookup_vim_cluster(vcenter)
            node_power_state_map = self.get_power_state_for_nodes(nodes)
            oob_powered_on, oob_powered_off = \
                self.__verify_mgmt_oob_power_states_match(nodes,
                                                          node_power_state_map)

            def _done_syncing_nodes():
                node_power_state_map = self.get_power_state_for_nodes(nodes)
                oob_powered_on, oob_powered_off = \
                  self.__verify_mgmt_oob_power_states_match(nodes,
                                                            node_power_state_map)
                to_sync_curie_nodes = oob_powered_on + oob_powered_off
                if not to_sync_curie_nodes:
                    return True
                # Reconnect might fail, as hosts may be powered off, but this may
                # force vCenter to refresh their power states.
                to_sync_vim_hosts = vcenter.lookup_hosts(
                    vim_cluster,
                    [node.node_id() for node in to_sync_curie_nodes])
                vcenter.reconnect_hosts(to_sync_vim_hosts)
                return False

            # Ignore the boolean returned by 'wait_for'. Whether or not this
            # succeeds, we want to refresh the values for 'oob_powered_on',
            # 'oob_powered_off' to provide more detailed logging.
            CurieUtil.wait_for(_done_syncing_nodes,
                               "vCenter to sync state for mismatched hosts",
                               timeout_secs=timeout_secs,
                               poll_secs=5)

            node_power_state_map = self.get_power_state_for_nodes(nodes)
            oob_powered_on, oob_powered_off = \
                self.__verify_mgmt_oob_power_states_match(nodes,
                                                          node_power_state_map)
            unsynced_curie_nodes = oob_powered_on + oob_powered_off

            if unsynced_curie_nodes:
                raise CurieTestException(
                    "Unable to sync vCenter power states for %s" % ", ".join(
                        [node.node_id() for node in unsynced_curie_nodes]))

            return node_power_state_map
Example #6
 def wait_for_cmd(self,
                  cmd_id,
                  timeout_secs,
                  poll_secs=30,
                  desired_state=CmdStatus.kSucceeded):
     """
 Wait up to 'timeout_secs' for the command with ID 'cmd_id' to reach the
 state 'desired_state'. Returns a CmdStatus protobuf for the command on
 success, else raises CurieTestException if either a timeout occurs or the
 command has reached a terminal state that is different from the desired
 state.
 """
     cmd_status = CurieUtil.wait_for(
         lambda: self.check_cmd(cmd_id, desired_state),
         "command %s" % cmd_id,
         timeout_secs,
         poll_secs=poll_secs)
     if cmd_status is not None:
         return cmd_status
     else:
         raise CurieTestException("Timeout waiting for command '%s'" %
                                  cmd_id)
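
Because CurieUtil.wait_for returns the predicate's value on success, a successful wait yields whatever check_cmd returned (a CmdStatus protobuf), while a timeout yields None and is converted into a CurieTestException. A hypothetical call site, with a placeholder object and command ID:

    # Hypothetical usage; 'agent' and the command ID are placeholders.
    status = agent.wait_for_cmd("cmd-1234", timeout_secs=300, poll_secs=10)
    log.info("Command reached its desired state: %s", status)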
Example #7
    Returns:
      (bool) True on success, else False.
    """
    success = True
    try:
      self.send_racadm_command("serveraction powerup")
    except CurieException:
      log.exception("Power On failed")
      success = False

    if async or not success:
      return success

    return CurieUtil.wait_for(
      self.is_powered_on,
      "IPMI at '%s' to report node powered on" % self.host,
      timeout_secs=600,
      poll_secs=5)

  def power_off(self, async=False):
    """
    Powers off node associated with iDRAC at 'self.host'.

    Args:
      async (bool): Optional. If True, return immediately after command
        succeeds, don't block until power state has changed to off.

    Returns:
      (bool) True on success, else False.
    """
    success = True
Example #8
    # NB: IPMI power ops are not necessarily idempotent, so it's necessary to
    # check the power state before issuing the command. This still does not
    # guarantee that we're safe from a potential race.
    if self.is_powered_on():
      return True

    try:
      self.__execute_command_with_retries(["power", "on"])
    except CurieException:
      log.exception("Exception in __execute_command_with_retries")
      return False

    if async:
      return True
    return CurieUtil.wait_for(
      self.is_powered_on,
      "node '%s' to power on" % self.__flags["host"]._value,
      timeout_secs=600, poll_secs=5)

  def power_off(self, async=False):
    """
    Powers off the node associated with 'self.__flags["host"]'.

    Args:
      async (bool): Optional. If False, block until power state is off.

    Returns:
      (bool): True on success, else False.
    """
    # NB: IPMI power ops are not necessarily idempotent, so it's necessary to
    # check the power state before issuing the command. This still does not
    # guarantee that we're safe from a potential race.
Example #9
    """
        if nodes is None:
            nodes = self.nodes()
        nodes_being_powered_on = []
        for node in nodes:
            if not node.is_powered_on():
                log.info("Powering on node %s", node.node_id())
                node.power_on()
                nodes_being_powered_on.append(node)
        # Wait for the nodes to become ready.
        if nodes_being_powered_on and not async:
            timeout_secs = timeout_mins * 60
            func = partial(self.check_nodes_ready, nodes_being_powered_on)
            nodes_ready = CurieUtil.wait_for(func,
                                             "%s to be ready" %
                                             nodes_being_powered_on,
                                             timeout_secs,
                                             poll_secs=5)
            if not nodes_ready:
                raise CurieException(
                    CurieError.kTimeout,
                    "Cluster %s not ready after %d minutes" %
                    (self._name, timeout_mins))

    def _curie_metric_to_metric_name(self, curie_metric):
        if self.__CURIE_METRIC_NAME_MAP__ is None:
            raise NotImplementedError("Subclasses must define "
                                      "__CURIE_METRIC_NAME_MAP__")
        metric_name = self.__CURIE_METRIC_NAME_MAP__.get(
            MetricsUtil.metric_name(curie_metric))
        if metric_name is None:
Example #10
    def prereq_runtime_vm_storage_is_ready_fix(cluster):
        """
    Attempt to make curie test VM storage available on all nodes.

    Raises:
      CurieTestException on error or timeout.
    """
        metadata = cluster.metadata()
        if metadata.cluster_hypervisor_info.HasField("esx_info"):
            CHECK(
                metadata.cluster_management_server_info.HasField(
                    "vcenter_info"))
            vcenter_info = metadata.cluster_management_server_info.vcenter_info
            datastore_name = vcenter_info.vcenter_datastore_name

            def datastore_visible():
                try:
                    ScenarioUtil.prereq_runtime_vm_storage_is_ready(cluster)
                    return True
                except CurieTestException:
                    pass
            msg = "datastore %s visible on all %s nodes" % \
              (datastore_name, cluster.name())
            # Refresh datastores state on all nodes to try and make the datastore
            # visible from vCenter's perspective.
            log.info("Refreshing datastores on all %s nodes", cluster.name())
            cluster.refresh_datastores()
            if CurieUtil.wait_for(datastore_visible, msg, 60):
                return
            cluster_software_info = metadata.cluster_software_info
            if cluster_software_info.HasField("nutanix_info"):
                client = NutanixRestApiClient.from_proto(
                    cluster_software_info.nutanix_info)
                container_name = None
                for item in client.datastores_get():
                    if item["datastoreName"] == datastore_name:
                        container_name = item["containerName"]
                        break
                if container_name is None:
                    log.warning(
                        "Datastore %s not mounted on any %s nodes, assuming "
                        "container name is the same as the desired datastore "
                        "name", datastore_name, cluster.name())
                    # Assume that the desired datastore has the same name as an existing
                    # container name.
                    container_name = datastore_name
                # Remount the datastore to try and make the datastore visible.
                log.info(
                    "Unmounting and mounting datastore %s (container %s) on %s",
                    datastore_name, container_name, cluster.name())
                try:
                    client.datastores_delete(datastore_name, verify=True)
                except CurieException as ex:
                    if ex.error_code != CurieError.kInvalidParameter:
                        raise
                    # If Prism views the datastore as unmounted, kInvalidParameter is
                    # returned so continue to try and mount the datastore on all nodes.
                client.datastores_create(container_name,
                                         datastore_name=datastore_name)
                cluster.refresh_datastores()
                if not CurieUtil.wait_for(datastore_visible, msg, 60):
                    raise CurieTestException(
                        "Timeout waiting for datastore %s for "
                        "VM storage to become visible on %s" %
                        (datastore_name, cluster.name()))
            elif cluster_software_info.HasField("vsan_info"):
                raise CurieTestException(
                    "VSAN datastore %s not mounted on all %s "
                    "nodes" % (datastore_name, cluster.name()))
            elif cluster_software_info.HasField("generic_info"):
                raise CurieTestException(
                    "Datastore %s not mounted on all %s nodes" %
                    (datastore_name, cluster.name()))
            else:
                raise ValueError("Unknown cluster software info, metadata %s" %
                                 metadata)
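
A plausible driver for the prereq_*_fix helpers shown on this page (an assumption, not taken from the source): verify the prerequisite first and fall back to the corresponding fix routine only when the check raises.

    # Hypothetical call site for the fix helpers above.
    try:
        ScenarioUtil.prereq_runtime_vm_storage_is_ready(cluster)
    except CurieTestException:
        log.warning("Curie VM storage not ready; attempting fix")
        ScenarioUtil.prereq_runtime_vm_storage_is_ready_fix(cluster)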