def get_goldimage_path(self, image_name, format_str=FORMAT_VMDK,
                       arch=ARCH_X86_64, auto_convert=True):
    """
    Find an image file based on the name, format, and architecture.

    If the image does not exist in the requested format, 'auto_convert' is
    set, and a QCOW2 formatted version of the image exists, attempt to
    convert the QCOW2 image to the requested format.

    Args:
      image_name (str): The name of the image to get.
      format_str (str): A format to get the image in. Valid options include
        raw, qcow2, vmdk or vhdx.
      arch (str): The target architecture of the image. Valid options include
        x86_64 or ppc64le.
      auto_convert (bool): Whether or not an image should be automatically
        converted from an existing QCOW2 image to the desired format.

    Returns:
      str: Full path to image.

    Raises:
      CurieException: If the image doesn't exist or could not be created.
    """
    filename = self.get_goldimage_filename(image_name, format_str, arch)
    qcow2_filename = self.get_goldimage_filename(
        image_name, self.FORMAT_QCOW2, arch)
    qcow2_filepath = os.path.join(self.images_root, qcow2_filename)
    full_path = os.path.join(self.images_root, filename)

    if os.path.exists(full_path):
        return full_path

    if not os.path.exists(qcow2_filepath):
        raise CurieException(
            CurieError.kInternalError,
            "Goldimage %s does not exist. The QCOW2 image also does not "
            "exist in %s." % (filename, self.images_root))

    if not auto_convert:
        raise CurieException(
            CurieError.kInternalError,
            "Goldimage %s does not exist in %s. The QCOW2 image does exist, "
            "however auto-conversion was not attempted." %
            (filename, self.images_root))

    # Replace only the trailing extension. Splitting on the first "." would
    # truncate the path whenever a directory or the image name itself
    # contains a dot (e.g. "/opt/x.y/ubuntu16.04-x86_64.qcow2").
    dest_image_path = os.path.splitext(qcow2_filepath)[0] + "." + format_str
    rv = self.convert_image_format(qcow2_filepath, dest_image_path,
                                   format_str)
    # A truthy return value from the conversion is treated as failure.
    if rv:
        raise CurieException(
            CurieError.kInternalError,
            "Error attempting to convert image %s from QCOW2 to %s" %
            (qcow2_filepath, format_str))
    return full_path
def __api_cmd_status(self, arg):
    """
    Handle a CmdStatus request and return the serialized CmdStatusRet.

    The command status is parsed from 'status.bin' in the command's
    directory. When 'arg.include_output' is set and the command is no longer
    running, the contents of 'stdout.txt' and 'stderr.txt' are attached if
    those files exist; a warning is logged for each missing file.

    Args:
      arg: Request proto; 'cmd_id' and 'include_output' are read from it.

    Returns:
      Serialized CmdStatusRet proto.

    Raises:
      CurieException<kInvalidParameter>: If the request body is empty or the
        command ID is unknown.
    """
    if flask.request.data == "":
        raise CurieException(CurieError.kInvalidParameter, "Empty request")
    cmd_id = arg.cmd_id
    cmd_dir = self.cmd_dir(cmd_id)
    ret = charon_agent_interface_pb2.CmdStatusRet()
    with self.__lock:
        cmd_state = self.__cmd_map.get(cmd_id)
        if cmd_state is None:
            raise CurieException(CurieError.kInvalidParameter,
                                 "Command %s not found" % cmd_id)

        # The status file holds a serialized proto, so read it as binary and
        # close the handle deterministically rather than relying on GC.
        status_path = os.path.join(cmd_dir, "status.bin")
        with open(status_path, "rb") as status_file:
            ret.cmd_status.ParseFromString(status_file.read())

        if arg.include_output and ret.cmd_status.state != CmdStatus.kRunning:
            stdout_path = os.path.join(cmd_dir, "stdout.txt")
            stderr_path = os.path.join(cmd_dir, "stderr.txt")
            # The exit status is only used to enrich the warnings below.
            if ret.cmd_status.HasField("exit_status"):
                exit_status = ret.cmd_status.exit_status
            else:
                log.warning("No exit status for command %s", cmd_id)
                exit_status = None

            if os.path.exists(stdout_path):
                with open(stdout_path) as stdout_file:
                    ret.stdout = stdout_file.read()
            else:
                log.warning(
                    "No stdout for command %s that exited with status %s",
                    cmd_id, exit_status)

            if os.path.exists(stderr_path):
                with open(stderr_path) as stderr_file:
                    ret.stderr = stderr_file.read()
            else:
                log.warning(
                    "No stderr for command %s that exited with status %s",
                    cmd_id, exit_status)
        return ret.SerializeToString()
def import_vm(self, goldimages_directory, goldimage_name, vm_name,
              node_id=None):
    """
    Create a VM by deploying the OVF that belongs to a gold image.

    Args:
      goldimages_directory (str): Directory containing the gold images.
      goldimage_name (str): Name of the gold image subdirectory that holds
        the '.ovf' file.
      vm_name (str): Name passed to the OVF deployment.
      node_id (str|None): Node to create the VM on. If None, a node is picked
        at random from 'self.nodes()'.

    Returns:
      The Curie VM built from the deployment response, with its node ID set
      to 'node_id'.

    Raises:
      CurieException<kInternalError>: If the gold image directory does not
        contain exactly one '.ovf' file.
    """
    if node_id is None:
        node_id = random.choice(self.nodes()).node_id()

    ovf_paths = glob.glob(
        os.path.join(goldimages_directory, goldimage_name, "*.ovf"))
    if not ovf_paths:
        raise CurieException(
            CurieError.kInternalError,
            "Unable to locate .ovf file in '%s'" %
            os.path.join(goldimages_directory, goldimage_name))
    if len(ovf_paths) > 1:
        raise CurieException(
            CurieError.kInternalError,
            "Unique .ovf file expected. Found: '%s'" % ovf_paths)

    deploy_json = self._prism_client.deploy_ovf(
        vm_name, node_id, self._container_id,
        ovf_abs_path=ovf_paths[0],
        network_uuids=[self._network_id])
    vm = self.__vm_json_to_curie_vm(deploy_json)
    # Record the placement both in the cluster-level map and on the VM.
    self.__vm_uuid_host_uuid_map[vm.vm_id()] = node_id
    vm._node_id = node_id
    return vm
def power_off(self, sync_management_state=True):
    """
    Powers off the node using out-of-band management interface specified in
    the cluster's metadata.

    Args:
      sync_management_state (bool): If true, wait until the management
        software detects the power state is off. This is True by default in
        order to prevent other management server methods that require power
        to be on from failing unexpectedly.

    Raises:
      CurieTestException if no suitable metadata exists, CurieException on
      all other errors.
    """
    log.debug("Powering off node '%s'", self._node_id)
    if not self.power_management_util.power_off():
        raise CurieException(
            CurieError.kInternalError,
            "Failed to power off node '%s'" % self._node_id)

    if not sync_management_state:
        return

    # Wait until the management server state is synced with the hardware's
    # state.
    timeout_secs = 40 * 60
    powered_off = CurieUtil.wait_for(
        lambda: not self.is_powered_on_soft(sync_with_oob=True),
        "management server power state to sync to off for node: %s" %
        self.node_id(),
        timeout_secs, poll_secs=5)
    if not powered_off:
        # Report the timeout that was actually used; the message previously
        # hard-coded "300s" while waiting 2400 seconds.
        raise CurieException(
            CurieError.kInternalError,
            "Failed to sync management server power state after %ds" %
            timeout_secs)
def discover_clusters_vcenter(address, username, password, ret):
    """
    Populate 'ret' with the cluster inventory discovered from a vCenter.

    Every discovered cluster is tagged with management server type kVcenter
    and the version reported by the vCenter connection.

    Args:
      address (str): Address of the management server.
      username (str): Name of user of the management server.
      password (str): Password of user of the management server
      ret (DiscoverClustersV2Ret): Return proto to be populated.

    Raises:
      CurieException<kInternalError>: On socket errors while connecting, or
        if vCenter rejects the credentials.
    """
    conn = None
    try:
        conn = SmartConnectNoSSL(host=address, user=username, pwd=password)
        log.debug("Connected to vCenter %s", address)
        inventory = DiscoveryUtil.compute_vcenter_cluster_inventory(
            conn.content.rootFolder)
        DiscoveryUtil._fill_vcenter_cluster_inventory_v2(inventory, ret)
        all_clusters = (
            cluster
            for collection in ret.cluster_inventory.cluster_collection_vec
            for cluster in collection.cluster_vec)
        for cluster in all_clusters:
            mgmt_server = cluster.management_server
            mgmt_server.type = mgmt_server.kVcenter
            mgmt_server.version = conn.content.about.version
    except socket.error:
        # The plain old socket errors don't indicate which connection.
        raise CurieException(CurieError.kInternalError,
                             "Could not connect to vCenter at %s" % address)
    except vim.fault.InvalidLogin:
        # The failures back from vSphere don't provide the best experience.
        raise CurieException(
            CurieError.kInternalError,
            "Incorrect username or password for vCenter at %s" % address)
    finally:
        # Only disconnect if the connection was actually established.
        if conn is not None:
            Disconnect(conn)
def _send_rpc_sync_with_retries(self, method_name, arg, initial_timeout_secs,
                                max_retries):
    """
    Send an RPC via '_send_rpc_sync', retrying on kRetry and kTimeout.

    The RPC is attempted up to 'max_retries' + 1 times. After a kRetry
    failure the call sleeps for the current timeout before retrying; after a
    kTimeout it retries immediately. The timeout doubles after each failed
    attempt, capped at the larger of
    CURIE_CLIENT_DEFAULT_RETRY_TIMEOUT_CAP_SECS and 'initial_timeout_secs'.

    Args:
      method_name (str): Name of RPC method to be issued.
      arg (protobuf): Populated argument proto for the RPC 'method_name'.
      initial_timeout_secs (numeric): Timeout for initial RPC attempt.
        Subsequent attempts scale this value using exponential backoff.
      max_retries (int): Maximum number of retry attempts to allow. Set to 0
        if no retry is desired.

    Returns:
      (ret, err) as returned by '_send_rpc_sync'.

    Raises:
      CurieException: Re-raised unchanged for error codes other than kRetry
        and kTimeout; kInternalError once all attempts are exhausted.
    """
    # TODO: In the case of the CurieUnixAgent it's possible this may fail
    # with 'cmd_id already exists'. See if there are reasonable cases where
    # this will occur and handle them appropriately.
    failures = []
    timeout_secs = initial_timeout_secs
    for attempt in range(max_retries + 1):
        try:
            ret, err = self._send_rpc_sync(method_name, arg, timeout_secs)
            # TODO: See about the convention for multiple error codes.
            if err and err.error_codes[-1] == CurieError.kRetry:
                raise CurieException(err.error_codes[-1], str(err))
            return ret, err
        except CurieException as exc:
            retryable = exc.error_code in [CurieError.kRetry,
                                           CurieError.kTimeout]
            if not retryable:
                raise
            failures.append("Attempt %s: %s" % (attempt, str(exc)))
            if attempt >= max_retries:
                # No attempts left; fall through to the final error below.
                break
            if exc.error_code == CurieError.kRetry:
                log.warning(
                    "RPC failed. Retrying after '%s' seconds (%s of %s attempts)",
                    timeout_secs, 1 + attempt, max_retries)
                time.sleep(timeout_secs)
            else:
                CHECK_EQ(exc.error_code, CurieError.kTimeout)
                log.warning(
                    "RPC timed out. Retrying (%s of %s attempts)",
                    1 + attempt, max_retries)
            # Increase the timeout for the next attempt using exponential
            # backoff. We impose a cap of
            # CURIE_CLIENT_DEFAULT_RETRY_TIMEOUT_CAP_SECS secs on this unless
            # the base RPC timeout is already greater than that.
            timeout_cap_secs = max(CURIE_CLIENT_DEFAULT_RETRY_TIMEOUT_CAP_SECS,
                                   initial_timeout_secs)
            timeout_secs = min(2 * timeout_secs, timeout_cap_secs)

    failures.append("Exhausted retry attempts")
    raise CurieException(CurieError.kInternalError,
                         "RPC failed:\n%s" % "\n".join(failures))
def __is_ready(self):
    """
    See public method 'is_ready' documentation for further details.

    Returns:
      (bool) True only if the host reports health state "NORMAL", its CVM is
      powered on, and Genesis reports the node as "up". False otherwise,
      including when the Genesis query fails.

    Raises:
      CurieException<kClusterApiError>: If no CVM entities are returned, or
        none of them matches the host's CVM IP.
    """
    host = self.__api.hosts_get(host_ip=self.__node.node_ip(),
                                projection="HEALTH")
    state = host.get("state").strip().upper()
    log.debug("Host '%s' in state: %s", self.__node.node_id(), state)
    if state != "NORMAL":
        log.warning("Host '%s' reports abnormal health state: %s",
                    self.__node.node_id(), state)
        return False

    ret = self.__api.vms_get(cvm_only=True)
    if not ret["entities"]:
        raise CurieException(CurieError.kClusterApiError,
                             "No CVM entities found")

    cvm_ip = host["serviceVMExternalIP"]

    def _owns_cvm_ip(dto):
        return any(addr.startswith(cvm_ip) for addr in dto["ipAddresses"])

    # First CVM entity with an address matching the host's CVM IP.
    cvm_dto = next((dto for dto in ret["entities"] if _owns_cvm_ip(dto)),
                   None)
    if cvm_dto is None:
        raise CurieException(CurieError.kClusterApiError,
                             "No entity for CVM: %s" % cvm_ip)
    log.debug("found entity for cvm: %s", cvm_ip)

    log.info("CVM %s power state %s", cvm_ip, cvm_dto["powerState"])
    if cvm_dto["powerState"].strip().lower() != "on":
        return False

    try:
        status = self.__api.genesis_node_services_status()
        # Check overall node status. If it's determined we only care about a
        # subset of services, we can alternatively iterate through the list
        # 'services' to check PIDs and error messages per-service.
        # State may be a CSV list such as "Up, Zeus Leader".
        reported_states = status["state"].strip().lower().split(",")
        return "up" in [item.strip() for item in reported_states]
    except Exception:
        # If Prism is not up and running, the above will raise an exception
        # when failing to successfully connect and issue the RPC.
        log.warning("Failed to query Genesis on node '%s'",
                    self.__node.node_id(), exc_info=True)
        return False
def cmd_execute_sync(self, arg, timeout_secs, include_output=False):
    """
    Simulate synchronous execution of the CmdExecute request 'arg'.

    Sends a CmdExecute RPC to start the command, then polls CmdStatus once
    per second until the command leaves the running state or 'timeout_secs'
    elapses (measured from just before the CmdExecute RPC is sent).

    Args:
      arg: Populated CmdExecute request proto; 'cmd_id' identifies the
        command to poll.
      timeout_secs (numeric): Maximum time to wait for a terminal state.
      include_output (bool): Whether stdout and stderr should be requested.

    Returns:
      (exit_status, stdout, stderr) tuple for the command. 'stdout' and
      'stderr' are None when the status response does not carry them.

    Raises:
      CurieException: With the first error code/message of a failed RPC,
        kTimeout if the command does not finish in time, or kInternalError if
        the terminal status has no exit status.
    """
    start_secs = time.time()
    # Send a CmdExecute RPC to start the command.
    _, execute_err = self.CmdExecute(arg)
    if execute_err is not None:
        raise CurieException(execute_err.error_codes[0],
                             execute_err.error_msgs[0])

    status_arg = charon_agent_interface_pb2.CmdStatusArg()
    status_arg.cmd_id = arg.cmd_id
    status_arg.include_output = include_output
    status_ret = None
    # Poll until the command reaches a terminal state or it times out.
    while True:
        status_ret, status_err = self.CmdStatus(status_arg)
        if status_err is not None:
            raise CurieException(status_err.error_codes[0],
                                 status_err.error_msgs[0])
        if (status_ret.cmd_status.state !=
                charon_agent_interface_pb2.CmdStatus.kRunning):
            # Command is in a terminal state.
            break
        if (time.time() - start_secs) > timeout_secs:
            raise CurieException(
                CurieError.kTimeout,
                "Timeout waiting for command %s" % arg.cmd_id)
        time.sleep(1)

    # Check that we have the exit status for the command.
    CHECK(status_ret is not None)
    if not status_ret.cmd_status.HasField("exit_status"):
        raise CurieException(
            CurieError.kInternalError,
            "Missing exit status for command %s" % arg.cmd_id)

    # Return an (exit_status, stdout, stderr) tuple for the command.
    exit_status = status_ret.cmd_status.exit_status
    stdout = status_ret.stdout if status_ret.HasField("stdout") else None
    stderr = status_ret.stderr if status_ret.HasField("stderr") else None
    return (exit_status, stdout, stderr)
def test_validate_oob_config(self, mock_ping, mock_status):
    """
    Exercise DiscoveryUtil.validate_oob_config with nodes that have no OOB
    configuration, healthy IPMI nodes, a failing ping, and a failing status
    query.
    """
    proto_patch_encryption_support(CurieSettings)

    def _four_node_cluster(node_proto):
        # Build a cluster of four copies of 'node_proto' with IDs "0".."3".
        cluster = CurieSettings.Cluster()
        for index in range(4):
            node = cluster.cluster_nodes.add()
            node.CopyFrom(node_proto)
            node.id = str(index)
        return cluster

    def _reset_mocks():
        mock_ping.reset_mock()
        mock_status.reset_mock()

    # Nodes without OOB configuration must not trigger any OOB calls.
    cluster_pb = _four_node_cluster(self._no_oob_node_proto)
    DiscoveryUtil.validate_oob_config(cluster_pb)
    self.assertEqual(mock_ping.call_count, 0)
    self.assertEqual(mock_status.call_count, 0)

    # With IPMI nodes and a successful ping, each node is pinged and queried.
    cluster_pb = _four_node_cluster(self._ipmi_node_proto)
    node_count = len(cluster_pb.cluster_nodes)
    mock_ping.return_value = True
    DiscoveryUtil.validate_oob_config(cluster_pb)
    self.assertEqual(mock_ping.call_count, node_count)
    self.assertEqual(mock_status.call_count, node_count)

    _reset_mocks()
    mock_ping.side_effect = [True, False, True, True]
    with self.assertRaises(CurieException):
        DiscoveryUtil.validate_oob_config(cluster_pb)
    # We expect that the first ping succeeds and then the second fails. There
    # should be an exception after the second ping attempt. If ping fails,
    # the expectation is that the chassis status won't be called.
    self.assertEqual(mock_ping.call_count, 2)
    self.assertEqual(mock_status.call_count, 1)

    _reset_mocks()
    mock_ping.return_value = True
    mock_ping.side_effect = None
    mock_status.side_effect = [
        {},
        CurieException(CurieError.kOobAuthenticationError, "AuthError"),
        {},
        CurieException(CurieError.kInternalError, "SomeOtherError"),
    ]
    with self.assertRaises(CurieException):
        DiscoveryUtil.validate_oob_config(cluster_pb)
    self.assertEqual(mock_ping.call_count, 2)
    self.assertEqual(mock_status.call_count, 2)
def _send_rpc_sync(self, method_name, arg, timeout_secs):
    """
    Synchronously issue RPC 'method_name' with argument 'arg'.

    The serialized argument is POSTed to the client's URL with the method
    name carried in the "X-Rpc-Method" header.

    Args:
      method_name (str): Name of RPC method to be issued.
      arg (protobuf): Populated argument proto for the RPC 'method_name'.
      timeout_secs (float): Desired RPC timeout in seconds. Expected a
        non-negative value coercible to float.

    Returns:
      (ret, err) On HTTP 200, 'ret' is the deserialized return proto and
        'err' is None. For any other status, 'ret' is None and 'err' is the
        deserialized ErrorRet proto.

    Raises:
      CurieException<kInternalError>: If issuing the request raises
        ValueError or TypeError.
      CurieException<kTimeout>: On timeout.
      CurieException<kRetry>: On any other requests exception.
    """
    rpc_headers = {
        "Content-Type": "application/x-rpc",
        "X-Rpc-Method": method_name,
    }
    try:
        resp = requests.post(self.__url,
                             headers=rpc_headers,
                             data=arg.SerializeToString(),
                             timeout=timeout_secs)
    except (ValueError, TypeError) as exc:
        log.exception("Failed to issue RPC")
        raise CurieException(CurieError.kInternalError,
                             "Failed to issue RPC: %s" % exc)
    except (requests.exceptions.Timeout, socket.timeout):
        log.exception("RPC timed out")
        raise CurieException(
            CurieError.kTimeout,
            "RPC '%s' timed out after %f seconds" %
            (method_name, timeout_secs))
    except requests.exceptions.RequestException as exc:
        log.exception("Exception in RPC request")
        raise CurieException(CurieError.kRetry, str(exc))

    if resp.status_code != 200:
        # Error, expect serialized CurieError proto.
        err = ErrorRet()
        err.ParseFromString(resp.content)
        return None, err

    # Succeeded, expect appropriate serialized return proto.
    ret = self._service.get_ret_proto(method_name)()
    ret.ParseFromString(resp.content)
    return ret, None
def test_update_metadata_if_cluster_contains_extra_nodes(
        self, m_NutanixRestApiClient):
    """
    update_metadata must raise CurieTestException when the Curie metadata
    lists a node that cannot be located in the AHV cluster.
    """
    cluster_uuid = "fake-cluster-id"
    node_uuids = ["fake_node_uuid_%d" % index for index in range(4)]

    m_prism_client = mock.MagicMock(spec=NutanixRestApiClient)
    m_NutanixRestApiClient.from_proto.return_value = m_prism_client

    def fake_clusters_get(**kwargs):
        # A lookup by ID returns the cluster itself; otherwise a listing.
        cluster_data = {"clusterUuid": cluster_uuid}
        return (cluster_data if kwargs.get("cluster_id")
                else {"entities": [cluster_data]})

    m_prism_client.clusters_get.side_effect = fake_clusters_get
    m_prism_client.hosts_get.return_value = {
        "entities": [{"clusterUuid": cluster_uuid, "uuid": node_uuid}
                     for node_uuid in node_uuids]
    }

    extra_node = self.cluster_metadata.cluster_nodes.add()
    extra_node.id = "fake_node_extra"
    cluster = AcropolisCluster(self.cluster_metadata)

    lookup_failure = CurieException(CurieError.kInvalidParameter,
                                    "Unable to locate host.")
    with mock.patch.object(cluster, "identifier_to_node_uuid") as m_itnu:
        # The four known nodes resolve; the lookup of the extra node fails.
        m_itnu.side_effect = node_uuids + [lookup_failure]
        with self.assertRaises(CurieTestException) as ar:
            cluster.update_metadata(False)
        self.assertIn(
            "Cause: Node with ID 'fake_node_extra' is in the Curie cluster "
            "metadata, but not found in the AHV cluster.\n"
            "\n"
            "Impact: The cluster configuration is invalid.\n"
            "\n"
            "Corrective Action: Please check that all of the nodes in the Curie "
            "cluster metadata are part of the AHV cluster. For example, if the "
            "cluster configuration has four nodes, please check that all four nodes "
            "are present in the AHV cluster.\n"
            "\n"
            "Traceback (most recent call last):",
            str(ar.exception))
def _update_cluster_version_info_prism(cluster_pb):
    """
    See 'DiscoveryUtil.update_cluster_version_info' for info.

    Fills in the Prism version and appends one AHV version entry to
    'cluster_pb' for every host that belongs to the cluster.

    Args:
      cluster_pb: Cluster proto to update in place.

    Raises:
      CurieException<kInvalidParameter>: If a host in the cluster does not
        report hypervisor type "kKvm".
    """
    mgmt_info = cluster_pb.cluster_management_server_info.prism_info
    software_info = cluster_pb.cluster_software_info.nutanix_info
    hyp_info = cluster_pb.cluster_hypervisor_info.ahv_info

    cli = NutanixRestApiClient.from_proto(mgmt_info, timeout_secs=10)
    DiscoveryUtil._update_cluster_version_info_nos(cli, cluster_pb)
    mgmt_info.prism_version = software_info.version

    # Hosts of other clusters returned by the same Prism are ignored.
    cluster_hosts = (
        entity for entity in cli.hosts_get().get("entities", [])
        if entity["clusterUuid"] == software_info.cluster_uuid)
    for host in cluster_hosts:
        # We only support homogeneous AHV clusters via Prism.
        if host.get("hypervisorType") != "kKvm":
            raise CurieException(CurieError.kInvalidParameter,
                                 "Specified cluster is mixed hypervisor")
        # Strip any "Nutanix " prefix from AHV version strings.
        raw_version = DiscoveryUtil._get_hyp_version_for_host(host)
        hyp_info.version.append(re.sub("^Nutanix ", "", raw_version))
def __send_racadm_command_with_retries(self, cmd, max_retries=5):
    """
    Issue 'cmd', retrying on failure up to 'max_retries' times.

    The wait before each retry doubles, capped at
    '_MAX_RETRY_INTERVAL_SECS'. On an authentication error a fresh session
    cookie is requested before retrying.

    Args:
      cmd (str): Command to execute.
      max_retries (int): Optional. Maximum number of retry attempts.

    Returns:
      (dict) parsed XML response

    Raises:
      CurieException<kInternalError> once all attempts have failed.
    """
    wait_secs = 1
    total_attempts = max_retries + 1
    for attempt in range(total_attempts):
        try:
            return self.__send_racadm_command(cmd)
        except CurieException as exc:
            log.exception("'%s' failed", cmd)
            if attempt >= max_retries:
                # Out of retries; report the failure below.
                continue
            log.info("Retrying (%d of %d attempts)", attempt + 1, max_retries)
            if exc.error_code == CurieError.kOobAuthenticationError:
                log.debug("Possible session expiration, reauthenticating")
                self.__get_session_cookie(cached_ok=False)
            wait_secs = min(self._MAX_RETRY_INTERVAL_SECS, 2 * wait_secs)
            time.sleep(wait_secs)

    raise CurieException(
        CurieError.kInternalError,
        "Failed to execute '%s' after %d retries" % (cmd, max_retries))
def identifier_to_node_uuid(cls, rest_client, node_id_name_or_ip):
    """
    Resolve a node identifier to the UUID reported by 'rest_client'.

    Lookups are tried in this order: IPv4 address, UUID, Prism host name,
    and finally the IP address obtained by resolving the identifier as a
    hostname.

    Args:
      rest_client: Client whose 'hosts_get' is used for the lookups.
      node_id_name_or_ip (str): IPv4 address, UUID, Prism name or hostname.

    Returns:
      The "uuid" entry of the matching host.

    Raises:
      CurieException<kInvalidParameter>: If the identifier cannot be resolved
        to an IP address. Exceptions from the IPv4 lookups are propagated.
    """
    # 'hosts_get' raises appropriate exceptions on failure, so reading the
    # 'uuid' key of a returned host is safe.
    if CurieUtil.is_ipv4_address(node_id_name_or_ip):
        return rest_client.hosts_get(host_ip=node_id_name_or_ip)["uuid"]

    if CurieUtil.is_uuid(node_id_name_or_ip):
        try:
            host = rest_client.hosts_get(host_id=node_id_name_or_ip)
            return host["uuid"]
        except Exception:
            log.debug("Failed to lookup node via UUID '%s'",
                      node_id_name_or_ip)

    # The identifier is not an IPv4 address, or the UUID lookup failed. It
    # may be either an unresolved hostname or a Prism name. Try the Prism
    # name first to avoid potential overhead in name resolution.
    try:
        host = rest_client.hosts_get(host_name=node_id_name_or_ip)
        return host["uuid"]
    except Exception:
        log.debug("Failed to lookup node via Prism name '%s'",
                  node_id_name_or_ip)

    try:
        resolved_ip = CurieUtil.resolve_hostname(node_id_name_or_ip)
    except Exception:
        raise CurieException(
            CurieError.kInvalidParameter,
            "Unable to resolve IP address for '%s'" % node_id_name_or_ip)

    # Allow this to raise its own exception on failure, as there are no
    # further methods to which we can fall back.
    return rest_client.hosts_get(host_ip=resolved_ip)["uuid"]
def __get_session_cookie(self, cached_ok=True, max_retries=5):
    """
    Return a session cookie, authenticating first if necessary.

    The whole operation runs while holding 'self.LOCK'. A newly established
    session ID is stored in 'HOST_SESSION_ID_MAP' under 'self.host'.

    Args:
      cached_ok (bool): Optional. If True, use cached session if present.
      max_retries (int): Maximum number of authentication attempts. Each
        failed attempt is followed by a 5 second sleep.

    Returns:
      (dict) Map "Cookie" -> <formatted session cookie>

    Raises:
      CurieException<kInternalError>: If no session could be established
        within 'max_retries' attempts.
    """
    cookie_format = "sid=%s; path=/cgi-bin/"
    with self.LOCK:
        cached_id = (self.HOST_SESSION_ID_MAP.get(self.host)
                     if cached_ok else None)
        if cached_id:
            return {"Cookie": cookie_format % cached_id}

        for attempt in range(1, max_retries + 1):
            try:
                session_id = self.__authenticate()
            except CurieException as exc:
                log.debug("Failed to establish session: %s (%d of %d attempts)",
                          exc, attempt, max_retries)
                time.sleep(5)
                continue
            self.HOST_SESSION_ID_MAP[self.host] = session_id
            return {"Cookie": cookie_format % session_id}

        err_msg = ("Failed to establish valid iDRAC session within %d attempts"
                   % max_retries)
        log.error(err_msg)
        raise CurieException(CurieError.kInternalError, err_msg)
def __parse_response(self, raw_resp):
    """
    Parse 'raw_resp' whose content is expected to be an XML-formatted
    response.

    The response root tag must match the root tag of this request, and the
    root must contain exactly one child, a <RESP> element.

    Args:
      raw_resp (requests.Response): response to parse.

    Returns:
      (RacAdmResponse) wrapper around the <RESP> element.

    Raises:
      CurieException<kInternalError> if the content is not well-formed XML or
      does not have the expected structure.
    """
    # Fail immediately on unparsable content. Previously the parse error was
    # recorded and execution continued into checks that dereferenced a
    # variable which had never been assigned.
    try:
        root = etree.fromstring(raw_resp.content)
    except etree.Error as exc:
        raise CurieException(CurieError.kInternalError, str(exc))

    # Explicit check instead of 'assert', which is stripped under 'python -O'.
    expected_tag = self.node().getroottree().getroot().tag
    if root.tag != expected_tag:
        raise CurieException(
            CurieError.kInternalError,
            "Invalid response, root tags do not match between request and "
            "response. (req: %s, resp: %s)" % (expected_tag, root.tag))

    resp_node = list(root)
    error = ""
    if not resp_node:
        error = "Response content missing <RESP> body"
    elif len(resp_node) > 1:
        error = "Invalid response content returned: '%s'" % raw_resp.content
    elif resp_node[0].tag != "RESP":
        error = ("Invalid response. First child is not a <RESP> tag. (found %s)"
                 % resp_node[0].tag)
    if error:
        raise CurieException(CurieError.kInternalError, error)
    return RacAdmResponse(resp_node[0])
def _validate(name, val, type_or_types, values, func, err_code, *args,
              **kwargs):
    """
    Helper used by 'validate_parameter' and 'validate_return'. See either for
    more documentation.

    Checks are applied in order: type, allowed values, then 'func'.

    Args:
      name (str): Name of the value being validated; used in error messages.
      val: The value to validate.
      type_or_types: If truthy, 'val' must be an instance of this type or one
        of these types.
      values: If truthy, 'val' must be contained in it.
      func (callable|None): If truthy, called with '*args' and '**kwargs';
        a falsy result fails validation.
      err_code: Error code for the CurieException raised on failure.

    Raises:
      CurieException<err_code> if any check fails.
    """
    log.trace("Validating '%s': %s", name, val)
    log.trace("valid_types=%s, valid_values=%s, valid_func=%s",
              type_or_types, values, func)
    try:
        if type_or_types and not isinstance(val, type_or_types):
            if isinstance(type_or_types, collections.Iterable):
                # Avoid naming the loop variable 'type'; in Python 2 it leaks
                # out of the comprehension and shadows the builtin.
                msg = "Expected one of '%s'" % ", ".join(
                    [valid_type.__name__ for valid_type in type_or_types])
            else:
                msg = "Expected '%s'" % type_or_types.__name__
            raise AssertionError("Invalid '%s' type '%s': %s" %
                                 (name, val.__class__.__name__, msg))

        if values and val not in values:
            if isinstance(values, (list, tuple)):
                # Valid values are not necessarily strings; convert them so
                # that building the message cannot raise a TypeError.
                msg = "Expected one of '%s'" % ", ".join(
                    [str(valid_value) for valid_value in values])
            else:
                msg = "Expected '%s'" % values
            raise AssertionError("Invalid '%s' value '%s': %s" %
                                 (name, val, msg))

        if func and not func(*args, **kwargs):
            raise AssertionError("Invalid '%s' value '%s': Functional validation "
                                 "failed" % (name, val))
    except AssertionError as exc:
        # An AssertionError raised by 'func' itself is translated as well.
        raise CurieException(err_code, str(exc))
def send(self):
    """
    Issues command represented by this instance to iDRAC at 'self.host'.

    A "login" request is sent when no cookie is set, otherwise an "exec"
    request. The request is POSTed without certificate verification.

    Returns:
      Parsed response as produced by '__parse_response'.

    Raises:
      CurieException<kInternalError> if the HTTP status is not OK. Errors
      from parsing the response are propagated.
    """
    is_login = not self._cookie
    cmd_type = "login" if is_login else "exec"
    path = self.URL.format(idrac_ip=self.host, cmd_type=cmd_type)

    # Don't log raw login command as credentials are in plaintext.
    if not is_login:
        log.trace("Sending XML-HTTP request to: %s\n"
                  "\tHeaders: %s\n"
                  "\tBody: %s", path, self.headers(), self.xml())

    raw_resp = requests.post(path, data=self.xml(), headers=self.headers(),
                             verify=False)

    # pylint takes issue with the requests.status_codes.codes LookupDict.
    # pylint: disable=no-member
    request_ok = raw_resp.status_code == requests.status_codes.codes.OK
    if not request_ok:
        raise CurieException(
            CurieError.kInternalError,
            "Error sending XML-HTTP request to iDRAC: %s %s" %
            (raw_resp.status_code, raw_resp.reason))

    log.trace("Received raw response:\n"
              "Headers: %s\n"
              "Body: %s", raw_resp.headers, raw_resp.content)
    return self.__parse_response(raw_resp)
def __api_cmd_remove(self, arg):
    """
    Handle a CmdRemove request: kill the command if it has a PID and retire
    its directory.

    An unknown command ID is not treated as an error, since the request may
    be a retry of one that already succeeded.

    Args:
      arg: Request proto; 'cmd_id' is read from it.

    Returns:
      Serialized CmdRemoveRet proto.

    Raises:
      CurieException<kInvalidParameter>: If the request body is empty.
    """
    if flask.request.data == "":
        raise CurieException(CurieError.kInvalidParameter, "Empty request")
    cmd_id = arg.cmd_id
    cmd_dir = self.cmd_dir(cmd_id)
    with self.__lock:
        ret = charon_agent_interface_pb2.CmdRemoveRet()
        cmd_state = self.__cmd_map.get(cmd_id)
        if cmd_state is None:
            # It's possible this is a retry of a request that previously
            # succeeded.
            log.warning("Command %s not found", cmd_id)
            return ret.SerializeToString()

        if cmd_state.pid is not None:
            # Kill the command wrapper process group which will kill the
            # curie_cmd_wrapper process and all of its descendants.
            log.info("Killing command %s as part of a remove, PID %d",
                     cmd_id, cmd_state.pid)
            try:
                os.killpg(cmd_state.pid, signal.SIGKILL)
            except OSError as ex:
                # The only tolerated failure is that the process group no
                # longer exists.
                CHECK_EQ(ex.errno, errno.ESRCH, msg=str(ex))

        # Move the command's directory to the garbage directory to be garbage
        # collected later.
        garbage_cmd_dir = os.path.join(self.__garbage_dir,
                                       os.path.basename(cmd_dir))
        os.rename(cmd_dir, garbage_cmd_dir)
        del self.__cmd_map[cmd_id]
        return ret.SerializeToString()
def __init__(self, valid_types=None, valid_values=None, valid_func=None,
             err_code=CurieError.kInvalidParameter):
    """
    Store the criteria used to validate a return value.

    Args:
      valid_types (list|None): If not None, verify that the return value is
        of one of the provided types.
      valid_values (list|None): If not None, verify that the return value is
        equal to one of the provided values.
      valid_func (callable|None): If not None, a callable accepting the
        wrapped function's arguments, used for custom validation.
      err_code (CurieError.Type): Optional. CurieException error code to use
        on failure.

    Raises:
      CurieException<kInternalError>: If 'valid_func' is truthy but not
        callable.
    """
    self.valid_types = valid_types
    self.valid_values = valid_values
    self.valid_func = valid_func
    self.err_code = err_code

    func_is_usable = (not valid_func) or callable(valid_func)
    if not func_is_usable:
        raise CurieException(
            CurieError.kInternalError,
            "Skipping uncallable object provided for return "
            "value validation")
def __api_cmd_execute(self, arg):
    """
    Handle a CmdExecute request by starting the command as the given user.

    A command ID that is already known is not started again, since the
    request may be a retry of one that already succeeded.

    Args:
      arg: Request proto; 'user', 'cmd_id' and 'cmd' are read from it.

    Returns:
      Serialized CmdExecuteRet proto.

    Raises:
      CurieException<kInvalidParameter>: If the request body is empty or the
        user does not exist.
    """
    if flask.request.data == "":
        raise CurieException(CurieError.kInvalidParameter, "Empty request")

    try:
        user_entry = pwd.getpwnam(arg.user)
    except KeyError:
        raise CurieException(CurieError.kInvalidParameter,
                             "Invalid user %s" % arg.user)
    cmd_uid = user_entry.pw_uid

    ret = charon_agent_interface_pb2.CmdExecuteRet()
    with self.__lock:
        if arg.cmd_id in self.__cmd_map:
            # It's possible this is a retry of a request that previously
            # succeeded.
            log.warning("Command %s already exists", arg.cmd_id)
        else:
            self.__execute_cmd(arg.cmd_id, arg.cmd, cmd_uid)
    return ret.SerializeToString()
def __execute_command_with_retries(self, cmd, max_retries=5):
    """
    Execute 'cmd', retrying on error up to 'max_retries' times.

    An attempt fails if the command returns a non-zero value or raises
    CurieException. The wait before each retry doubles, capped at
    '_MAX_RETRY_INTERVAL_SECS'.

    Args:
      cmd: Command passed to '__execute_command'.
      max_retries (int): Optional. Maximum number of retry attempts.

    Returns:
      (tuple): (stdout, stderr)

    Raises:
      CurieException<kInternalError> if 'cmd' does not succeed within
      'max_retries' + 1 calls; the message describes the last failure.
    """
    wait_secs = 1
    for attempt in range(max_retries + 1):
        try:
            rv, stdout, stderr = self.__execute_command(cmd)
            if rv == 0:
                return stdout, stderr
            failure = (
                "Error executing '%s':\n\trv=%s\n\tstdout=%s\n\tstderr=%s" %
                (cmd, rv, stdout, stderr))
        except CurieException as exc:
            failure = "'%s' failed: '%s'" % (cmd, exc)

        if attempt >= max_retries:
            raise CurieException(CurieError.kInternalError, failure)

        log.error(failure)
        log.info("Retrying (%d of %d retries)", attempt + 1, max_retries)
        wait_secs = min(self._MAX_RETRY_INTERVAL_SECS, 2 * wait_secs)
        time.sleep(wait_secs)
def validate_host_connectivity(cluster_pb):
    """
    Verify that every node of the cluster answers a ping.

    Args:
      cluster_pb: Cluster proto used to select and construct the cluster
        object whose nodes are checked.

    Raises:
      CurieException<kInternalError>: For the first node whose IP is not
        reachable.
    """
    cluster = get_cluster_class(cluster_pb)(cluster_pb)
    for node in cluster.nodes():
        if CurieUtil.ping_ip(node.node_ip()):
            continue
        raise CurieException(
            CurieError.kInternalError,
            "Host %s - %s not reachable." % (node.node_id(), node.node_ip()))
def _CHECK_BINARY_OP(x, y, op, symbol, msg="", **kwargs):
    """
    Check that the binary predicate 'op' holds for 'x' and 'y'.

    Args:
      x: Left operand.
      y: Right operand.
      op (callable): Two-argument predicate; a truthy result passes.
      symbol (str): Textual form of the operator, used in the error message.
      msg (str): Optional extra context appended to the error message.
      **kwargs: Accepted for call compatibility; not used here.

    Raises:
      CurieException<kInternalError>: If 'op(x, y)' is falsy.
    """
    if op(x, y):
        return
    # Build a single string. The message fragments were previously passed to
    # the exception as a list instead of being joined.
    error_msg = "%s %s %s failed" % (x, symbol, y)
    if msg:
        error_msg = "%s, %s" % (error_msg, msg)
    raise CurieException(CurieError.kInternalError, error_msg)
def get_management_software_value_for_attribute(cls, attr):
    """
    Get management software specific value for 'attr'.

    Args:
      attr (str): Name of an attribute on 'NodePropertyNames'.

    Returns:
      The entry for 'attr' in the map returned by
      'get_management_software_property_name_map'.

    Raises:
      CurieException<kInvalidParameter>: If 'NodePropertyNames' has no such
        attribute, or the attribute is None.
    """
    # Supply a default so an unknown name reaches the CurieException below
    # instead of escaping as an AttributeError.
    if getattr(NodePropertyNames, attr, None) is None:
        raise CurieException(CurieError.kInvalidParameter,
                             "Unknown node property '%s'" % attr)
    return cls.get_management_software_property_name_map()[attr]
def test_RunCommand_fail_on_error(self):
    """
    RunCommand must propagate a CurieException raised by a VM's
    'execute_sync'.
    """
    for vm in self.vms:
        # The side effect takes precedence over the return value, so every
        # call to 'execute_sync' raises.
        vm.execute_sync.configure_mock(
            return_value=(1, "stdout", "stderr"),
            side_effect=CurieException(CurieError.kInternalError, "Message"))

    step = steps.vm_group.RunCommand(self.scenario, self.vm_group._name, "ps")
    with self.assertRaises(CurieException):
        step()
def _update_cluster_version_info_vcenter(cluster_pb):
    """
    See 'DiscoveryUtil.update_cluster_version_info' for info.

    Fills in ESX versions and builds plus the vCenter version and build. If
    the cluster proto carries Nutanix software info, a CVM with an IPv4
    address is located and used to update the Nutanix version info as well.

    Args:
      cluster_pb: Cluster proto to update in place.

    Raises:
      CurieException<kInvalidParameter>: If the cluster is not found in
        vCenter, or (for Nutanix clusters) no CVM, or no CVM with an IPv4
        address, can be located.
    """
    mgmt_info = cluster_pb.cluster_management_server_info.vcenter_info
    hyp_info = cluster_pb.cluster_hypervisor_info.esx_info

    with VsphereVcenter.from_proto(mgmt_info) as vcenter:
        vim_dc = vcenter.lookup_datacenter(mgmt_info.vcenter_datacenter_name)
        vim_cluster = vcenter.lookup_cluster(vim_dc,
                                             mgmt_info.vcenter_cluster_name)
        if vim_cluster is None:
            raise CurieException(CurieError.kInvalidParameter,
                                 "Cluster not found in specified vCenter")

        # Each pair is indexed as (version, build).
        esx_version_pairs = vcenter.get_esx_versions(vim_cluster)
        hyp_info.version.extend([pair[0] for pair in esx_version_pairs])
        hyp_info.build.extend([pair[1] for pair in esx_version_pairs])

        version_info = vcenter.get_vcenter_version_info()
        mgmt_info.vcenter_version, mgmt_info.vcenter_build = version_info

        if not cluster_pb.cluster_software_info.HasField("nutanix_info"):
            return

        cvms = [vim_vm for vim_vm in vcenter.lookup_vms(vim_cluster)
                if vcenter.vim_vm_is_nutanix_cvm(vim_vm)]
        if not cvms:
            raise CurieException(
                CurieError.kInvalidParameter,
                "Unable to locate any CVMs on cluster. Is this a Nutanix cluster?")

        # Use the first CVM that reports an IPv4 address.
        guest_ips = (get_optional_vim_attr(cvm.guest, "ipAddress")
                     for cvm in cvms)
        ip = next((addr for addr in guest_ips
                   if addr and CurieUtil.is_ipv4_address(addr)), None)
        if ip is None:
            raise CurieException(
                CurieError.kInvalidParameter,
                "Unable to locate any CVMs with IPv4 addresses on cluster")

        software_info = cluster_pb.cluster_software_info.nutanix_info
        cli = NutanixRestApiClient(
            ip,
            software_info.decrypt_field("prism_user"),
            software_info.decrypt_field("prism_password"))
        DiscoveryUtil._update_cluster_version_info_nos(cli, cluster_pb)
def create_vm(self, goldimages_directory, goldimage_name, vm_name, vcpus=1,
              ram_mb=1024, node_id=None, datastore_name=None, data_disks=()):
    """
    See 'Cluster.create_vm' for documentation.

    The gold image is deployed to the image service, a VM is created from it
    through Prism, and the call waits up to 60 seconds for the creation task.
    'datastore_name' is only logged by this implementation.

    Returns:
      The Curie VM for the created VM, with its node ID set to 'node_id'.

    Raises:
      CurieException<kManagementServerApiError>: If the create response does
        not contain a task UUID.
    """
    log.info(
        "Creating VM %s based on %s with %d vCPUs, %d MB RAM and %s "
        "disks on node %s in datastore %s ",
        vm_name, goldimage_name, vcpus, ram_mb, str(data_disks),
        str(node_id), datastore_name)
    image_uuid = self.deploy_goldimage_image_service(goldimages_directory,
                                                     goldimage_name)

    # This namedtuple hackery is to handle the expectations in vm.py which
    # expects information directly parsed from an OVF file.
    Units = namedtuple("Units", ["multiplier"])
    Disk = namedtuple("Disk", ["capacity", "units"])
    attach_disks = []
    for disk_gb in data_disks:
        attach_disks.append(Disk(disk_gb, Units(1024 * 1024 * 1024)))

    vm_desc = VmDescriptor(name=vm_name,
                           memory_mb=ram_mb,
                           num_vcpus=vcpus,
                           vmdisk_uuid_list=[image_uuid],
                           attached_disks=attach_disks,
                           container_uuid=self._container_id)

    # Create the VM
    log.info("Creating VM '%s' with %s MB RAM and %s vCPUs",
             vm_desc.name, vm_desc.memory_mb, vm_desc.num_vcpus)
    nic_create_spec = vm_desc.to_ahv_vm_nic_create_spec(self._network_id)
    nic_specs = [nic_create_spec["specList"][0]]
    resp = self._prism_client.vms_create(vm_desc, nic_specs)
    tid = resp.get("taskUuid")
    if not tid:
        raise CurieException(CurieError.kManagementServerApiError,
                             "Failed to deploy VM: %s" % resp)

    create_task = PrismTask.from_task_id(self._prism_client, tid)
    TaskPoller.execute_parallel_tasks(tasks=create_task, timeout_secs=60)

    task_json = self._prism_client.tasks_get_by_id(tid)
    vm_uuid = task_json["entityList"][0]["uuid"]

    # Make a Curie VM descriptor and assign it to the requested node
    vm_json = self._prism_client.vms_get_by_id(vm_uuid)
    vm = self.__vm_json_to_curie_vm(vm_json)
    vm._node_id = node_id
    return vm
def deploy_goldimage_image_service(self, goldimages_directory,
                                   goldimage_name):
    """
    Deploy a gold image to the image service.

    Waits up to 3600 seconds for the image creation task, then checks the
    image state up to five times, one second apart, until it is "active".

    Args:
      goldimages_directory (str): Directory containing the gold images.
      goldimage_name (str): Name of the gold image to deploy.

    Returns:
      str: ID of the created disk image.

    Raises:
      CurieException<kInternalError>: If the created image does not become
        active within the allowed number of checks.
    """
    arch = self.get_cluster_architecture()

    # Select a vdisk format to use. Currently PPC64LE goldimages are only
    # built using qcow2 format and the x86_64 in vmdk. We could have the
    # manager perform a conversion, but acropolis can already do the image
    # conversion for us.
    disk_format = (GoldImageManager.FORMAT_QCOW2
                   if arch == GoldImageManager.ARCH_PPC64LE
                   else GoldImageManager.FORMAT_VMDK)

    # Use the GoldImage manager to get a path to our appropriate goldimage
    goldimage_path = GoldImageManager(goldimages_directory).get_goldimage_path(
        goldimage_name, format_str=disk_format, arch=arch)
    log.debug("Deploying %s to cluster", goldimage_path)

    # Deploy the image to service
    disk_name, _ = os.path.splitext(os.path.basename(goldimage_path))
    img_uuid, tid, _ = self._prism_client.images_create(
        NameUtil.goldimage_vmdisk_name(disk_name, "os"),
        goldimage_path, self._container_id)
    create_task = PrismTask.from_task_id(self._prism_client, tid)
    TaskPoller.execute_parallel_tasks(tasks=create_task, timeout_secs=3600)

    # NB: Required due to possible AHV bug. See XRAY-225.
    num_images_get_retries = 5
    for attempt_num in range(num_images_get_retries):
        images_get_data = self._prism_client.images_get(image_id=img_uuid)
        image_state = images_get_data["image_state"]
        if image_state.lower() == "active":
            # Return the disk image
            return images_get_data["vm_disk_id"]
        log.info(
            "Waiting for created image to become active "
            "(imageState: %s, retry %d of %d)",
            image_state, attempt_num + 1, num_images_get_retries)
        log.debug(images_get_data)
        time.sleep(1)

    raise CurieException(
        CurieError.kInternalError,
        "Created image failed to become active within "
        "%d attempts" % num_images_get_retries)
def is_powered_on(self):
    """
    Checks whether chassis power state is 'on'.

    This implementation never returns a value; it always raises because
    out-of-band management is not configured.

    Raises:
      CurieException<kInvalidParameter>: Always.
    """
    reason = (
        "Attempted to make out-of-band management calls in an environment "
        "which has not been configured to support out-of-band management")
    raise CurieException(CurieError.kInvalidParameter, reason)