def __init__(self, cluster_metadata):
  # TODO (jklein): Would be nice to standardize this in a cleaner way.
  CHECK(cluster_metadata.cluster_hypervisor_info.HasField("ahv_info"))
  CHECK(cluster_metadata.cluster_software_info.HasField("nutanix_info"))
  CHECK(cluster_metadata.cluster_management_server_info.HasField(
    "prism_info"))
  # Prism information for the PE/PC server that manages this cluster.
  self._mgmt_server_info = \
    cluster_metadata.cluster_management_server_info.prism_info
  cluster_metadata.cluster_software_info.nutanix_info.prism_host = \
    self._mgmt_server_info.prism_host
  # Map of VM UUIDs to host UUIDs on which they should be scheduled.
  self.__vm_uuid_host_uuid_map = {}
  # ID caches.
  self.__cluster_id = None
  self.__container_id = None
  self.__network_id = None
  self.__host_ip_cvm_ip_map = None
  super(AcropolisCluster, self).__init__(cluster_metadata)
def __init__(self, vm_params):
  CHECK(vm_params.cluster)
  CHECK(vm_params.vm_id)
  # See the comments in VmParams for these fields.
  self._cluster = vm_params.cluster
  self._vm_id = vm_params.vm_id
  self._vm_name = vm_params.vm_name
  self._vm_ip = vm_params.vm_ip
  self._node_id = vm_params.node_id
  self._is_cvm = vm_params.is_cvm
def __produce_curie_metrics(self, stats_specific_responses, node):
  responses_by_counter_name = {}
  for metric in stats_specific_responses:
    responses_by_counter_name[metric["metric"]] = metric
  results = []
  for curie_metric in self.metrics():
    ahv_counter_name = self._curie_metric_to_metric_name(curie_metric)
    metric = responses_by_counter_name[ahv_counter_name]
    start_time_secs = int(metric["startTimeInUsecs"] / 1e6)
    interval_secs = int(metric["intervalInSecs"])
    values = metric["values"]
    offsets = [index * interval_secs for index in range(len(values))]
    timestamps = [start_time_secs + offset for offset in offsets]
    # If any values are None, remove them and their corresponding timestamps.
    timestamp_value_tuples = [tup for tup in zip(timestamps, values)
                              if tup[1] is not None]
    if timestamp_value_tuples:
      timestamps, values = zip(*timestamp_value_tuples)
    else:
      timestamps, values = [], []
    result = CurieMetric()
    result.CopyFrom(curie_metric)
    # TODO(ryan.hardin): Generalize unit conversion, move to utility module.
    if result.rate == CurieMetric.kPerSecond:
      # Convert units per interval into units per second.
      values = [(value / float(interval_secs)) for value in values]
    if result.unit == CurieMetric.kPercent:
      # Assume metric is in ppm (parts per million) - convert to percentage.
      values = [(value / 1e4) for value in values]
    elif result.unit == CurieMetric.kKilobytes:
      # Assume metric is in bytes - convert to kilobytes.
      values = [(value / float(2**10)) for value in values]
    elif (result.unit == CurieMetric.kMegahertz and
          result.name == CurieMetric.kCpuUsage):
      # Assume metric is in ppm (parts per million) - convert to total
      # megahertz.
      # TODO(ryan.hardin): Should node.cpu_capacity_in_hz ever return None?
      if node.cpu_capacity_in_hz is None:
        log.debug("node.cpu_capacity_in_hz returned None")
        timestamps, values = [], []
      else:
        values = [(cpu_ppm * node.cpu_capacity_in_hz / 1e12)
                  for cpu_ppm in values]
    CHECK(len(result.timestamps) == 0)
    result.timestamps.extend(timestamps)
    CHECK(len(result.values) == 0)
    result.values.extend([int(value) for value in values])
    results.append(result)
  return results
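# Illustrative sketch (not part of the original module): the unit conversions
# applied above, shown on hypothetical raw AHV samples. Assumes a 30-second
# sampling interval and a node with 24 GHz of total CPU capacity.
interval_secs = 30
cpu_capacity_in_hz = 24 * 10**9
cpu_busy_ppm = 250000                                     # 25% busy, in parts per million
cpu_percent = cpu_busy_ppm / 1e4                          # kPercent -> 25.0
cpu_megahertz = cpu_busy_ppm * cpu_capacity_in_hz / 1e12  # kMegahertz -> 6000.0
bytes_per_interval = 3 * 2**20                            # 3 MiB transferred in one interval
kilobytes = bytes_per_interval / float(2**10)             # kKilobytes -> 3072.0
per_second = bytes_per_interval / float(interval_secs)    # kPerSecond rate -> 104857.6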
def curie_error_to_http_status(error_code):
  """
  Returns an HTTP status code corresponding to the curie error code
  'error_code'.
  """
  CHECK(error_code in ERROR_CODE_HTTP_STATUS_MAP, msg=error_code)
  return ERROR_CODE_HTTP_STATUS_MAP[error_code]
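# Usage sketch (illustrative, not from the original module): map a curie error
# to an HTTP status for a response. The exact status returned depends on
# ERROR_CODE_HTTP_STATUS_MAP; CurieError.kInvalidParameter is an error code
# used elsewhere in this codebase.
http_status = curie_error_to_http_status(CurieError.kInvalidParameter)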
def initialize(self):
  "Initialize agent state."
  CHECK(not self.__initialized)
  # Create initial directory structure and empty settings file if needed.
  if not os.path.exists(FLAGS.curie_agent_dir):
    self.__setup()
  # Delete incomplete command directories.
  cmds_dir = self.cmds_dir()
  for name in os.listdir(cmds_dir):
    cmd_dir = os.path.join(cmds_dir, name)
    arg_path = os.path.join(cmd_dir, "arg.bin")
    status_path = os.path.join(cmd_dir, "status.bin")
    if not os.path.exists(arg_path) or not os.path.exists(status_path):
      log.warning("Deleting incomplete command directory %s", cmd_dir)
      shutil.rmtree(cmd_dir)
  # Delete old garbage and create an empty garbage directory.
  if os.path.exists(self.__garbage_dir):
    shutil.rmtree(self.__garbage_dir)
  os.mkdir(self.__garbage_dir)
  # Recover self.__cmd_map based on state on disk and running commands.
  self.__recover_cmd_map()
  # Mark agent as being initialized.
  self.__initialized = True
  # Start thread to perform periodic updates of command state and GC.
  self.__cmd_poller_thread = \
    threading.Thread(target=self.__cmd_poller_thread_func)
  self.__cmd_poller_thread.start()
def import_vm(self, goldimages_directory, goldimage_name, vm_name,
              node_id=None):
  """
  Creates a VM from the specified gold image. If 'node_id' is specified, the
  VM is created on that node, else a random node is selected. The VM will be
  created on the datastore associated with the curie server's settings for
  this cluster.
  """
  with self._open_vcenter_connection() as vcenter:
    vim_datacenter, vim_cluster, vim_datastore = \
      self._lookup_vim_objects(vcenter)
    vim_network = None
    if self._vcenter_info.HasField("vcenter_network_name"):
      for vim_datacenter_network in vim_datacenter.network:
        if (vim_datacenter_network.name ==
            self._vcenter_info.vcenter_network_name):
          vim_network = vim_datacenter_network
          break
      CHECK(vim_network is not None,
            msg=self._vcenter_info.vcenter_network_name)
    # We use the HostSystem (node) name as a node ID for vSphere clusters.
    host_name = node_id
    vim_vm = vcenter.import_vm(goldimages_directory, goldimage_name,
                               vim_datacenter, vim_cluster, vim_datastore,
                               vim_network, vm_name, host_name=host_name)
    return self.__vim_vm_to_curie_vm(vim_vm)
def check_cmd(self, cmd_id, desired_state=CmdStatus.kSucceeded):
  """Check if a command has reached a desired state.

  Args:
    cmd_id (str): ID of the command to check.
    desired_state (CmdStatus): Desired command state.

  Returns:
    CmdStatus protobuf on success, and None if the command is in progress.

  Raises:
    CurieTestException if the command has reached a terminal state that is
    different from the desired state.
  """
  rpc_client = AgentRpcClient(self._vm_ip)
  arg = charon_agent_interface_pb2.CmdStatusArg()
  arg.cmd_id = cmd_id
  ret, err = rpc_client.CmdStatus(arg)
  if ret.cmd_status.state == desired_state:
    # Command has reached the desired state. The desired state could be a
    # non-terminal state or a terminal state.
    cmd_status = ret.cmd_status
    CHECK(cmd_status)
    return cmd_status
  elif ret.cmd_status.state != CmdStatus.kRunning:
    # Command has reached a terminal state. If we're here, this implies that
    # the command's terminal state is not the desired state, because if it
    # was, then we would have already returned above.
    CHECK_NE(ret.cmd_status.state, desired_state)
    error_msg = ("Command %s terminal state %s != desired state %s" %
                 (cmd_id, CmdStatus.Type.Name(ret.cmd_status.state),
                  CmdStatus.Type.Name(desired_state)))
    raise CurieTestException(error_msg)
  return None
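# Usage sketch (illustrative): poll check_cmd until the command reaches a
# terminal state. 'vm' and 'cmd_id' are hypothetical; check_cmd returns None
# while the command is still running and raises CurieTestException if the
# command terminates in a state other than the desired one.
cmd_status = None
while cmd_status is None:
  cmd_status = vm.check_cmd(cmd_id, desired_state=CmdStatus.kSucceeded)
  if cmd_status is None:
    time.sleep(5)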
def snapshot_vms(self, vms, snapshot_names, snapshot_descriptions=(),
                 max_parallel_tasks=None):
  """
  For each VM 'vms[xx]' on the cluster, creates a snapshot named
  'snapshot_names[xx]' with optional description 'snapshot_descriptions[xx]'.

  Args:
    vms (list of CurieVMs): List of VMs to create snapshots for.
    snapshot_names (list of strings): Names for the snapshots; must be the
      same length as 'vms'.
    snapshot_descriptions (list of strings): List of descriptions for each
      snapshot corresponding to 'vms' and 'snapshot_names'. If provided, it
      must be the same length as 'vms'.
    max_parallel_tasks (int): The number of VMs to snapshot in parallel. The
      default value is FLAGS.vsphere_vcenter_max_parallel_tasks.
  """
  max_parallel_tasks = self._get_max_parallel_tasks(max_parallel_tasks)
  CHECK_EQ(len(vms), len(snapshot_names))
  CHECK(len(snapshot_descriptions) == 0 or
        len(snapshot_descriptions) == len(snapshot_names))
  with self._open_vcenter_connection() as vcenter:
    vim_cluster = self._lookup_vim_cluster(vcenter)
    vcenter.snapshot_vms(vim_cluster, [vm.vm_name() for vm in vms],
                         snapshot_names,
                         snapshot_descriptions=snapshot_descriptions,
                         max_parallel_tasks=max_parallel_tasks)
def __init__(self, cluster_metadata):
  super(VsphereCluster, self).__init__(cluster_metadata)
  CHECK(cluster_metadata.cluster_hypervisor_info.HasField("esx_info"))
  CHECK(cluster_metadata.cluster_management_server_info.HasField(
    "vcenter_info"))
  # vCenter information for the vCenter server that manages this cluster.
  self._vcenter_info = \
    self._metadata.cluster_management_server_info.vcenter_info
  # Name of the datacenter containing the cluster to run on.
  self.__datacenter_name = self._vcenter_info.vcenter_datacenter_name
  # Name of the cluster to run on.
  self.__cluster_name = self._vcenter_info.vcenter_cluster_name
  # Name of the datastore on which to deploy test VMs for the cluster.
  self.__datastore_name = self._vcenter_info.vcenter_datastore_name
def run(self):
  "Run the Flask server forever."
  CHECK(self.__initialized)
  if self.__agent_uid != 0:
    log.warning("curie_unix_agent not running as root")
  log.info("Running agent on TCP port %d", FLAGS.curie_agent_port)
  self.__app.run(debug=FLAGS.curie_agent_flask_debug_enabled,
                 host="0.0.0.0",
                 port=FLAGS.curie_agent_port,
                 threaded=True)
def _initialize(self, root_tree, xpath):
  """
  Updates root tree and element xpath. Handles updates as appropriate after
  resolving changes to the tree due to class inheritance.
  """
  CHECK(not self.__is_initialized, "Cannot reinitialize this descriptor")
  self.__is_initialized = True
  self.__root_tree = root_tree
  self.__xpath = xpath
def cmd_execute_sync(self, arg, timeout_secs, include_output=False):
  """
  Given the CmdExecute request 'arg', simulate synchronous execution by
  sending a CmdExecute RPC to start the command, polling until the command
  reaches a terminal state, then returning an (exit_status, stdout, stderr)
  tuple for the command. 'include_output' specifies whether stdout and stderr
  should be returned or not (both are None if this is set to False).
  """
  curie_ex = None
  try:
    t1 = time.time()
    # Send a CmdExecute RPC to start the command.
    execute_ret, execute_err = self.CmdExecute(arg)
    if execute_err is not None:
      raise CurieException(execute_err.error_codes[0],
                           execute_err.error_msgs[0])
    status_arg = charon_agent_interface_pb2.CmdStatusArg()
    status_arg.cmd_id = arg.cmd_id
    status_arg.include_output = include_output
    status_ret = None
    # Poll until the command reaches a terminal state or it times out.
    while True:
      status_ret, status_err = self.CmdStatus(status_arg)
      if status_err is not None:
        raise CurieException(status_err.error_codes[0],
                             status_err.error_msgs[0])
      if (status_ret.cmd_status.state !=
          charon_agent_interface_pb2.CmdStatus.kRunning):
        # Command is in a terminal state.
        break
      t2 = time.time()
      if (t2 - t1) > timeout_secs:
        raise CurieException(CurieError.kTimeout,
                             "Timeout waiting for command %s" % arg.cmd_id)
      time.sleep(1)
    # Check that we have the exit status for the command.
    CHECK(status_ret is not None)
    if not status_ret.cmd_status.HasField("exit_status"):
      raise CurieException(CurieError.kInternalError,
                           "Missing exit status for command %s" % arg.cmd_id)
    # Return an (exit_status, stdout, stderr) tuple for the command.
    exit_status = status_ret.cmd_status.exit_status
    stdout = status_ret.stdout if status_ret.HasField("stdout") else None
    stderr = status_ret.stderr if status_ret.HasField("stderr") else None
    return (exit_status, stdout, stderr)
  except CurieException as ex:
    curie_ex = ex
    raise
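# Usage sketch (illustrative): run a command synchronously through the agent
# RPC client. The request type name 'CmdExecuteArg' is an assumption; only
# the 'cmd_id' field is shown in the code above.
rpc_client = AgentRpcClient(vm_ip)  # 'vm_ip' is hypothetical
arg = charon_agent_interface_pb2.CmdExecuteArg()  # type name is an assumption
arg.cmd_id = "example_cmd_1"
exit_status, stdout, stderr = rpc_client.cmd_execute_sync(
  arg, timeout_secs=60, include_output=True)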
def __init__(self, node, rest_api_timeout_secs=60):
  # Full cluster metadata proto.
  self.__cluster_metadata = node.cluster().metadata()
  # Node object for which this util is used.
  self.__node = node
  software_info = self.__cluster_metadata.cluster_software_info
  CHECK(software_info.HasField("nutanix_info"))
  # NutanixRestApiClient instance to use.
  self.__api = NutanixRestApiClient.from_proto(
    software_info.nutanix_info, timeout_secs=rest_api_timeout_secs)
def __maybe_finalize_cmd_state(self, cmd_state, exit_status=None):
  """
  Finalize the command state in memory/disk if the command isn't already in
  a terminal state.

  An 'exit_status' value of -1 indicates the command was stopped. An
  'exit_status' value of -2 indicates that the command had a non-normal exit
  (e.g. was terminated by a signal), which we just classify as failed.

  Assumes self.__lock is held.
  """
  status_path = os.path.join(self.cmd_dir(cmd_state.cmd_id), "status.bin")
  cmd_status = CmdStatus()
  cmd_status.ParseFromString(open(status_path).read())
  if cmd_status.state != CmdStatus.kRunning:
    # Command may have already been stopped.
    return
  log.info("Finalizing status for command %s: exit_status %s",
           cmd_state.cmd_id, exit_status)
  CHECK(cmd_status.HasField("pid"), msg=cmd_state.cmd_id)
  CHECK(not cmd_status.HasField("exit_status"), msg=cmd_state.cmd_id)
  if exit_status is not None:
    if exit_status == -1:
      cmd_status.state = CmdStatus.kStopped
    elif exit_status == 0:
      cmd_status.state = CmdStatus.kSucceeded
    else:
      cmd_status.state = CmdStatus.kFailed
    cmd_status.exit_status = exit_status
  else:
    cmd_status.state = CmdStatus.kUnknown
  cmd_status.ClearField("pid")
  status_data = cmd_status.SerializeToString()
  OsUtil.write_and_rename(status_path, status_data)
  cmd_state.proc = None
  cmd_state.pid = None
def fetch_cmd_status(self, cmd_id):
  """Fetch the status for 'cmd_id'.

  Returns:
    (ret, err) where:
      ret: CmdStatusRet filled out by the RPC call to the agent.
      err: None if no error, or CurieError if there was an error.

  Raises:
    CurieException may be raised by rpc_client.CmdStatus().
  """
  arg = charon_agent_interface_pb2.CmdStatusArg()
  arg.cmd_id = cmd_id
  ret, err = self.CmdStatus(arg)
  CHECK(ret.cmd_status)
  return ret, err
def _set_encryption_key(cls, key):
  """
  Associates 'key' with this message class.

  To prevent data loss, the key may currently be initialized only once.

  Args:
    key (str): Encryption key to use, assumed to be a SHA-256 digest.

  Raises:
    FATAL on attempting to overwrite an existing key with a new value.
  """
  curr_key = getattr(_get_top_level_type_desc(cls), "_ENCRYPTION_KEY", None)
  if curr_key and curr_key == key:
    return
  CHECK(curr_key is None,
        msg="Encryption key has already been set %s %s" % (curr_key, key))
  setattr(_get_top_level_type_desc(cls), "_ENCRYPTION_KEY", key)
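# Usage sketch (illustrative): derive a SHA-256 digest from a passphrase and
# associate it with a hypothetical message class that exposes
# _set_encryption_key. Re-setting the same key is a no-op; a different key
# trips the CHECK above.
import hashlib
key = hashlib.sha256(b"example-passphrase").hexdigest()
SomeEncryptedMessage._set_encryption_key(key)  # class name is hypothetical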
def __init__(self, cluster, node_id, node_index):
  # Cluster this node is part of.
  self._cluster = cluster
  # Node ID. This node ID need not be unique across all nodes managed by a
  # given management server. However, it must be unique within the set of
  # nodes for the cluster.
  self._node_id = node_id
  CHECK(self._node_id)
  # Node index (corresponds to the index of this node within the node
  # metadata vector).
  self._node_index = node_index
  CHECK_GE(self._node_index, 0, "Node index must be a non-negative integer")
  # IP address currently associated with this node.
  self._node_ip = None
  self.__node_util = get_node_management_util(cluster.metadata())(self)
  self.power_management_util = get_power_management_util(
    self.metadata().node_out_of_band_management_info)
def check_nodes_ready(self, nodes, sync_with_oob=True):
  """
  Performs minimal checks to see if nodes are in a functioning state.

  Specifically:
  -- All nodes in 'nodes' report ready via their 'is_ready' method.
  -- Subclasses may extend this method to provide additional cluster-level
     checks specific to a given cluster type.

  Args:
    nodes (list<Node>): List of nodes to check.
    sync_with_oob (bool): Optional. If True, ensure host power states are
      consistent as reported by OOB and management software prior to
      performing subsequent checks.

  Returns:
    (bool) True if all nodes are ready, else False.
  """
  nodes_not_in_cluster = set(nodes).difference(self.nodes())
  CHECK(len(nodes_not_in_cluster) == 0,
        "%s are not members of %s" %
        ([node.node_id() for node in nodes_not_in_cluster], self._name))
  return len(self.get_unready_nodes(
    nodes=nodes, sync_with_oob=sync_with_oob)) == 0
def update_metadata(self, include_reporting_fields):
  cluster_json = self.__lookup_cluster_json()
  self._node_id_metadata_map = {
    node.id: node for node in self._metadata.cluster_nodes}
  node_uuid_metadata_id_map = self.get_node_uuid_metadata_id_map()
  for node_json in self._prism_client.hosts_get().get("entities", []):
    if node_json["clusterUuid"] != cluster_json["clusterUuid"]:
      continue
    try:
      curr_node_identifier = node_uuid_metadata_id_map[node_json["uuid"]]
    except KeyError:
      # If the node is missing in the metadata, skip it.
      continue
    node_proto = self._node_id_metadata_map.get(curr_node_identifier)
    CHECK(node_proto)
    node_proto.id = node_json["uuid"]
    if include_reporting_fields:
      node_hw = node_proto.node_hardware
      node_hw.num_cpu_packages = node_json["numCpuSockets"]
      node_hw.num_cpu_cores = node_json["numCpuCores"]
      node_hw.num_cpu_threads = node_json["numCpuThreads"]
      node_hw.cpu_hz = node_json["cpuFrequencyInHz"]
      node_hw.memory_size = node_json["memoryCapacityInBytes"]
  if include_reporting_fields:
    # TODO (jklein): AHV info per-node.
    cluster_software_info = self._metadata.cluster_software_info
    nutanix_version = self._prism_client.get_nutanix_metadata().version
    if nutanix_version is not None:
      cluster_software_info.nutanix_info.version = nutanix_version
def prereq_runtime_vm_storage_is_ready(cluster):
  """
  Confirms that curie test VM storage on each node in 'cluster' is available.

  Raises:
    CurieTestException if curie test VM storage is unavailable on any node.
  """
  metadata = cluster.metadata()
  if metadata.cluster_hypervisor_info.HasField("esx_info"):
    num_nodes = len(metadata.cluster_nodes)
    CHECK(metadata.cluster_management_server_info.HasField("vcenter_info"))
    vcenter_info = metadata.cluster_management_server_info.vcenter_info
    datastore_name = vcenter_info.vcenter_datastore_name
    # Check that the datastore is visible on all nodes in vCenter.
    log.info("Checking that datastore %s is visible on all %s nodes in "
             "vCenter", datastore_name, cluster.name())
    if not cluster.datastore_visible(datastore_name):
      raise CurieTestException("Datastore %s not visible on all %s nodes "
                               "in vCenter" %
                               (datastore_name, cluster.name()))
    log.info("Datastore %s is visible on all %s nodes in vCenter",
             datastore_name, cluster.name())
    cluster_software_info = metadata.cluster_software_info
    if cluster_software_info.HasField("nutanix_info"):
      # On a Nutanix cluster, check that the datastore is also visible on
      # all nodes in Prism.
      log.info("Checking that datastore %s is visible by Prism on all %s "
               "nodes", datastore_name, cluster.name())
      client = NutanixRestApiClient.from_proto(
        cluster_software_info.nutanix_info)
      host_id_datastore_map = {}
      for item in client.datastores_get():
        host_id_datastore_map.setdefault(item["hostId"], set())
        host_id_datastore_map[item["hostId"]].add(item["datastoreName"])
      CHECK_LE(len(host_id_datastore_map), num_nodes)
      for host_id in host_id_datastore_map:
        if datastore_name not in host_id_datastore_map[host_id]:
          raise CurieTestException(
            "Datastore %s not visible by Prism on %s node %s" %
            (datastore_name, cluster.name(), host_id))
      log.info("Datastore %s is visible by Prism on all %s nodes",
               datastore_name, cluster.name())
    elif cluster_software_info.HasField("vsan_info"):
      pass
    elif cluster_software_info.HasField("generic_info"):
      pass
    else:
      raise ValueError("Unknown cluster software info, metadata %s" %
                       metadata)
  elif metadata.cluster_hypervisor_info.HasField("hyperv_info"):
    # TODO (bferlic): More thorough checking here?
    return True
  elif metadata.cluster_hypervisor_info.HasField("ahv_info"):
    # TODO (jklein): More thorough checking here?
    return True
  else:
    raise ValueError("Unknown hypervisor type, metadata %s" % metadata)
def prereq_runtime_vm_storage_is_ready_fix(cluster):
  """
  Attempt to make curie test VM storage available on all nodes.

  Raises:
    CurieTestException on error or timeout.
  """
  metadata = cluster.metadata()
  if metadata.cluster_hypervisor_info.HasField("esx_info"):
    CHECK(metadata.cluster_management_server_info.HasField("vcenter_info"))
    vcenter_info = metadata.cluster_management_server_info.vcenter_info
    datastore_name = vcenter_info.vcenter_datastore_name

    def datastore_visible():
      try:
        ScenarioUtil.prereq_runtime_vm_storage_is_ready(cluster)
        return True
      except CurieTestException:
        pass

    msg = "datastore %s visible on all %s nodes" % \
      (datastore_name, cluster.name())
    # Refresh datastores state on all nodes to try and make the datastore
    # visible from vCenter's perspective.
    log.info("Refreshing datastores on all %s nodes", cluster.name())
    cluster.refresh_datastores()
    if CurieUtil.wait_for(datastore_visible, msg, 60):
      return
    cluster_software_info = metadata.cluster_software_info
    if cluster_software_info.HasField("nutanix_info"):
      client = NutanixRestApiClient.from_proto(
        cluster_software_info.nutanix_info)
      container_name = None
      for item in client.datastores_get():
        if item["datastoreName"] == datastore_name:
          container_name = item["containerName"]
          break
      if container_name is None:
        log.warning("Datastore %s not mounted on any %s nodes, assuming "
                    "container name is the same as the desired datastore "
                    "name", datastore_name, cluster.name())
        # Assume that the desired datastore has the same name as an existing
        # container name.
        container_name = datastore_name
      # Remount the datastore to try and make the datastore visible.
      log.info("Unmounting and mounting datastore %s (container %s) on %s",
               datastore_name, container_name, cluster.name())
      try:
        client.datastores_delete(datastore_name, verify=True)
      except CurieException as ex:
        if ex.error_code != CurieError.kInvalidParameter:
          raise
        # If Prism views the datastore as unmounted, kInvalidParameter is
        # returned, so continue to try and mount the datastore on all nodes.
      client.datastores_create(container_name,
                               datastore_name=datastore_name)
      cluster.refresh_datastores()
      if not CurieUtil.wait_for(datastore_visible, msg, 60):
        raise CurieTestException("Timeout waiting for datastore %s for "
                                 "VM storage to become visible on %s" %
                                 (datastore_name, cluster.name()))
    elif cluster_software_info.HasField("vsan_info"):
      raise CurieTestException("VSAN datastore %s not mounted on all %s "
                               "nodes" % (datastore_name, cluster.name()))
    elif cluster_software_info.HasField("generic_info"):
      raise CurieTestException("Datastore %s not mounted on all %s nodes" %
                               (datastore_name, cluster.name()))
    else:
      raise ValueError("Unknown cluster software info, metadata %s" %
                       metadata)
def __init__(self, name=__name__):
  CHECK(FLAGS.curie_agent_dir, "--curie_agent_dir must be set")
  # If requested, monkey-patch werkzeug to suppress Flask logspam.
  if FLAGS.curie_agent_flask_suppress_werkzeug_logs:
    log.debug("--curie_agent_flask_suppress_werkzeug_logs is set. "
              "Monkey-patching 'werkzeug._internal._log' to suppress "
              "unwanted output")
    CurieUtil.monkey_patch_werkzeug_logger()
  # Flask application.
  self.__app = flask.Flask(name)
  # RPC server.
  self.__rpc_server = RpcServer(charon_agent_interface_pb2.CurieAgentRpcSvc)
  # UID of the user running the agent.
  self.__agent_uid = os.getuid()
  # Garbage directory.
  self.__garbage_dir = os.path.join(FLAGS.curie_agent_dir, "garbage")
  # Whether we're done with initialization or not.
  self.__initialized = False
  # Thread that periodically updates the status of commands in
  # self.__cmd_map.
  self.__cmd_poller_thread = None
  # Thread that periodically garbage collects state for commands that can be
  # removed.
  self.__cmd_gc_thread = None
  # Lock that protects the fields below and all non stdout/stderr state for
  # all commands on disk.
  self.__lock = threading.Lock()
  # Mapping from command ID to command state.
  self.__cmd_map = {}

  #--------------------------------------------------------------------------
  #
  # Handlers for web pages.
  #
  #--------------------------------------------------------------------------

  @self.__app.route("/")
  def web_index():
    return ""

  #--------------------------------------------------------------------------
  #
  # RPC endpoint.
  #
  #--------------------------------------------------------------------------

  @self.__app.route("/rpc", methods=["POST"])
  @self.__rpc_server.endpoint
  def rpc_endpoint():
    return flask.request

  #--------------------------------------------------------------------------
  #
  # RPC handlers.
  #
  #--------------------------------------------------------------------------

  @self.__rpc_server.handler("CmdExecute")
  @curie_unix_agent_api_handler
  def api_cmd_execute(arg):
    return self.__api_cmd_execute(arg)

  @self.__rpc_server.handler("CmdStatus")
  @curie_unix_agent_api_handler
  def api_cmd_status(arg):
    return self.__api_cmd_status(arg)

  @self.__rpc_server.handler("CmdStop")
  @curie_unix_agent_api_handler
  def api_cmd_stop(arg):
    return self.__api_cmd_stop(arg)

  @self.__rpc_server.handler("CmdRemove")
  @curie_unix_agent_api_handler
  def api_cmd_remove(arg):
    return self.__api_cmd_remove(arg)

  @self.__rpc_server.handler("CmdList")
  @curie_unix_agent_api_handler
  def api_cmd_list(arg):
    return self.__api_cmd_list(arg)

  @self.__rpc_server.handler("FileGet")
  @curie_unix_agent_api_handler
  def api_file_get(arg):
    return self.__api_file_get(arg)