def NodeAdd(node, readd=False, group=None):
  """Adds a node to the cluster using "gnt-node add".

  @param node: node object; its C{added}, C{primary} and C{secondary}
    attributes are read, and C{MarkAdded} is called after a first-time add
  @type readd: bool
  @param readd: whether to pass C{--readd}; the node must then already be
    marked as added, otherwise it must not be
  @param group: if not None, passed to C{--node-group}
  @raise qa_error.Error: if the node's added state contradicts C{readd}

  """
  # Precondition: a fresh add needs an unknown node, a re-add a known one
  if readd:
    if not node.added:
      raise qa_error.Error("Node %s not yet in cluster" % node.primary)
  elif node.added:
    raise qa_error.Error("Node %s already in cluster" % node.primary)

  args = ["gnt-node", "add", "--no-ssh-key-check"]
  if node.secondary:
    args.append("--secondary-ip=%s" % node.secondary)
  if readd:
    args.append("--readd")
  if group is not None:
    args += ["--node-group", group]
  if not qa_config.GetModifySshSetup():
    args.append("--no-node-setup")
  args.append(node.primary)

  AssertCommand(args)

  if readd:
    # A re-added node triggers a cluster verification run
    AssertRedirectedCommand(["gnt-cluster", "verify"])
    assert node.added
  else:
    node.MarkAdded()
def _RetrieveTerminationInfo(job_id):
  """Retrieves the termination info from a job caused by gnt-debug delay.

  Only the execution log of the first opcode of the job is inspected.

  @param job_id: id of the job to query with "gnt-job info"
  @rtype: dict or None
  @return: The termination log entry, or None if no entry was found
  @raise qa_error.Error: if the job reports no opcodes, or if more than one
    matching log entry is present

  """
  job_info = GetObjectInfo(["gnt-job", "info", str(job_id)])

  opcodes = job_info[0]["Opcodes"]
  if not opcodes:
    raise qa_error.Error("Cannot retrieve a list of opcodes")

  execution_logs = opcodes[0]["Execution log"]
  if not execution_logs:
    return None

  # Keep only the entries whose content is tagged as delay test information.
  # The test is written inline: the previous code bound the predicate to one
  # name and called it under another, which raised NameError.
  filtered_logs = [entry for entry in execution_logs
                   if entry["Content"][1] == constants.ELOG_DELAY_TEST]

  no_logs = len(filtered_logs)
  if no_logs > 1:
    raise qa_error.Error(
      "Too many interruption information entries found!")
  elif no_logs == 1:
    return filtered_logs[0]
  else:
    return None
def IsExclusiveStorageInstanceTestEnabled():
  """Tells whether the exclusive-storage instance tests can be run.

  When the test is enabled in the QA configuration, every configured node is
  asked (via "vgs") for the number of PVs in the configured volume group.

  @rtype: bool
  @return: False if the test is disabled, True if it is enabled and all
    nodes passed the PV check
  @raise qa_error.Error: if the PV count of a node cannot be obtained or is
    lower than two

  """
  test_name = "exclusive-storage-instance-tests"
  if not qa_config.TestEnabled(test_name):
    return False

  vgname = qa_config.get("vg-name", constants.DEFAULT_VG)
  vgscmd = utils.ShellQuoteArgs(
    ["vgs", "--noheadings", "-o", "pv_count", vgname])

  for node in qa_config.GetConfig()["nodes"]:
    try:
      pvnum = int(qa_utils.GetCommandOutput(node.primary, vgscmd))
    except Exception as e:
      raise qa_error.Error(
        "Cannot get the number of PVs on %s, needed by '%s': %s" %
        (node.primary, test_name, e))

    if pvnum < 2:
      raise qa_error.Error(
        "Node %s has not enough PVs (%s) to run '%s'" %
        (node.primary, pvnum, test_name))

  return True
def _AssertRetCode(rcode, fail, cmdstr, nodename):
  """Compares a command's exit code with the expected outcome.

  @param rcode: exit code returned by the command
  @param fail: whether the command was expected to fail
  @param cmdstr: command string, used only in the error message
  @param nodename: node name, used only in the error message
  @raise qa_error.Error: if the command succeeded although failure was
    expected, or failed although success was expected

  """
  if fail:
    if rcode == 0:
      raise qa_error.Error("Command '%s' on node %s was expected to fail but"
                           " didn't" % (cmdstr, nodename))
  elif rcode != 0:
    raise qa_error.Error("Command '%s' on node %s failed, exit code %s" %
                         (cmdstr, nodename, rcode))
def TestJobCancellation():
  """gnt-job cancel"""
  # Two delay jobs are submitted back to back. The first one has to last long
  # enough for the second submission and the cancellation command to complete
  # before it is done; the second one is kept short so that not too much time
  # is spent waiting should the cancel fail and the job actually run.
  FIRST_COMMAND_DELAY = 10.0
  SECOND_COMMAND_DELAY = 3.0

  AssertCommand(["gnt-debug", "delay", "--submit", str(FIRST_COMMAND_DELAY)])

  master = qa_config.GetMasterNode()

  # Forcing tty usage does not work on buildbot, so all output of this
  # command is redirected to stdout
  submit_cmd = "gnt-debug delay --submit %s 2>&1" % SECOND_COMMAND_DELAY
  job_id_output = GetCommandOutput(master.primary, submit_cmd)

  possible_job_ids = re.findall("JobID: ([0-9]+)", job_id_output)
  if len(possible_job_ids) != 1:
    raise qa_error.Error(
      "Cannot parse gnt-debug delay output to find job id")
  job_id = possible_job_ids[0]

  AssertCommand(["gnt-job", "cancel", job_id])

  # Watching the second job is expected to fail because it was cancelled
  AssertCommand(["gnt-job", "watch", job_id], fail=True)

  job_status = qa_job_utils.GetJobStatus(job_id)
  if job_status == constants.JOB_STATUS_CANCELED:
    return

  # The job may still be in the process of being cancelled; poll until the
  # status changes or the timeout is hit
  if job_status == constants.JOB_STATUS_CANCELING:
    # The multiplier is arbitrary, setting it higher could prevent flakiness
    WAIT_MULTIPLIER = 4.0
    retry_fn = functools.partial(qa_job_utils.RetryingWhileJobStatus,
                                 constants.JOB_STATUS_CANCELING, job_id)
    try:
      job_status = retry.Retry(retry_fn, 2.0,
                               WAIT_MULTIPLIER * FIRST_COMMAND_DELAY)
    except retry.RetryTimeout:
      # On timeout the previously read status is kept
      pass

  if job_status != constants.JOB_STATUS_CANCELED:
    raise qa_error.Error("Job was not successfully cancelled, status "
                         "found: %s" % job_status)
def Validate(self):
  """Checks the loaded configuration for required and consistent settings.

  Verified, in this order: cluster name, nodes and instances are present;
  disk options exist and every item has a size and a growth; the instance
  check script (if configured) can be stat'ed; at least one hypervisor is
  enabled and all enabled ones are known; the two virtual cluster options
  are set together and the base directory is a normalized absolute path.

  @raise qa_error.Error: on the first check that fails

  """
  required = (
    ("name", "Cluster name is required"),
    ("nodes", "Need at least one node"),
    ("instances", "Need at least one instance"),
    )
  for (key, message) in required:
    if not self.get(key):
      raise qa_error.Error(message)

  disks = self.GetDiskOptions()
  if disks is None:
    raise qa_error.Error("Config option 'disks' must exist")
  if any(d.get("size") is None or d.get("growth") is None for d in disks):
    raise qa_error.Error(
      "Config options `size` and `growth` must exist"
      " for all `disks` items")

  check = self.GetInstanceCheckScript()
  if check:
    try:
      os.stat(check)
    except EnvironmentError as err:
      raise qa_error.Error(
        "Can't find instance check script '%s': %s" % (check, err))

  enabled_hv = frozenset(self.GetEnabledHypervisors())
  if not enabled_hv:
    raise qa_error.Error("No hypervisor is enabled")

  unknown_hv = enabled_hv - constants.HYPER_TYPES
  if unknown_hv:
    raise qa_error.Error("Unknown hypervisor(s) enabled: %s" %
                         utils.CommaJoin(unknown_hv))

  (vc_master, vc_basedir) = self.GetVclusterSettings()

  # Exactly one of the two options being set is a configuration error
  if bool(vc_master) != bool(vc_basedir):
    raise qa_error.Error(
      "All or none of the config options '%s' and '%s'"
      " must be set" % (_VCLUSTER_MASTER_KEY, _VCLUSTER_BASEDIR_KEY))

  if vc_basedir and not utils.IsNormAbsPath(vc_basedir):
    raise qa_error.Error(
      "Path given in option '%s' must be absolute and"
      " normalized" % _VCLUSTER_BASEDIR_KEY)
def GetDiskOptions(self):
  """Returns the disk options for the instances.

  The 'disks' entry of the configuration data is returned when present.
  Otherwise the list is assembled from the legacy 'disk' and 'disk-growth'
  entries, pairing them up item by item.

  @return: the 'disks' value, or a list of dicts with "size" and "growth"
    keys built from the legacy entries, or None if neither is available
  @raise qa_error.Error: if only one legacy entry is set or their lengths
    differ

  """
  try:
    return self._data["disks"]
  except KeyError:
    pass

  # Legacy interface
  sizes = self._data.get("disk")
  growths = self._data.get("disk-growth")
  if not (sizes or growths):
    return None

  mismatch = (sizes is None or growths is None or
              len(sizes) != len(growths))
  if mismatch:
    raise qa_error.Error(
      "Config options 'disk' and 'disk-growth' must"
      " exist and have the same number of items")

  return [{"size": size, "growth": growth}
          for (size, growth) in zip(sizes, growths)]
def Load(cls, filename):
  """Builds a validated configuration object from a JSON file.

  @type filename: string
  @param filename: Path to configuration file
  @rtype: L{_QaConfig}
  @return: instance of C{cls} on which C{Validate} has been called
  @raise qa_error.Error: if a non-empty patch is present but the
    'jsonpatch' module cannot be imported

  """
  data = serializer.LoadJson(utils.ReadFile(filename))

  # Patch the document using JSON Patch (RFC6902) in file _PATCH_JSON, if
  # available
  try:
    patches = _QaConfig.LoadPatches()
    # The patching module is only imported when some patch has content
    if any(patches.values()):
      jsonpatch_mod = __import__("jsonpatch", fromlist=[])
      _QaConfig.ApplyPatches(data, jsonpatch_mod, patches)
  except IOError:
    # An I/O error here means the configuration is used unpatched
    pass
  except ImportError:
    raise qa_error.Error(
      "For the QA JSON patching feature to work, you "
      "need to install Python modules 'jsonpatch' and "
      "'jsonpointer'.")

  converted = dict(_ConvertResources(item) for item in data.items())
  result = cls(converted) # pylint: disable=E1103
  result.Validate()

  return result
def _StartDelayFunction(locks, timeout):
  """Submits an interruptible "gnt-debug delay" job and finds its socket.

  @param locks: mapping of locking level to names; only the entries under
    L{locking.LEVEL_NODE} are used, each passed as a C{-n} option
  @param timeout: delay duration, appended to the command as a string
  @return: the socket path taken from the job's termination log entry
  @raise qa_error.Error: if no termination log entry could be retrieved

  """
  # The interruptible switch must be used
  cmd = ["gnt-debug", "delay", "-i", "--submit", "--no-master"]
  cmd.extend("-n%s" % node for node in locks.get(locking.LEVEL_NODE, []))
  cmd.append(str(timeout))

  job_id = ExecuteJobProducingCommand(cmd)

  # Retry until the lookup yields a non-empty result
  log_entry = retry.SimpleRetry(lambda x: x, _RetrieveTerminationInfo, 2.0,
                                10.0, args=[job_id])
  if not log_entry:
    raise qa_error.Error(
      "Failure when trying to retrieve delay termination "
      "information")

  # The content is a triple whose last item is a one-element sequence
  (_, _, (socket_path, )) = log_entry["Content"]

  return socket_path
def _GetBlockingLocks():
  """Finds out which locks are blocking jobs by invoking "gnt-debug locks".

  @rtype: list of string
  @return: The names of the locks currently blocking any job.
  @raise qa_error.Error: if an output line does not consist of exactly four
    whitespace-separated fields

  """
  # Due to mysterious issues when a SSH multiplexer is being used by two
  # threads, we turn it off, and block most of the logging to improve the
  # visibility of the other thread's output
  locks_output = GetOutputFromMaster("gnt-debug locks", use_multiplexer=False,
                                     log_cmd=False)

  blocking = []
  # The first line is the header, which we do not need
  for line in locks_output.splitlines()[1:]:
    fields = line.split()
    if len(fields) != 4:
      raise qa_error.Error("Error while parsing gnt-debug locks output, "
                           "line at fault is: %s" % line)

    # First field is the lock name, last one the pending jobs ('-' if none)
    if fields[3] != '-':
      blocking.append(fields[0])

  return blocking
def TestInstanceConsecutiveFailures(instance):
  """Test five consecutive instance failures.

  The instance is shut down and the watcher run six times in a row. After
  each of the first five rounds the instance must be running again, after
  the sixth it must not be.

  @param instance: instance object whose C{name} is resolved and used
  @raise qa_error.Error: if the instance state after a round does not match
    the expectation

  """
  inst_name = qa_utils.ResolveInstanceName(instance.name)
  inst_was_running = bool(_InstanceRunning(inst_name))

  _ResetWatcherDaemon()

  expectations = [True] * 5 + [False]
  for should_start in expectations:
    _ShutdownInstance(inst_name)
    RunWatcherDaemon()
    time.sleep(5)

    if bool(_InstanceRunning(inst_name)) == should_start:
      continue

    raise qa_error.Error("Instance not started when it should"
                         if should_start
                         else "Instance started when it shouldn't")

  AssertCommand(["gnt-instance", "info", inst_name])

  # Bring the instance back up if it was running at the beginning
  if inst_was_running:
    _StartInstance(inst_name)
def GetGenericAddParameters(inst, disk_template, force_mac=None):
  """Builds the command line parameters shared by instance creations.

  @param inst: instance object, asked for the MAC address of NIC 0 unless
    C{force_mac} is given
  @param disk_template: disk template; unless it is
    L{constants.DT_DISKLESS}, one C{--disk} option is emitted per configured
    disk
  @param force_mac: if set, used as the MAC address of NIC 0
  @rtype: list of string
  @return: the parameter list
  @raise qa_error.Error: if spindles are supported but a disk has no
    'spindles' value

  """
  memory = "%s=%s,%s=%s" % (constants.BE_MINMEM,
                            qa_config.get(constants.BE_MINMEM),
                            constants.BE_MAXMEM,
                            qa_config.get(constants.BE_MAXMEM))
  params = ["-B", memory]

  if disk_template != constants.DT_DISKLESS:
    for (idx, disk) in enumerate(qa_config.GetDiskOptions()):
      size = disk.get("size")
      name = disk.get("name")

      parts = ["%s:size=%s" % (idx, size)]
      if name:
        parts.append("name=%s" % name)
      if qa_config.AreSpindlesSupported():
        spindles = disk.get("spindles")
        if spindles is None:
          raise qa_error.Error("'spindles' is a required parameter for disks"
                               " when you enable exclusive storage tests")
        parts.append("spindles=%s" % spindles)

      params += ["--disk", ",".join(parts)]

  # Set static MAC address if configured
  nic0_mac = force_mac or inst.GetNicMacAddr(0, None)
  if nic0_mac:
    params += ["--net", "0:mac=%s" % nic0_mac]

  return params
def _RetrieveSecret(instance, pnode):
  """Retrieves the DRBD secret given an instance object and the primary node.

  The DRBD minor of the first disk on the primary node is looked up in the
  instance info, then "drbdsetup show" is run on that node and its output
  searched with L{_DRBD_SECRET_RE}.

  @type instance: L{qa_config._QaInstance}
  @type pnode: L{qa_config._QaNode}
  @rtype: string
  @raise qa_error.Error: if the pattern does not match the command output

  """
  instance_info = GetInstanceInfo(instance.name)

  # We are interested in only the first disk on the primary
  minor = instance_info["drbd-minors"][pnode.primary][0]

  # Both argument orders are tried so that this works for all DRBD versions
  drbd_command = "drbdsetup show %d; drbdsetup %d show || true" % (minor,
                                                                   minor)
  drbd_output = qa_utils.GetCommandOutput(pnode.primary, drbd_command)

  match_obj = _DRBD_SECRET_RE.search(drbd_output)
  if match_obj is not None:
    return match_obj.groups(0)[0]

  raise qa_error.Error(
    "Could not retrieve DRBD secret for instance %s from"
    " node %s." % (instance.name, pnode.primary))
def _RaiseWithInfo(msg, error_desc):
  """Raises a QA error, prefixing the description with a message if given.

  @param msg: optional message; when true-ish, the error text is
    "<msg>: <error_desc>"
  @param error_desc: description of the error
  @raise qa_error.Error: always

  """
  output = ("%s: %s" % (msg, error_desc)) if msg else error_desc
  raise qa_error.Error(output)
def TestRapiStoppedInstanceConsole(instance):
  """Test getting stopped instance's console information via RAPI

  The console request is expected to fail with a RAPI error carrying HTTP
  code 503.

  @param instance: instance object whose C{name} is queried
  @raise qa_error.Error: if the request does not raise a RAPI error

  """
  try:
    _rapi_client.GetInstanceConsole(instance.name)
  except rapi.client.GanetiApiError as err:
    AssertEqual(err.code, 503)
    return

  # Reaching this point means the call unexpectedly succeeded
  raise qa_error.Error("Getting console for stopped instance didn't"
                       " return HTTP 503")
def TestRapiInstanceMultiAlloc(node):
  """Test adding two new instances via the RAPI instance-multi-alloc method

  @param node: node passed on to the allocation dict generator
  @return: tuple of the two acquired instances, or None if the plain disk
    template is not supported
  @raise qa_error.Error: if the job result lacks the job list, lists a
    different number of jobs than instances, or reports a failed creation

  """
  if not qa_config.IsTemplateSupported(constants.DT_PLAIN):
    return

  JOBS_KEY = "jobs"

  instance_one = qa_config.AcquireInstance()
  instance_two = qa_config.AcquireInstance()
  instance_list = [instance_one, instance_two]

  try:
    alloc_dicts = [_GenInstanceAllocationDict(node, inst)
                   for inst in instance_list]
    multi_job_id = _rapi_client.InstancesMultiAlloc(alloc_dicts)

    # The multi-alloc job is expected to yield exactly one result
    (results, ) = _WaitForRapiJob(multi_job_id)

    if JOBS_KEY not in results:
      raise qa_error.Error("RAPI instance-multi-alloc did not deliver "
                           "information about created jobs")

    if len(results[JOBS_KEY]) != len(instance_list):
      raise qa_error.Error(
        "RAPI instance-multi-alloc failed to return the "
        "desired number of jobs!")

    for (success, job) in results[JOBS_KEY]:
      if not success:
        raise qa_error.Error("Failed to create instance in "
                             "instance-multi-alloc call")
      _WaitForRapiJob(job)
  except:
    # Note that although released, it may be that some of the instance creations
    # have in fact succeeded. Handling this in a better way may be possible, but
    # is not necessary as the QA has already failed at this point.
    for acquired in instance_list:
      acquired.Release()
    raise

  return (instance_one, instance_two)
def _ShutdownInstance(name):
  """Shuts down an instance with C{--no-remember} and checks the result.

  @param name: full name of the instance
  @raise qa_error.Error: if the instance is still reported as running after
    the shutdown command

  """
  AssertCommand(["gnt-instance", "shutdown", "--no-remember", name])

  still_running = _InstanceRunning(name)
  if still_running:
    raise qa_error.Error("instance shutdown failed")
def _StartInstance(name):
  """Starts an instance and checks that it is running afterwards.

  @param name: full name of the instance
  @raise qa_error.Error: if the instance is not reported as running after
    the start command

  """
  AssertCommand(["gnt-instance", "start", name])

  if not _InstanceRunning(name):
    raise qa_error.Error("instance start failed")
def WaitForCompletion(self):
  """Blocks until no registered job is pending any more.

  Polls every two seconds. Once nothing is pending, any job still present
  in the internal job table is reported as an error.

  @raise qa_error.Error: if jobs remain registered after polling ends

  """
  while True:
    if not self._HasPendingJobs():
      break
    time.sleep(2)

  with self._lock:
    if not self._jobs:
      return
    raise qa_error.Error(
      "Jobs %s didn't finish in success state!" % self._GetJobIds())
def ReloadCertificates(ensure_presence=True):
  """Reloads the client RAPI certificate with the one present on the node.

  If the QA is set up to use a specific certificate using the
  "rapi-files-location" parameter, it will be put in place prior to
  retrieving it.

  The certificate is read from the master node with "openssl x509" and
  stored in a temporary file kept in the module-level C{_rapi_ca}. Unless a
  virtual cluster is in use, a new RAPI client is created and stored in
  C{_rapi_client}.

  @type ensure_presence: bool
  @param ensure_presence: whether to call C{_EnsureRapiFilesPresence} first
  @raise qa_error.Error: if the RAPI username or password is not set

  """
  # pylint: disable=W0603
  # due to global usage
  global _rapi_ca
  global _rapi_client

  if ensure_presence:
    _EnsureRapiFilesPresence()

  if _rapi_username is None or _rapi_password is None:
    raise qa_error.Error("RAPI username and password have to be set before"
                         " attempting to reload a certificate.")

  master = qa_config.GetMasterNode()

  # Load RAPI certificate from master node
  cert_path = qa_utils.MakeNodePath(master, pathutils.RAPI_CERT_FILE)
  cmd = ["openssl", "x509", "-in", cert_path]

  # Write to temporary file; the file object is kept in the module global
  _rapi_ca = tempfile.NamedTemporaryFile(mode="w")
  cert_text = qa_utils.GetCommandOutput(master.primary,
                                        utils.ShellQuoteArgs(cmd))
  _rapi_ca.write(cert_text)
  _rapi_ca.flush()

  port = qa_config.get("rapi-port", default=constants.DEFAULT_RAPI_PORT)
  cfg_curl = rapi.client.GenericCurlConfig(cafile=_rapi_ca.name,
                                           proxy="")

  if qa_config.UseVirtualCluster():
    # TODO: Implement full support for RAPI on virtual clusters
    print(
      qa_logging.FormatWarning("RAPI tests are not yet supported on"
                               " virtual clusters and will be disabled"))

    assert _rapi_client is None
    return

  _rapi_client = rapi.client.GanetiRapiClient(master.primary, port=port,
                                              username=_rapi_username,
                                              password=_rapi_password,
                                              curl_config_fn=cfg_curl)

  print("RAPI protocol version: %s" % _rapi_client.GetVersion())
def _ReadRapiSecret(password_file_path):
  """Reads a RAPI secret stored locally.

  Only the first line of the file is used, with surrounding whitespace
  removed.

  @type password_file_path: string
  @param password_file_path: path of the file holding the secret
  @return: Login secret for the user
  @raise qa_error.Error: if opening or reading the file raises IOError

  """
  try:
    with open(password_file_path, 'r') as secret_file:
      first_line = secret_file.readline()
  except IOError:
    raise qa_error.Error("Could not open the RAPI password file located at"
                         " %s" % password_file_path)

  return first_line.strip()
def TestInstanceDataCensorship(instance, inodes):
  """Test protection of sensitive instance data.

  For DRBD instances the DRBD secret is retrieved from the first node in
  C{inodes} and the result of the RAPI instance info job is checked for it.

  @param instance: instance object (C{disk_template} and C{name} are read)
  @param inodes: sequence of node objects; only the first one is used
  @raise qa_error.Error: if the info job does not complete successfully or
    if its status data contains the DRBD secret

  """
  if instance.disk_template != constants.DT_DRBD8:
    print(
      qa_utils.FormatInfo("Only the DRBD secret is a sensitive parameter"
                          " right now, skipping for non-DRBD instance."))
    return

  drbd_secret = _RetrieveSecret(instance, inodes[0])

  job_id = _rapi_client.GetInstanceInfo(instance.name)
  if not _rapi_client.WaitForJobCompletion(job_id):
    raise qa_error.Error("Could not fetch instance info for instance %s" %
                         instance.name)

  info_text = str(_rapi_client.GetJobStatus(job_id))
  if drbd_secret not in info_text:
    return

  # Print both the secret and the offending data before failing
  print(qa_utils.FormatInfo("DRBD secret: %s" % drbd_secret))
  print(qa_utils.FormatInfo("Retrieved data\n%s" % info_text))
  raise qa_error.Error(
    "Found DRBD secret in contents of RAPI instance info"
    " call; see above.")
def _LookupRapiSecret(rapi_user):
  """Find the RAPI secret for the given user on the QA machines.

  The RAPI users file is read from the master node with "cat" and parsed.

  @param rapi_user: Login user
  @return: Login secret for the user, with a leading "{CLEARTEXT}" marker
    (matched case-insensitively) removed
  @raise qa_error.Error: if the user has no entry, or the password starts
    with "{" but is not marked as clear text

  """
  CTEXT = "{CLEARTEXT}"

  master = qa_config.GetMasterNode()
  users_path = qa_utils.MakeNodePath(master, pathutils.RAPI_USERS_FILE)
  cat_cmd = utils.ShellQuoteArgs(["cat", users_path])

  file_content = qa_utils.GetCommandOutput(master.primary, cat_cmd)
  users = ParsePasswordFile(file_content)

  entry = users.get(rapi_user)
  if not entry:
    raise qa_error.Error("User %s not found in RAPI users file" % rapi_user)

  secret = entry.password

  if secret.upper().startswith(CTEXT):
    return secret[len(CTEXT):]

  if secret.startswith("{"):
    raise qa_error.Error("Unsupported password schema for RAPI user %s:"
                         " not a clear text password" % rapi_user)

  return secret
def ApplyPatches(data, patch_module, patches):
  """Applies any patches present, and returns the modified QA configuration.

  First, patches from the patch directory are applied. They are ordered
  alphabetically, unless there is an ``order`` file present - any patches
  listed within are applied in that order, and any remaining ones in
  alphabetical order again. Finally, the default patch residing in the
  top-level QA directory is applied.

  @type data: dict (deserialized json)
  @param data: The QA configuration to modify
  @type patch_module: module
  @param patch_module: The json patch module, loaded dynamically
  @type patches: dict of string to dict
  @param patches: The dictionary of patch path to content
  @raise qa_error.Error: if the order file names a patch that is not
    present in C{patches}

  """
  ordered_patches = []
  order_path = os.path.join(_QA_BASE_PATH, _QA_PATCH_DIR,
                            _QA_PATCH_ORDER_FILE)
  if os.path.exists(order_path):
    # Read under a context manager so that the file handle is always closed
    with open(order_path, 'r') as order_file:
      order_lines = order_file.read().splitlines()

    # Drop empty lines and prefix every entry with the patch dir
    ordered_patches = [os.path.join(_QA_PATCH_DIR, name)
                       for name in order_lines if name]

  # First the ordered patches
  for patch in ordered_patches:
    if patch not in patches:
      raise qa_error.Error(
        "Patch %s specified in the ordering file does not "
        "exist" % patch)
    _QaConfig.ApplyPatch(data, patch_module, patches, patch)

  # Then the other non-default ones; a set keeps the membership test cheap
  already_applied = frozenset(ordered_patches)
  for patch in sorted(patches):
    if patch != _QA_DEFAULT_PATCH and patch not in already_applied:
      _QaConfig.ApplyPatch(data, patch_module, patches, patch)

  # Finally the default one
  if _QA_DEFAULT_PATCH in patches:
    _QaConfig.ApplyPatch(data, patch_module, patches, _QA_DEFAULT_PATCH)
def TestInstanceAutomaticRestart(instance):
  """Test automatic restart of instance by ganeti-watcher.

  The instance is shut down, the watcher is run, and after five seconds the
  instance must be running again.

  @param instance: instance object whose C{name} is resolved and used
  @raise qa_error.Error: if the instance is not running after the watcher
    run

  """
  inst_name = qa_utils.ResolveInstanceName(instance.name)

  _ResetWatcherDaemon()
  _ShutdownInstance(inst_name)

  RunWatcherDaemon()
  time.sleep(5)

  restarted = _InstanceRunning(inst_name)
  if not restarted:
    raise qa_error.Error("Daemon didn't restart instance")

  AssertCommand(["gnt-instance", "info", inst_name])
def ExecuteJobProducingCommand(cmd):
  """Executes a command that contains the --submit flag, and returns a job id.

  @type cmd: list of string
  @param cmd: The command to execute, broken into constituent components.
  @rtype: int
  @return: the job id parsed from the command output
  @raise qa_error.Error: if the output does not contain exactly one job id

  """
  job_id_output = GetOutputFromMaster(cmd)

  # Usually, the output contains "JobID: <job_id>", but for instance related
  # commands, the output is of the form "<job_id>: <instance_name>"; the
  # second form is only tried when the first yields nothing
  candidates = re.findall("JobID: ([0-9]+)", job_id_output)
  if not candidates:
    candidates = re.findall("([0-9]+): .+", job_id_output)

  if len(candidates) != 1:
    raise qa_error.Error("Cannot parse command output to find job id: output "
                         "is %s" % job_id_output)

  return int(candidates[0])
def RunInstanceCheck(instance, running):
  """Runs the configured instance check script, if any.

  The script is spawned locally with the instance name as its only
  argument; the expected state and the SSH command for reaching the master
  node are passed through the environment.

  @param instance: instance, resolved to a name via L{_GetName}
  @type running: bool
  @param running: whether the instance is expected to be running
  @raise qa_error.Error: if the script exits with a non-zero result

  """
  instance_name = _GetName(instance, operator.attrgetter("name"))

  script = qa_config.GetInstanceCheckScript()
  if not script:
    return

  master_node = qa_config.GetMasterNode()

  # Build command to connect to master node
  master_ssh = GetSSHCommand(master_node.primary, "--")

  # Shell-style boolean for the script plus the wording for the log message
  (running_shellval, running_text) = ("1", "") if running else ("", "not ")

  print(
    FormatInfo("Checking if instance '%s' is %srunning" %
               (instance_name, running_text)))

  env = {
    "PATH": constants.HOOKS_PATH,
    "RUN_UUID": _RUN_UUID,
    "MASTER_SSH": utils.ShellQuoteArgs(master_ssh),
    "INSTANCE_NAME": instance_name,
    "INSTANCE_RUNNING": running_shellval,
    }

  result = os.spawnve(os.P_WAIT, script, [script, instance_name], env)
  if result != 0:
    raise qa_error.Error("Instance check failed with result %s" % result)
def _UpdateJobStatuses(self):
  """Retrieves job statuses from the cluster and updates internal state.

  After fetching, jobs in running or success state get their C{running_fn}
  invoked once (tracked through C{_running_notified}); jobs in success
  state get their C{success_fn} invoked and are removed from C{_jobs}.

  @raise qa_error.Error: if any job is in error state

  """
  self._FetchJobStatuses()

  error_jobs = self._GetJobsInStatuses([constants.JOB_STATUS_ERROR])
  if error_jobs:
    failed_ids = [job.job_id for job in error_jobs]
    raise qa_error.Error("Jobs %s are in error state!" % failed_ids)

  started_statuses = [constants.JOB_STATUS_RUNNING,
                      constants.JOB_STATUS_SUCCESS]
  for job in self._GetJobsInStatuses(started_statuses):
    if job.job_id in self._running_notified:
      continue
    if job.running_fn is not None:
      job.running_fn(self, job.job_id)
    # Recorded even without a callback so the job is not looked at again
    self._running_notified.add(job.job_id)

  for job in self._GetJobsInStatuses([constants.JOB_STATUS_SUCCESS]):
    if job.success_fn is not None:
      job.success_fn(self, job.job_id)

    # we're done with this job
    del self._jobs[job.job_id]
def AssertRedirectedCommand(cmd, fail=False, node=None, log_cmd=True):
  """Executes a command with redirected output.

  The log will go to the qa-output log file in the ganeti log directory on
  the node where the command is executed. The fail and node parameters are
  passed unchanged to AssertCommand.

  Before the command itself, a header line with the date and the quoted
  command is appended to the same file.

  @param cmd: the command to be executed, as a list; a string is not
      supported
  @return: whatever AssertCommand returns for the redirected command
  @raise qa_error.Error: if C{cmd} is not a list

  """
  if not isinstance(cmd, list):
    raise qa_error.Error("Non-list passed to AssertRedirectedCommand")

  ofile = utils.ShellQuote(_QA_OUTPUT)
  cmdstr = utils.ShellQuoteArgs(cmd)

  # The header is written quietly and must always succeed
  header_cmd = "echo ---- $(date) %s ---- >> %s" % (cmdstr, ofile)
  AssertCommand(header_cmd, fail=False, node=node, log_cmd=False)

  redirected_cmd = "%s >> %s" % (cmdstr, ofile)
  return AssertCommand(redirected_cmd, fail=fail, node=node,
                       log_cmd=log_cmd)
def CheckFileUnmodified(node, filename):
  """Checks that a file's content is the same before and after wrapped code.

  This is a generator that yields exactly once between taking the two
  "sha1sum" readings (presumably it is wrapped as a context manager where
  it is defined; the decorator is not visible here).

  @type node: string
  @param node: node the command should run on
  @type filename: string
  @param filename: absolute filename to check
  @raise qa_error.Error: if the sha1sum output differs between the two
    readings

  """
  cmd = utils.ShellQuoteArgs(["sha1sum", MakeNodePath(node, filename)])

  # checksum before the wrapped code runs
  checksum_before = GetCommandOutput(node, cmd).strip()

  yield

  # checksum after the wrapped code has run
  checksum_after = GetCommandOutput(node, cmd).strip()

  if checksum_before != checksum_after:
    raise qa_error.Error("File '%s' has changed unexpectedly on node %s"
                         " during the last operation" % (filename, node))