def _GetAttributeFromHypervisorNodeData(hv_info, node_name, attr):
  """Extract an attribute from the hypervisor's node information.

  This is a helper function to extract data from the hypervisor's
  information about the node, as part of the result of a node_info
  query.

  @type hv_info: dict of strings
  @param hv_info: dictionary of node information from the hypervisor
  @type node_name: string
  @param node_name: name of the node
  @type attr: string
  @param attr: key of the attribute in the hv_info dictionary
  @rtype: integer
  @return: the value of the attribute
  @raises errors.OpExecError: if key not in dictionary or value not integer

  """
  if attr not in hv_info:
    raise errors.OpExecError("Node '%s' didn't return attribute"
                             " '%s'" % (node_name, attr))
  value = hv_info[attr]
  if not isinstance(value, int):
    raise errors.OpExecError("Node '%s' returned invalid value"
                             " for '%s': %s" % (node_name, attr, value))
  return value

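# A minimal usage sketch (hypothetical hv_info contents and node name;
# attribute names follow the usual hypervisor node-info convention):
#
#   hv_info = {"memory_total": 16384, "memory_free": 8192, "cpu_total": 8}
#   _GetAttributeFromHypervisorNodeData(hv_info, "node1.example.com",
#                                       "memory_free")  # -> 8192
#   # A missing key or a non-integer value raises errors.OpExecError.
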
def _ValidateResult(self):
  """Process the allocator results.

  This will process and if successful save the result in
  self.out_data and the other parameters.

  """
  try:
    rdict = serializer.Load(self.out_text)
  except Exception as err:
    raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))

  if not isinstance(rdict, dict):
    raise errors.OpExecError("Can't parse iallocator results: not a dict")

  # TODO: remove backwards compatibility in later versions
  if "nodes" in rdict and "result" not in rdict:
    rdict["result"] = rdict["nodes"]
    del rdict["nodes"]

  for key in "success", "info", "result":
    if key not in rdict:
      raise errors.OpExecError("Can't parse iallocator results:"
                               " missing key '%s'" % key)
    setattr(self, key, rdict[key])

  self.req.ValidateResult(self, self.result)
  self.out_data = rdict

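# A sketch of the result dictionary shape this validates (values are
# hypothetical; the required keys come straight from the loop above, and
# a legacy "nodes" key is renamed to "result" for backwards compatibility):
#
#   {"success": True,
#    "info": "allocation successful",
#    "result": ["node1.example.com"]}
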
def Exec(self, feedback_fn):
  """Add the ip pool to the cluster.

  """
  nobj = objects.Network(name=self.op.network_name,
                         network=self.op.network,
                         gateway=self.op.gateway,
                         network6=self.op.network6,
                         gateway6=self.op.gateway6,
                         mac_prefix=self.op.mac_prefix,
                         uuid=self.network_uuid)

  # Initialize the associated address pool
  try:
    pool = network.AddressPool.InitializeNetwork(nobj)
  except errors.AddressPoolError as err:
    raise errors.OpExecError("Cannot create IP address pool for network"
                             " '%s': %s" % (self.op.network_name, err))

  # Check if we need to reserve the nodes and the cluster master IP
  # These may not be allocated to any instances in routed mode, as
  # they wouldn't function anyway.
  if self.op.conflicts_check:
    for node in self.cfg.GetAllNodesInfo().values():
      for ip in [node.primary_ip, node.secondary_ip]:
        try:
          if pool.Contains(ip):
            pool.Reserve(ip, external=True)
            self.LogInfo("Reserved IP address of node '%s' (%s)",
                         node.name, ip)
        except errors.AddressPoolError as err:
          self.LogWarning("Cannot reserve IP address '%s' of node '%s': %s",
                          ip, node.name, err)

    master_ip = self.cfg.GetClusterInfo().master_ip
    try:
      if pool.Contains(master_ip):
        pool.Reserve(master_ip, external=True)
        self.LogInfo("Reserved cluster master IP address (%s)", master_ip)
    except errors.AddressPoolError as err:
      self.LogWarning("Cannot reserve cluster master IP address (%s): %s",
                      master_ip, err)

  if self.op.add_reserved_ips:
    for ip in self.op.add_reserved_ips:
      try:
        pool.Reserve(ip, external=True)
      except errors.AddressPoolError as err:
        raise errors.OpExecError("Cannot reserve IP address '%s': %s" %
                                 (ip, err))

  if self.op.tags:
    for tag in self.op.tags:
      nobj.AddTag(tag)

  self.cfg.AddNetwork(nobj, self.proc.GetECId(), check_uuid=False)

def _TestJobSubmission(opts):
  """Tests submitting jobs.

  """
  ToStdout("Testing job submission")

  testdata = [
    (0, 0, constants.OP_PRIO_LOWEST),
    (0, 0, constants.OP_PRIO_HIGHEST),
    ]

  for priority in (constants.OP_PRIO_SUBMIT_VALID |
                   frozenset([constants.OP_PRIO_LOWEST,
                              constants.OP_PRIO_HIGHEST])):
    for offset in [-1, +1]:
      testdata.extend([
        (0, 0, priority + offset),
        (3, 0, priority + offset),
        (0, 3, priority + offset),
        (4, 2, priority + offset),
        ])

  for (before, after, failpriority) in testdata:
    ops = []
    ops.extend([opcodes.OpTestDelay(duration=0) for _ in range(before)])
    ops.append(opcodes.OpTestDelay(duration=0, priority=failpriority))
    ops.extend([opcodes.OpTestDelay(duration=0) for _ in range(after)])

    try:
      cl = cli.GetClient()
      cl.SubmitJob(ops)
    except errors.GenericError as err:
      if opts.debug:
        ToStdout("Ignoring error for 'wrong priority' test: %s", err)
    else:
      raise errors.OpExecError("Submitting opcode with priority %s did not"
                               " fail when it should (allowed are %s)" %
                               (failpriority, constants.OP_PRIO_SUBMIT_VALID))

    jobs = [
      [opcodes.OpTestDelay(duration=0),
       opcodes.OpTestDelay(duration=0, dry_run=False),
       opcodes.OpTestDelay(duration=0, dry_run=True)],
      ops,
      ]
    try:
      cl = cli.GetClient()
      cl.SubmitManyJobs(jobs)
    except errors.GenericError as err:
      if opts.debug:
        ToStdout("Ignoring error for 'wrong priority' test: %s", err)
    else:
      raise errors.OpExecError("Submitting manyjobs with an incorrect one"
                               " did not fail when it should.")

  ToStdout("Job submission tests were successful")

def CheckPrereq(self):
  """Check prerequisites.

  """
  assert self.needed_locks[locking.LEVEL_NODEGROUP]
  assert (frozenset(self.owned_locks(locking.LEVEL_NODE)) ==
          frozenset(self.op.node_uuids))

  expected_locks = (set([self.group_uuid]) |
                    self.cfg.GetNodeGroupsFromNodes(self.op.node_uuids))
  actual_locks = self.owned_locks(locking.LEVEL_NODEGROUP)
  if actual_locks != expected_locks:
    raise errors.OpExecError("Nodes changed groups since locks were acquired,"
                             " current groups are '%s', used to be '%s'" %
                             (utils.CommaJoin(expected_locks),
                              utils.CommaJoin(actual_locks)))

  self.node_data = self.cfg.GetAllNodesInfo()
  self.group = self.cfg.GetNodeGroup(self.group_uuid)
  instance_data = self.cfg.GetAllInstancesInfo()

  if self.group is None:
    raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
                             (self.op.group_name, self.group_uuid))

  (new_splits, previous_splits) = \
    self.CheckAssignmentForSplitInstances(
      [(uuid, self.group_uuid) for uuid in self.op.node_uuids],
      self.node_data, instance_data)

  if new_splits:
    fmt_new_splits = utils.CommaJoin(utils.NiceSort(
                       self.cfg.GetInstanceNames(new_splits)))

    if not self.op.force:
      raise errors.OpExecError("The following instances get split by this"
                               " change and --force was not given: %s" %
                               fmt_new_splits)
    else:
      self.LogWarning("This operation will split the following instances: %s",
                      fmt_new_splits)

      if previous_splits:
        self.LogWarning("In addition, these already-split instances continue"
                        " to be split across groups: %s",
                        utils.CommaJoin(utils.NiceSort(
                          self.cfg.GetInstanceNames(previous_splits))))

def GetUserFiles(user, mkdir=False, dircheck=True, kind=constants.SSHK_DSA,
                 _homedir_fn=None):
  """Return the paths of a user's SSH files.

  @type user: string
  @param user: Username
  @type mkdir: bool
  @param mkdir: Whether to create ".ssh" directory if it doesn't exist
  @type dircheck: bool
  @param dircheck: Whether to check if ".ssh" directory exists
  @type kind: string
  @param kind: One of L{constants.SSHK_ALL}
  @rtype: tuple; (string, string, string)
  @return: Tuple containing three file system paths; the private SSH key
    file, the public SSH key file and the user's C{authorized_keys} file
  @raise errors.OpExecError: When the home directory of the user cannot be
    determined
  @raise errors.OpExecError: Regardless of the C{mkdir} parameter, this
    exception is raised if C{~$user/.ssh} is not a directory and C{dircheck}
    is set to C{True}

  """
  if _homedir_fn is None:
    _homedir_fn = utils.GetHomeDir

  user_dir = _homedir_fn(user)
  if not user_dir:
    raise errors.OpExecError("Cannot resolve home of user '%s'" % user)

  if kind == constants.SSHK_DSA:
    suffix = "dsa"
  elif kind == constants.SSHK_RSA:
    suffix = "rsa"
  elif kind == constants.SSHK_ECDSA:
    suffix = "ecdsa"
  else:
    raise errors.ProgrammerError("Unknown SSH key kind '%s'" % kind)

  ssh_dir = utils.PathJoin(user_dir, ".ssh")

  if mkdir:
    utils.EnsureDirs([(ssh_dir, constants.SECURE_DIR_MODE)])
  elif dircheck and not os.path.isdir(ssh_dir):
    raise errors.OpExecError("Path %s is not a directory" % ssh_dir)

  return [utils.PathJoin(ssh_dir, base)
          for base in ["id_%s" % suffix, "id_%s.pub" % suffix,
                       "authorized_keys"]]

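# A minimal usage sketch (hypothetical home directory; for user "root" with
# RSA keys the returned paths would typically be):
#
#   priv, pub, auth = GetUserFiles("root", kind=constants.SSHK_RSA)
#   # priv -> /root/.ssh/id_rsa
#   # pub  -> /root/.ssh/id_rsa.pub
#   # auth -> /root/.ssh/authorized_keys
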
def _GetNames(self, lu, all_names, lock_level):
  """Helper function to determine names asked for in the query.

  """
  if self.do_locking:
    names = lu.owned_locks(lock_level)
  else:
    names = all_names

  if self.wanted == locking.ALL_SET:
    assert not self.names
    # caller didn't specify names, so ordering is not important
    return utils.NiceSort(names)

  # caller specified names and we must keep the same order
  assert self.names

  missing = set(self.wanted).difference(names)
  if missing:
    raise errors.OpExecError("Some items were removed before retrieving"
                             " their data: %s" % missing)

  # Return expanded names
  return self.wanted

def _LockAndExecLU(self, lu, level, calc_timeout):
  """Execute a Logical Unit, with the needed locks.

  This is a recursive function that starts locking the given level, and
  proceeds up, till there are no more locks to acquire. Then it executes
  the given LU and its opcodes.

  """
  adding_locks = level in lu.add_locks
  acquiring_locks = level in lu.needed_locks

  if level not in locking.LEVELS:
    _VerifyLocks(lu)

    if self._cbs:
      self._cbs.NotifyStart()

    try:
      result = self._ExecLU(lu)
    except AssertionError as err:
      # this is a bit ugly, as we don't know from which phase
      # (prereq, exec) this comes; but it's better than an exception
      # with no information
      (_, _, tb) = sys.exc_info()
      err_info = traceback.format_tb(tb)
      del tb
      logging.exception("Detected AssertionError")
      raise errors.OpExecError("Internal assertion error: please report"
                               " this as a bug.\nError message: '%s';"
                               " location:\n%s" % (str(err), err_info[-1]))

def _SetWatcherPause(context, ec_id, until):
  """Creates or removes the watcher pause file.

  @type context: L{GanetiContext}
  @param context: Global Ganeti context
  @type until: None or int
  @param until: Unix timestamp saying until when the watcher shouldn't run

  """
  node_names = context.GetConfig(ec_id).GetNodeList()

  if until is None:
    logging.info("Received request to no longer pause watcher")
  else:
    if not ht.TNumber(until):
      raise TypeError("Duration must be numeric")

    if until < time.time():
      raise errors.GenericError("Unable to set pause end time in the past")

    logging.info("Received request to pause watcher until %s", until)

  result = context.rpc.call_set_watcher_pause(node_names, until)

  errmsg = utils.CommaJoin("%s (%s)" % (node_name, nres.fail_msg)
                           for (node_name, nres) in result.items()
                           if nres.fail_msg and not nres.offline)
  if errmsg:
    raise errors.OpExecError("Watcher pause was set where possible, but failed"
                             " on the following node(s): %s" % errmsg)

  return until

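# A minimal usage sketch (hypothetical 'context' and 'ec_id'; 'until' is an
# absolute Unix timestamp, so pausing the watcher for one hour would be):
#
#   import time
#   _SetWatcherPause(context, ec_id, int(time.time()) + 3600)
#
#   # Passing None lifts the pause again:
#   _SetWatcherPause(context, ec_id, None)
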
def Exec(self, feedback_fn):
  """Connect to the console of an instance

  """
  node_uuid = self.instance.primary_node

  cluster_hvparams = self.cfg.GetClusterInfo().hvparams
  node_insts = self.rpc.call_instance_list(
                 [node_uuid], [self.instance.hypervisor],
                 cluster_hvparams)[node_uuid]
  node_insts.Raise("Can't get node information from %s" %
                   self.cfg.GetNodeName(node_uuid))

  if self.instance.name not in node_insts.payload:
    if self.instance.admin_state == constants.ADMINST_UP:
      state = constants.INSTST_ERRORDOWN
    elif self.instance.admin_state == constants.ADMINST_DOWN:
      state = constants.INSTST_ADMINDOWN
    else:
      state = constants.INSTST_ADMINOFFLINE
    raise errors.OpExecError("Instance %s is not running (state %s)" %
                             (self.instance.name, state))

  logging.debug("Connecting to console of %s on %s", self.instance.name,
                self.cfg.GetNodeName(node_uuid))

  node = self.cfg.GetNodeInfo(self.instance.primary_node)
  group = self.cfg.GetNodeGroup(node.group)
  return GetInstanceConsole(self.cfg.GetClusterInfo(), self.instance,
                            node, group)

def ReportLogMessage(self, job_id, serial, timestamp, log_type, log_msg):
  """Handles a log message.

  """
  if self._job_id is None:
    self._job_id = job_id
  elif self._job_id != job_id:
    raise errors.ProgrammerError("The same reporter instance was used for"
                                 " more than one job")

  if log_type == constants.ELOG_JQUEUE_TEST:
    (sockname, test, arg) = log_msg
    return self._ProcessTestMessage(job_id, sockname, test, arg)

  elif (log_type == constants.ELOG_MESSAGE and
        log_msg.startswith(constants.JQT_MSGPREFIX)):
    if self._testmsgs is None:
      raise errors.OpExecError("Received test message without a preceding"
                               " start message")
    testmsg = log_msg[len(constants.JQT_MSGPREFIX):]
    self._testmsgs.append(testmsg)
    self._all_testmsgs.append(testmsg)
    return

  return cli.StdioJobPollReportCb.ReportLogMessage(self, job_id, serial,
                                                   timestamp, log_type,
                                                   log_msg)

def RunCommand(self, cluster_name, node, base_cmd, port, data,
               debug=False, verbose=False, use_cluster_key=False,
               ask_key=False, strict_host_check=False, ensure_version=False):
  """This emulates ssh.RunSshCmdWithStdin calling ssh_update.

  While in real SSH operations, ssh.RunSshCmdWithStdin is called with the
  command ssh_update to manipulate a remote node's SSH key files
  (authorized_keys and ganeti_pub_key), this method emulates the operation
  by manipulating only its internal dictionaries of SSH keys. No actual
  key files of any node are touched.

  """
  if node in self._max_retries:
    if node not in self._retries:
      self._retries[node] = 0
    self._retries[node] += 1
    if self._retries[node] < self._max_retries[node]:
      raise errors.OpExecError("(Fake) SSH connection to node '%s' failed."
                               % node)

  assert base_cmd == pathutils.SSH_UPDATE

  if constants.SSHS_SSH_AUTHORIZED_KEYS in data:
    instructions_auth = data[constants.SSHS_SSH_AUTHORIZED_KEYS]
    self._HandleAuthorizedKeys(instructions_auth, node)
  if constants.SSHS_SSH_PUBLIC_KEYS in data:
    instructions_pub = data[constants.SSHS_SSH_PUBLIC_KEYS]
    self._HandlePublicKeys(instructions_pub, node)
  if constants.SSHS_GENERATE in data:
    instructions_generate = data[constants.SSHS_GENERATE]
    self._GenerateNewKey(instructions_generate, node)

def Exec(self, feedback_fn):
  if self.op.osparams_secret:
    msg = "Secret OS parameters: %s" % self.op.osparams_secret.Unprivate()
    feedback_fn(msg)
  else:
    raise errors.OpExecError("Opcode needs secret parameters")

def _CheckPayload(self, result):
  """Checks if the payload is valid.

  @param result: RPC result
  @raises errors.OpExecError: If payload is not valid

  """
  errs = []
  if self.op.command == constants.OOB_HEALTH:
    if not isinstance(result.payload, list):
      errs.append("command 'health' is expected to return a list but got %s" %
                  type(result.payload))
    else:
      for item, status in result.payload:
        if status not in constants.OOB_STATUSES:
          errs.append("health item '%s' has invalid status '%s'" %
                      (item, status))

  if self.op.command == constants.OOB_POWER_STATUS:
    if not isinstance(result.payload, dict):
      errs.append("power-status is expected to return a dict but got %s" %
                  type(result.payload))

  if self.op.command in [
      constants.OOB_POWER_ON,
      constants.OOB_POWER_OFF,
      constants.OOB_POWER_CYCLE,
      ]:
    if result.payload is not None:
      errs.append("%s is expected to not return payload but got '%s'" %
                  (self.op.command, result.payload))

  if errs:
    raise errors.OpExecError("Check of out-of-band payload failed due to %s" %
                             utils.CommaJoin(errs))

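# A sketch of the payload shapes this validates, per command (values are
# hypothetical; only the list/dict/None typing is asserted by the code above):
#
#   OOB_HEALTH        -> [("disk0", "OK"), ("fan1", "WARNING")]
#   OOB_POWER_STATUS  -> a dict, e.g. {"powered": True}
#   OOB_POWER_ON/OFF/CYCLE -> None (no payload expected)
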
def _ProcessTestMessage(self, job_id, sockname, test, arg):
  """Handles a job queue test message.

  """
  if test not in constants.JQT_ALL:
    raise errors.OpExecError("Received invalid test message %s" % test)

  sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
  try:
    sock.settimeout(30.0)

    logging.debug("Connecting to %s", sockname)
    sock.connect(sockname)

    logging.debug("Checking status")
    jobdetails = cli.GetClient().QueryJobs([job_id], ["status"])[0]
    if not jobdetails:
      raise errors.OpExecError("Can't find job %s" % job_id)

    status = jobdetails[0]

    logging.debug("Status of job %s is %s", job_id, status)

    if test == constants.JQT_EXPANDNAMES:
      if status != constants.JOB_STATUS_WAITING:
        raise errors.OpExecError("Job status while expanding names is '%s',"
                                 " not '%s' as expected" %
                                 (status, constants.JOB_STATUS_WAITING))
    elif test in (constants.JQT_EXEC, constants.JQT_LOGMSG):
      if status != constants.JOB_STATUS_RUNNING:
        raise errors.OpExecError("Job status while executing opcode is '%s',"
                                 " not '%s' as expected" %
                                 (status, constants.JOB_STATUS_RUNNING))

    if test == constants.JQT_STARTMSG:
      logging.debug("Expecting %s test messages", arg)
      self._testmsgs = []
    elif test == constants.JQT_LOGMSG:
      if len(self._testmsgs) != arg:
        raise errors.OpExecError("Received %s test messages when %s are"
                                 " expected" % (len(self._testmsgs), arg))
  finally:
    logging.debug("Closing socket")
    sock.close()

def _DetermineImageSize(self, image_path, node_uuid):
  """Determines the size of the specified image.

  @type image_path: string
  @param image_path: The disk path or a URL of an image.
  @type node_uuid: string
  @param node_uuid: If a file path is used, the node on which the image
    resides.

  @raise OpExecError: If the image does not exist.

  @rtype: int
  @return: The size in MB, rounded up.

  """
  # Check if we are dealing with a URL first
  class _HeadRequest(urllib2.Request):
    def get_method(self):
      return "HEAD"

  if utils.IsUrl(image_path):
    try:
      response = urllib2.urlopen(_HeadRequest(image_path))
    except urllib2.URLError:
      raise errors.OpExecError("Could not retrieve image from given url %s" %
                               image_path)

    content_length_str = response.info().getheader('content-length')

    if not content_length_str:
      raise errors.OpExecError("Cannot create temporary disk: size of zeroing"
                               " image at path %s could not be retrieved"
                               " through HEAD request" % image_path)

    byte_size = int(content_length_str)
  else:
    # We end up here if a file path is used
    result = self.rpc.call_get_file_info(node_uuid, image_path)
    result.Raise("Cannot determine the size of file %s" % image_path)

    success, attributes = result.payload
    if not success:
      raise errors.OpExecError("Could not open file %s" % image_path)
    byte_size = attributes[constants.STAT_SIZE]

  # Finally, the conversion
  return math.ceil(byte_size / 1024. / 1024.)

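# A worked example of the final conversion above (hypothetical size): an
# image of 1,500,000,000 bytes gives 1500000000 / 1024. / 1024. ~= 1430.5,
# which math.ceil rounds up to 1431 MB.
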
def _InitGanetiServerSetup(master_name, cfg):
  """Setup the necessary configuration for the initial node daemon.

  This creates the nodepass file containing the shared password for the
  cluster, generates the SSL certificate and starts the node daemon.

  @type master_name: str
  @param master_name: Name of the master node
  @type cfg: ConfigWriter
  @param cfg: the configuration writer

  """
  # Generate cluster secrets
  GenerateClusterCrypto(True, False, False, False, False, False, master_name)

  # Add the master's SSL certificate digest to the configuration.
  master_uuid = cfg.GetMasterNode()
  master_digest = utils.GetCertificateDigest()
  cfg.AddNodeToCandidateCerts(master_uuid, master_digest)
  cfg.Update(cfg.GetClusterInfo(), logging.error)
  ssconf.WriteSsconfFiles(cfg.GetSsconfValues())

  if not os.path.exists(
      os.path.join(pathutils.DATA_DIR,
                   "%s%s" % (constants.SSCONF_FILEPREFIX,
                             constants.SS_MASTER_CANDIDATES_CERTS))):
    raise errors.OpExecError("Ssconf file for master candidate certificates"
                             " was not written.")

  if not os.path.exists(pathutils.NODED_CERT_FILE):
    raise errors.OpExecError("The server certificate was not created"
                             " properly.")

  if not os.path.exists(pathutils.NODED_CLIENT_CERT_FILE):
    raise errors.OpExecError("The client certificate was not created"
                             " properly.")

  # set up the inter-node password and certificate
  result = utils.RunCmd([pathutils.DAEMON_UTIL, "start", constants.NODED])
  if result.failed:
    raise errors.OpExecError("Could not start the node daemon, command %s"
                             " had exitcode %s and error %s" %
                             (result.cmd, result.exit_code, result.output))

  _WaitForNodeDaemon(master_name)

def Exec(self, feedback_fn):
  """Sets the tag.

  """
  try:
    for tag in self.op.tags:
      self.target.AddTag(tag)
  except errors.TagError as err:
    raise errors.OpExecError("Error while setting tag: %s" % str(err))

def Exec(self, feedback_fn):
  """Remove the node group.

  """
  try:
    self.cfg.RemoveNodeGroup(self.group_uuid)
  except errors.ConfigurationError:
    raise errors.OpExecError("Group '%s' with UUID %s disappeared" %
                             (self.op.group_name, self.group_uuid))

def Exec(self, feedback_fn):
  """Remove the network.

  """
  try:
    self.cfg.RemoveNetwork(self.network_uuid)
  except errors.ConfigurationError:
    raise errors.OpExecError("Network '%s' with UUID %s disappeared" %
                             (self.op.network_name, self.network_uuid))

def CheckPrereq(self):
  """Check prerequisites.

  """
  owned_instance_names = frozenset(self.owned_locks(locking.LEVEL_INSTANCE))

  # Check if locked instances are still correct
  CheckNodeGroupInstances(self.cfg, self.group_uuid, owned_instance_names)

  self.group = self.cfg.GetNodeGroup(self.group_uuid)
  cluster = self.cfg.GetClusterInfo()

  if self.group is None:
    raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
                             (self.op.group_name, self.group_uuid))

  if self.op.ndparams:
    new_ndparams = GetUpdatedParams(self.group.ndparams, self.op.ndparams)
    utils.ForceDictType(new_ndparams, constants.NDS_PARAMETER_TYPES)
    self.new_ndparams = new_ndparams

  if self.op.diskparams:
    diskparams = self.group.diskparams
    uavdp = self._UpdateAndVerifyDiskParams
    # For each disktemplate subdict update and verify the values
    new_diskparams = dict((dt,
                           uavdp(diskparams.get(dt, {}),
                                 self.op.diskparams[dt]))
                          for dt in constants.DISK_TEMPLATES
                          if dt in self.op.diskparams)
    # As we have all subdicts of diskparams ready, let's merge the actual
    # dict with all updated subdicts
    self.new_diskparams = objects.FillDict(diskparams, new_diskparams)

    try:
      utils.VerifyDictOptions(self.new_diskparams, constants.DISK_DT_DEFAULTS)
      CheckDiskAccessModeConsistency(self.new_diskparams, self.cfg,
                                     group=self.group)
    except errors.OpPrereqError as err:
      raise errors.OpPrereqError("While verifying diskparams options: %s"
                                 % err, errors.ECODE_INVAL)

  if self.op.hv_state:
    self.new_hv_state = MergeAndVerifyHvState(self.op.hv_state,
                                              self.group.hv_state_static)

  if self.op.disk_state:
    self.new_disk_state = \
      MergeAndVerifyDiskState(self.op.disk_state,
                              self.group.disk_state_static)

  self._CheckIpolicy(cluster, owned_instance_names)

def _LockAndExecLU(self, lu, level, calc_timeout, pending=None):
  """Execute a Logical Unit, with the needed locks.

  This is a recursive function that starts locking the given level, and
  proceeds up, till there are no more locks to acquire. Then it executes
  the given LU and its opcodes.

  """
  pending = pending or []
  logging.debug("Looking at locks of level %s, still need to obtain %s",
                level, pending)
  adding_locks = level in lu.add_locks
  acquiring_locks = level in lu.needed_locks

  if level not in locking.LEVELS:
    if pending:
      self._RequestAndWait(pending, calc_timeout())
      lu.wconfdlocks = self.wconfd.Client().ListLocks(self._wconfdcontext)
      pending = []

    logging.debug("Finished acquiring locks")

    _VerifyLocks(lu)

    if self._cbs:
      self._cbs.NotifyStart()

    try:
      result = self._ExecLU(lu)
    except errors.OpPrereqError as err:
      (_, ecode) = err.args
      if ecode != errors.ECODE_TEMP_NORES:
        raise
      logging.debug("Temporarily out of resources; will retry internally")
      try:
        lu.PrepareRetry(self.Log)
        if self._cbs:
          self._cbs.NotifyRetry()
      except errors.OpRetryNotSupportedError:
        logging.debug("LU does not know how to retry.")
        raise err
      raise LockAcquireTimeout()
    except AssertionError as err:
      # this is a bit ugly, as we don't know from which phase
      # (prereq, exec) this comes; but it's better than an exception
      # with no information
      (_, _, tb) = sys.exc_info()
      err_info = traceback.format_tb(tb)
      del tb
      logging.exception("Detected AssertionError")
      raise errors.OpExecError("Internal assertion error: please report"
                               " this as a bug.\nError message: '%s';"
                               " location:\n%s" % (str(err), err_info[-1]))

def ZeroFreeSpace(self, feedback_fn):
  """Zeroes the free space on a shutdown instance.

  @type feedback_fn: function
  @param feedback_fn: Function used to log progress

  """
  assert self.op.zeroing_timeout_fixed is not None
  assert self.op.zeroing_timeout_per_mib is not None

  zeroing_image = self.cfg.GetZeroingImage()
  src_node_uuid = self.instance.primary_node
  disk_size = self._DetermineImageSize(zeroing_image, src_node_uuid)

  # Calculate the sum prior to adding the temporary disk
  instance_disks_size_sum = self._InstanceDiskSizeSum()

  with TemporaryDisk(self, self.instance, disk_size, feedback_fn):
    feedback_fn("Activating instance disks")
    StartInstanceDisks(self, self.instance, False)

    feedback_fn("Imaging disk with zeroing image")
    ImageDisks(self, self.instance, zeroing_image)

    feedback_fn("Starting instance with zeroing image")
    result = self.rpc.call_instance_start(src_node_uuid,
                                          (self.instance, [], []),
                                          False, self.op.reason)
    result.Raise("Could not start instance %s when using the zeroing image"
                 " %s" % (self.instance.name, zeroing_image))

    # First wait for the instance to start up
    running_check = lambda: IsInstanceRunning(self, self.instance,
                                              check_user_shutdown=True)
    instance_up = retry.SimpleRetry(True, running_check, 5.0,
                                    self.op.shutdown_timeout)
    if not instance_up:
      raise errors.OpExecError("Could not boot instance when using the"
                               " zeroing image %s" % zeroing_image)

    feedback_fn("Instance is up, now awaiting shutdown")

    # Then for it to be finished, detected by its shutdown
    timeout = self.op.zeroing_timeout_fixed + \
              self.op.zeroing_timeout_per_mib * instance_disks_size_sum
    instance_up = retry.SimpleRetry(False, running_check, 20.0, timeout)
    if instance_up:
      self.LogWarning("Zeroing not completed prior to timeout; instance will"
                      " be shut down forcibly")

  feedback_fn("Zeroing completed!")

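# A worked example of the timeout formula above (hypothetical numbers): with
# zeroing_timeout_fixed = 600 s, zeroing_timeout_per_mib = 0.05 s/MiB and
# 20480 MiB of instance disks, the shutdown wait is
# 600 + 0.05 * 20480 = 1624 seconds.
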
def _ValidateResult(self):
  """Process the allocator results.

  This will process and if successful save the result in
  self.out_data and the other parameters.

  """
  try:
    rdict = serializer.Load(self.out_text)
  except Exception as err:
    raise errors.OpExecError("Can't parse iallocator results: %s" % str(err))

def _UninterruptibleDelay(self):
  """Delays without allowing interruptions.

  """
  if self.op.on_node_uuids:
    result = self.rpc.call_test_delay(self.op.on_node_uuids, self.op.duration)
    for node_uuid, node_result in result.items():
      node_result.Raise("Failure during rpc call to node %s" %
                        self.cfg.GetNodeName(node_uuid))
  else:
    if not utils.TestDelay(self.op.duration)[0]:
      raise errors.OpExecError("Error during master delay test")

def _TestDelay(self):
  """Do the actual sleep.

  """
  if self.op.on_master:
    if not utils.TestDelay(self.op.duration)[0]:
      raise errors.OpExecError("Error during master delay test")
  if self.op.on_node_uuids:
    result = self.rpc.call_test_delay(self.op.on_node_uuids, self.op.duration)
    for node_uuid, node_result in result.items():
      node_result.Raise("Failure during rpc call to node %s" %
                        self.cfg.GetNodeName(node_uuid))

def Exec(self, feedback_fn):
  """Rename the network.

  """
  network = self.cfg.GetNetwork(self.network_uuid)

  if network is None:
    raise errors.OpExecError("Could not retrieve network '%s' (UUID: %s)" %
                             (self.op.network_name, self.network_uuid))

  network.name = self.op.new_name
  self.cfg.Update(network, feedback_fn)

  return self.op.new_name

def _ComputeStorageDataFromSpaceInfo(space_info, node_name, has_lvm):
  """Extract storage data from node info.

  @type space_info: see result of the RPC call node info
  @param space_info: the storage reporting part of the result of the RPC call
    node info
  @type node_name: string
  @param node_name: the node's name
  @type has_lvm: boolean
  @param has_lvm: whether or not LVM storage information is requested
  @rtype: 4-tuple of integers
  @return: tuple of storage info (total_disk, free_disk, total_spindles,
    free_spindles)

  """
  # TODO: replace this with proper storage reporting
  if has_lvm:
    lvm_vg_info = utils.storage.LookupSpaceInfoByStorageType(
        space_info, constants.ST_LVM_VG)
    if not lvm_vg_info:
      raise errors.OpExecError("Node '%s' didn't return LVM vg space info." %
                               (node_name))
    total_disk = lvm_vg_info["storage_size"]
    free_disk = lvm_vg_info["storage_free"]
    lvm_pv_info = utils.storage.LookupSpaceInfoByStorageType(
        space_info, constants.ST_LVM_PV)
    if not lvm_pv_info:
      raise errors.OpExecError("Node '%s' didn't return LVM pv space info." %
                               (node_name))
    total_spindles = lvm_pv_info["storage_size"]
    free_spindles = lvm_pv_info["storage_free"]
  else:
    # we didn't even ask the node for VG status, so use zeros
    total_disk = free_disk = 0
    total_spindles = free_spindles = 0
  return (total_disk, free_disk, total_spindles, free_spindles)

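# A minimal sketch of the expected call and result (values and the exact
# per-entry layout of space_info are hypothetical; only the
# "storage_size"/"storage_free" keys are taken from the code above):
#
#   _ComputeStorageDataFromSpaceInfo(space_info, "node1.example.com", True)
#   # -> (102400, 51200, 4, 2)   # (total_disk, free_disk,
#   #                            #  total_spindles, free_spindles)
#
#   _ComputeStorageDataFromSpaceInfo(space_info, "node1.example.com", False)
#   # -> (0, 0, 0, 0)            # LVM info was never requested
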
def Exec(self, feedback_fn):
  """Rename the node group.

  """
  group = self.cfg.GetNodeGroup(self.group_uuid)

  if group is None:
    raise errors.OpExecError("Could not retrieve group '%s' (UUID: %s)" %
                             (self.op.group_name, self.group_uuid))

  group.name = self.op.new_name
  self.cfg.Update(group, feedback_fn)

  return self.op.new_name

def Exec(self, feedback_fn):
  """Reboot the instance.

  """
  cluster = self.cfg.GetClusterInfo()
  remote_info = self.rpc.call_instance_info(
      self.instance.primary_node, self.instance.name,
      self.instance.hypervisor, cluster.hvparams[self.instance.hypervisor])
  remote_info.Raise("Error checking node %s" %
                    self.cfg.GetNodeName(self.instance.primary_node))
  instance_running = bool(remote_info.payload)

  current_node_uuid = self.instance.primary_node

  if instance_running and \
      self.op.reboot_type in [constants.INSTANCE_REBOOT_SOFT,
                              constants.INSTANCE_REBOOT_HARD]:
    result = self.rpc.call_instance_reboot(current_node_uuid, self.instance,
                                           self.op.reboot_type,
                                           self.op.shutdown_timeout,
                                           self.op.reason)
    result.Raise("Could not reboot instance")
  else:
    if instance_running:
      result = self.rpc.call_instance_shutdown(current_node_uuid,
                                               self.instance,
                                               self.op.shutdown_timeout,
                                               self.op.reason)
      result.Raise("Could not shutdown instance for full reboot")
      ShutdownInstanceDisks(self, self.instance)
      self.instance = self.cfg.GetInstanceInfo(self.instance.uuid)
    else:
      self.LogInfo("Instance %s was already stopped, starting now",
                   self.instance.name)
    StartInstanceDisks(self, self.instance, self.op.ignore_secondaries)
    self.instance = self.cfg.GetInstanceInfo(self.instance.uuid)
    result = self.rpc.call_instance_start(current_node_uuid,
                                          (self.instance, None, None),
                                          False, self.op.reason)
    msg = result.fail_msg
    if msg:
      ShutdownInstanceDisks(self, self.instance)
      self.instance = self.cfg.GetInstanceInfo(self.instance.uuid)
      raise errors.OpExecError("Could not start instance for"
                               " full reboot: %s" % msg)

  self.cfg.MarkInstanceUp(self.instance.uuid)