def SetupNodeDaemon(opts, cluster_name, node, ssh_port):
    """Prepare a new node so the master can talk to it via RPC.

    This has to run before the actual node-add opcode. The node daemon
    setup tool is executed on the remote node over SSH and is fed, on its
    standard input, the cluster name, the node daemon certificate, the
    confd HMAC key, all ssconf values, the node name and a request to start
    the node daemon. Afterwards the SSH daemon and the node daemon are
    waited for.

    @param opts: command line options; the C{debug}, C{verbose} and
        C{ssh_key_check} attributes are read
    @param cluster_name: the cluster name
    @param node: the name of the new node
    @param ssh_port: the SSH port of the new node

    """
    # Gather the payload pieces in the same order the setup data lists them.
    noded_cert = utils.ReadFile(pathutils.NODED_CERT_FILE)
    hmac_key = utils.ReadFile(pathutils.CONFD_HMAC_KEY)
    ssconf_values = ssconf.SimpleStore().ReadAll()

    setup_data = {
        constants.NDS_CLUSTER_NAME: cluster_name,
        constants.NDS_NODE_DAEMON_CERTIFICATE: noded_cert,
        constants.NDS_HMAC: hmac_key,
        constants.NDS_SSCONF: ssconf_values,
        constants.NDS_START_NODE_DAEMON: True,
        constants.NDS_NODE_NAME: node,
    }

    # The same option drives both asking about and strictly checking the
    # remote host key.
    ssh.RunSshCmdWithStdin(
        cluster_name,
        node,
        pathutils.NODE_DAEMON_SETUP,
        ssh_port,
        setup_data,
        debug=opts.debug,
        verbose=opts.verbose,
        use_cluster_key=True,
        ask_key=opts.ssh_key_check,
        strict_host_check=opts.ssh_key_check,
        ensure_version=True)

    _WaitForSshDaemon(node, ssh_port)
    _WaitForNodeDaemon(node)
def SetupNodeDaemon(opts, cluster_name, node, ssh_port):
    """Prepare a new node so the master can talk to it via RPC.

    This has to run before the actual node-add opcode. The node daemon
    setup command is run for the remote node and receives the cluster name,
    the node daemon certificate, all ssconf values and a request to start
    the node daemon. Afterwards the node daemon is waited for.

    @param opts: command line options; the C{debug}, C{verbose} and
        C{ssh_key_check} attributes are read
    @param cluster_name: the cluster name
    @param node: the name of the new node
    @param ssh_port: the SSH port of the new node

    """
    noded_cert = utils.ReadFile(pathutils.NODED_CERT_FILE)
    ssconf_values = ssconf.SimpleStore().ReadAll()

    setup_data = {
        constants.NDS_CLUSTER_NAME: cluster_name,
        constants.NDS_NODE_DAEMON_CERTIFICATE: noded_cert,
        constants.NDS_SSCONF: ssconf_values,
        constants.NDS_START_NODE_DAEMON: True,
    }

    # All arguments are positional; the literal True and the two
    # ssh_key_check values are passed exactly as before.
    RunNodeSetupCmd(
        cluster_name,
        node,
        pathutils.NODE_DAEMON_SETUP,
        opts.debug,
        opts.verbose,
        True,
        opts.ssh_key_check,
        opts.ssh_key_check,
        ssh_port,
        setup_data)

    _WaitForNodeDaemon(node)
def __init__(self, server_addr, port, volume,
             _run_cmd=utils.RunCmd, _mount_point=None):
    """Creates a Gluster volume object.

    The server address is resolved to an IP and the port is validated
    while the object is being built.

    @type server_addr: str
    @param server_addr: The address to connect to
    @type port: int
    @param port: The port to connect to (Gluster standard is 24007)
    @type volume: str
    @param volume: The gluster volume to use for storage.
    @param _run_cmd: command runner, stored for later use
    @param _mount_point: if set, used as the mount point instead of the
        Gluster storage directory read from ssconf (used by tests)

    """
    self.server_addr = server_addr
    self._server_ip = netutils.Hostname.GetIP(self.server_addr)
    self._port = netutils.ValidatePortNumber(port)
    self._volume = volume

    # An explicit mount point (tests) wins; ssconf is only consulted when
    # none was given.
    self.mount_point = (_mount_point or
                        ssconf.SimpleStore().GetGlusterStorageDir())

    self._run_cmd = _run_cmd
def _Connect(sock, address, timeout, allow_non_master):
    """Connect a socket, translating socket errors into module errors.

    @param sock: socket object to connect
    @param address: address to connect to
    @param timeout: timeout set on the socket before connecting
    @param allow_non_master: if false, a missing or refusing endpoint is
        only retried after checking via ssconf that this node is the master
    @raise errors.TimeoutError: if the connect attempt timed out
    @raise errors.NoMasterError: if the endpoint is missing or refusing and
        this node is not the master (or the master cannot be determined)
    @raise errors.PermissionError: on C{EPERM} or C{EACCES}
    @raise utils.RetryAgain: if the attempt should be repeated

    """
    sock.settimeout(timeout)
    try:
        sock.connect(address)
    except socket.timeout as err:
        raise errors.TimeoutError("Connect timed out: %s" % str(err))
    except socket.error as err:
        code = err.args[0]

        if code in (errno.EPERM, errno.EACCES):
            raise errors.PermissionError(address)

        if code == errno.EAGAIN:
            # Server's socket backlog is full at the moment
            raise utils.RetryAgain()

        if code not in (errno.ENOENT, errno.ECONNREFUSED):
            # Anything else is passed on unchanged
            raise

        if not allow_non_master:
            # Verify if we're actually on the master node before trying
            # again.
            store = ssconf.SimpleStore()
            try:
                master, myself = ssconf.GetMasterAndMyself(ss=store)
            except ganeti.errors.ConfigurationError:
                raise errors.NoMasterError(address)
            if master != myself:
                raise errors.NoMasterError(address)

        raise utils.RetryAgain()
def GET(self):
    """Returns a list of tags.

    Tags of instances, node groups, nodes and networks are queried through
    the client and need a name; cluster tags are read from ssconf.

    Example: ["tag1", "tag2", "tag3"]

    """
    level = self.TAG_LEVEL
    named_levels = (
        constants.TAG_INSTANCE,
        constants.TAG_NODEGROUP,
        constants.TAG_NODE,
        constants.TAG_NETWORK,
    )

    if level in named_levels:
        if not self.name:
            raise http.HttpBadRequest("Missing name on tag request")

        client = self.GetClient()
        tags = list(client.QueryTags(level, self.name))
    elif level == constants.TAG_CLUSTER:
        assert not self.name
        # TODO: Use query API?
        tags = ssconf.SimpleStore().GetClusterTags()
    else:
        raise http.HttpBadRequest("Unhandled tag type!")

    return list(tags)
def GetClient():
    """Connects to the luxi query socket and returns a client.

    If no master is found, ssconf is consulted to give a more helpful
    error before the original exception is passed on.

    @raise errors.OpPrereqError: if the ssconf master entry cannot be read
        or this node is not the master

    """
    try:
        return luxi.Client(address=pathutils.QUERY_SOCKET)
    except NoMasterError:
        store = ssconf.SimpleStore()

        # Try to read ssconf file
        try:
            store.GetMasterNode()
        except errors.ConfigurationError:
            raise errors.OpPrereqError(
                "Cluster not initialized or this machine is"
                " not part of a cluster", errors.ECODE_INVAL)

        master, myself = ssconf.GetMasterAndMyself(ss=store)
        if master != myself:
            raise errors.OpPrereqError(
                "This is not the master node, please connect"
                " to node '%s' and rerun the command" % master,
                errors.ECODE_INVAL)

        # We are the master, so pass the original error on
        raise
def setUp(self):
    """Creates a temporary directory holding a fresh ssconf store.

    The store keeps its files in the C{files} subdirectory and uses
    C{lock} inside the temporary directory as its lock file.

    """
    tmpdir = tempfile.mkdtemp()
    self._tmpdir = tmpdir

    self.ssdir = utils.PathJoin(tmpdir, "files")
    lock_path = utils.PathJoin(tmpdir, "lock")

    os.mkdir(self.ssdir)

    self.sstore = ssconf.SimpleStore(cfg_location=self.ssdir,
                                     _lockfile=lock_path)
def GetRunningInstances():
    """Compute list of hypervisor/running instances.

    The hypervisor list and the hypervisor parameters are read from
    ssconf. A hypervisor that fails to list its instances is logged and
    skipped, so the result may be partial.

    @rtype: list of tuples
    @return: C{(instance_name, hypervisor_name)} pairs

    """
    store = ssconf.SimpleStore()
    hyp_list = store.GetHypervisorList()
    hvparams = store.GetHvparams()

    results = []
    for hv_name in hyp_list:
        try:
            hv = hypervisor.GetHypervisor(hv_name)
            ilist = hv.ListInstances(hvparams=hvparams)
            results.extend([(iname, hv_name) for iname in ilist])
        # Best effort: one broken hypervisor must not hide the others.
        # Catching Exception (not everything) lets KeyboardInterrupt and
        # SystemExit propagate.
        except Exception:  # pylint: disable=W0703
            logging.error(
                "Error while listing instances for hypervisor %s", hv_name,
                exc_info=True)

    return results
def __init__(self, cluster_name):
    """Initializes this class.

    Besides storing the cluster name, this records whether the primary IP
    family read from ssconf is IPv6.

    @type cluster_name: str
    @param cluster_name: name of the cluster

    """
    self.cluster_name = cluster_name

    primary_family = ssconf.SimpleStore().GetPrimaryIPFamily()
    self.ipv6 = (netutils.IP6Address.family == primary_family)
def Main():
    """Main routine.

    Stops all daemons and removes the cluster's key, configuration,
    certificate and ssconf files from this node. Nothing is touched unless
    the C{yes_do_it} option is set.

    @return: C{constants.EXIT_FAILURE} if confirmation is missing, or the
        exit code computed by C{cli.FormatError} if an error occurred

    """
    # NOTE(review): the success path has no explicit return and therefore
    # yields None; confirm whether callers expect an explicit exit code.
    opts = ParseOptions()

    utils.SetupToolLogging(opts.debug, opts.verbose,
                           toolname=os.path.splitext(
                               os.path.basename(__file__))[0])

    try:
        # List of files to delete. Contains tuples consisting of the
        # absolute path and a boolean denoting whether a backup copy should
        # be created before deleting.
        clean_files = [
            (pathutils.CONFD_HMAC_KEY, True),
            (pathutils.CLUSTER_CONF_FILE, True),
            (pathutils.CLUSTER_DOMAIN_SECRET_FILE, True),
        ]
        clean_files.extend((s, True) for s in pathutils.ALL_CERT_FILES)
        clean_files.extend(
            (s, False) for s in ssconf.SimpleStore().GetFileList())

        if not opts.yes_do_it:
            cli.ToStderr(
                "Cleaning a node is irreversible. If you really want to"
                " clean this node, supply the --yes-do-it option.")
            return constants.EXIT_FAILURE

        logging.info("Stopping daemons")
        result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-all"],
                              interactive=True)
        if result.failed:
            raise Exception(
                "Could not stop daemons, command '%s' failed: %s" %
                (result.cmd, result.fail_reason))

        for (filename, backup) in clean_files:
            if os.path.exists(filename):
                if opts.backup and backup:
                    logging.info("Backing up %s", filename)
                    utils.CreateBackup(filename)

                logging.info("Removing %s", filename)
                utils.RemoveFile(filename)

        logging.info("Node successfully cleaned")
    # Top-level boundary: any failure is turned into an exit code.
    except Exception as err:  # pylint: disable=W0703
        logging.debug("Caught unhandled exception", exc_info=True)

        (retcode, message) = cli.FormatError(err)
        logging.error(message)

        return retcode
def ShouldRun():
    """Checks whether node maintenance should run.

    @return: the "maintain node health" value read from ssconf, or C{False}
        if reading it fails with a configuration error

    """
    store = ssconf.SimpleStore()
    try:
        enabled = store.GetMaintainNodeHealth()
    except errors.ConfigurationError as err:
        logging.error(
            "Configuration error, not activating node maintenance: %s", err)
        enabled = False
    return enabled
def _LoadKnownGroups():
    """Returns a list of all node groups known by L{ssconf}.

    Only the first whitespace-separated field of every non-blank line of
    the node group list is kept.

    @raise errors.GenericError: if any kept field does not match the UUID
        pattern

    """
    uuids = []
    for entry in ssconf.SimpleStore().GetNodegroupList():
        if not entry.strip():
            # Skip blank lines
            continue
        uuids.append(entry.split(None, 1)[0])

    if not compat.all(utils.UUID_RE.match(uuid) for uuid in uuids):
        raise errors.GenericError("Ssconf contains invalid group UUID")

    return uuids
def MajorityHealthy(ignore_offline_nodes=False):
    """Check if the majority of nodes is healthy

    Gather master votes from all nodes known to this node; the majority is
    healthy if strictly more than half of the nodes voted for some master.
    Note that this will not guarantee any node to win an election but it
    ensures that a standard master-failover is still possible.

    @param ignore_offline_nodes: if true, only the online node list from
        ssconf is considered instead of the full node list
    @return: tuple of (boolean, [str]); the first is if a majority of nodes
        are healthy, the second is a list of the node names that are not
        considered healthy. If no votes could be gathered, all node names
        are returned.

    """
    store = ssconf.SimpleStore()
    if ignore_offline_nodes:
        node_names = store.GetOnlineNodeList()
    else:
        node_names = store.GetNodeList()

    votes = _GatherMasterVotes(node_names)
    if not votes:
        logging.warning(('Voting list was None; cannot determine if a majority of '
                         'nodes are healthy'))
        return (False, node_names)

    node_count = len(node_names)

    # Entries whose first element is None do not count as votes
    total_votes = 0
    for (voted_for, count) in votes:
        if voted_for is not None:
            total_votes += count

    # The list of nodes that did not vote is calculated to provide useful
    # debugging information to the client.
    # NOTE(review): this collects the first element of every vote entry; if
    # that element is the node voted *for* rather than the voter, the list
    # below holds nodes nobody named as master - confirm against
    # _GatherMasterVotes.
    named_nodes = [entry[0] for entry in votes]
    nonvoting_nodes = [name for name in node_names
                       if name not in named_nodes]

    logging.info("Total %d nodes, %d votes: %s", node_count, total_votes,
                 votes)

    return (2 * total_votes > node_count, nonvoting_nodes)
def GetMaster():
    """Returns the current master node.

    This is a separate function in bootstrap since it's needed by
    gnt-cluster, and instead of importing directly ssconf, it's better to
    abstract it in bootstrap, where we do use ssconf in other functions
    too.

    @return: the first element of the (master, myself) pair from ssconf

    """
    master_and_myself = ssconf.GetMasterAndMyself(ssconf.SimpleStore())
    (master, _) = master_and_myself
    return master
def GetConfdClient(callback):
    """Return a client configured using the given callback.

    This is handy to abstract the MC list and HMAC key reading: the master
    candidate IPs come from the corresponding ssconf file (one per line)
    and the key from the confd HMAC key file.

    @attention: This should only be called on nodes which are part of a
        cluster, since it depends on a valid (ganeti) data directory;
        for code running outside of a cluster, you need to create the
        client manually

    @param callback: passed through to the client constructor

    """
    store = ssconf.SimpleStore()
    candidates_path = store.KeyToFilename(constants.SS_MASTER_CANDIDATES_IPS)
    candidate_ips = utils.ReadFile(candidates_path).splitlines()

    key = utils.ReadFile(pathutils.CONFD_HMAC_KEY)

    return ConfdClient(key, candidate_ips, callback)
def SSLVerifyPeer(conn, cert, errnum, errdepth, ok):
    """Callback function to verify a peer against the candidate cert map.

    Note that we have a chicken-and-egg problem during cluster init and
    upgrade. This method checks whether the incoming connection comes from
    a master candidate by comparing it to the master certificate map in the
    cluster configuration. However, during cluster init and cluster upgrade
    there are various RPC calls done to the master node itself, before the
    candidate certificate list is established and the cluster configuration
    is written. In this case, we cannot check against the master candidate
    map.

    This problem is solved by checking whether the candidate map is empty.
    An initialized 2.11 or higher cluster has at least one entry for the
    master node in the candidate map. If the map is empty, we know that we
    are still in the bootstrap/upgrade phase. In this case, we read the
    server certificate digest and compare it to the incoming request.

    This means that after an upgrade of Ganeti, the system continues to
    operate like before, using server certificates only. After the client
    certificates are generated with ``gnt-cluster renew-crypto
    --new-node-certificates``, RPC communication is switched to using
    client certificates and the trick of using server certificates does not
    work anymore.

    @type conn: C{OpenSSL.SSL.Connection}
    @param conn: the OpenSSL connection object
    @type cert: C{OpenSSL.X509}
    @param cert: the peer's SSL certificate
    @return: whether the SHA1 digest of the peer certificate is among the
        accepted digests

    """
    # some parameters are unused, but this is the API
    # pylint: disable=W0613
    _BOOTSTRAP = "bootstrap"

    store = ssconf.SimpleStore()
    try:
        cert_map = store.GetMasterCandidatesCertMap()
    except errors.ConfigurationError:
        logging.info("No candidate certificates found. Switching to "
                     "bootstrap/update mode.")
        cert_map = None

    if not cert_map:
        # Missing or empty map: accept only the node daemon's own
        # server certificate.
        own_digest = utils.GetCertificateDigest(
            cert_filename=pathutils.NODED_CERT_FILE)
        cert_map = {_BOOTSTRAP: own_digest}

    peer_digest = cert.digest("sha1")
    return peer_digest in cert_map.values()
def GetClient(query=True):
    """Connects to a luxi socket and returns a client.

    The socket can be overridden through the environment variable named by
    C{constants.LUXI_OVERRIDE}: its value is either one of the two symbolic
    names for the master and query sockets, or is used as the socket
    address itself.

    @type query: boolean
    @param query: this signifies that the client will only be used for
        queries; if the build-time parameter enable-split-queries is
        enabled, then the client will be connected to the query socket
        instead of the masterd socket
    @raise errors.OpPrereqError: if no master is found and either the
        ssconf master entry cannot be read or this node is not the master

    """
    override = os.getenv(constants.LUXI_OVERRIDE, "")

    if not override:
        if query:
            address = pathutils.QUERY_SOCKET
        else:
            address = None
    elif override == constants.LUXI_OVERRIDE_MASTER:
        address = pathutils.MASTER_SOCKET
    elif override == constants.LUXI_OVERRIDE_QUERY:
        address = pathutils.QUERY_SOCKET
    else:
        address = override

    # TODO: Cache object?
    try:
        return luxi.Client(address=address)
    except NoMasterError:
        store = ssconf.SimpleStore()

        # Try to read ssconf file
        try:
            store.GetMasterNode()
        except errors.ConfigurationError:
            raise errors.OpPrereqError(
                "Cluster not initialized or this machine is"
                " not part of a cluster", errors.ECODE_INVAL)

        master, myself = ssconf.GetMasterAndMyself(ss=store)
        if master != myself:
            raise errors.OpPrereqError(
                "This is not the master node, please connect"
                " to node '%s' and rerun the command" % master,
                errors.ECODE_INVAL)

        # We are the master, so pass the original error on
        raise
def MajorityHealthy():
    """Check if the majority of nodes is healthy

    Gather master votes from all nodes known to this node; return True if
    strictly more than half of the nodes voted for some master. Note that
    this will not guarantee any node to win an election but it ensures that
    a standard master-failover is still possible.

    @rtype: boolean
    @return: C{False} if no vote list could be obtained, otherwise whether
        twice the number of votes exceeds the number of nodes

    """
    node_names = ssconf.SimpleStore().GetNodeList()

    votes = GatherMasterVotes(node_names)
    if votes is None:
        return False

    node_count = len(node_names)

    # Entries whose first element is None do not count as votes
    total_votes = 0
    for (voted_for, count) in votes:
        if voted_for is not None:
            total_votes += count

    logging.info("Total %d nodes, %d votes: %s", node_count, total_votes,
                 votes)

    return 2 * total_votes > node_count
def _WaitForSshDaemon(hostname, port):
    """Wait for SSH daemon to become responsive.

    The host name is resolved once, using the primary IP family from
    ssconf, and the port is then pinged until it answers or
    C{_DAEMON_READY_TIMEOUT} expires.

    @param hostname: name of the host running the SSH daemon
    @param port: TCP port of the SSH daemon
    @raise errors.OpExecError: if the daemon did not answer in time

    """
    ip_family = ssconf.SimpleStore().GetPrimaryIPFamily()
    hostip = netutils.GetHostname(name=hostname, family=ip_family).ip

    def _CheckSshDaemon():
        """Pings the port once, asking for a retry if nothing answers."""
        alive = netutils.TcpPing(hostip, port, timeout=1.0,
                                 live_port_needed=True)
        if not alive:
            raise utils.RetryAgain()

        logging.debug("SSH daemon on %s:%s (IP address %s) has become"
                      " responsive", hostname, port, hostip)

    try:
        utils.Retry(_CheckSshDaemon, 1.0, _DAEMON_READY_TIMEOUT)
    except utils.RetryTimeout:
        raise errors.OpExecError("SSH daemon on %s:%s (IP address %s) didn't"
                                 " become responsive within %s seconds" %
                                 (hostname, port, hostip,
                                  _DAEMON_READY_TIMEOUT))
def ComputeAncillaryFiles(cluster, redist):
    """Compute files external to Ganeti which need to be consistent.

    @param cluster: cluster object; its C{modify_etc_hosts},
        C{use_external_mip_script} and C{enabled_hypervisors} attributes
        and its file storage checks are used
    @type redist: boolean
    @param redist: Whether to include files which need to be redistributed
    @rtype: tuple of sets
    @return: C{(files_all, files_opt, files_mc, files_vm)}: files for all
        nodes, optional files, files for master candidates only and files
        for VM-capable nodes only

    """
    # Compute files for all nodes
    files_all = set([
        pathutils.SSH_KNOWN_HOSTS_FILE,
        pathutils.CONFD_HMAC_KEY,
        pathutils.CLUSTER_DOMAIN_SECRET_FILE,
        pathutils.SPICE_CERT_FILE,
        pathutils.SPICE_CACERT_FILE,
        pathutils.RAPI_USERS_FILE,
    ])

    if not redist:
        files_all.update(pathutils.ALL_CERT_FILES)
        files_all.update(ssconf.SimpleStore().GetFileList())
    else:
        # we need to ship at least the RAPI certificate
        files_all.add(pathutils.RAPI_CERT_FILE)

    if cluster.modify_etc_hosts:
        files_all.add(pathutils.ETC_HOSTS)

    if cluster.use_external_mip_script:
        files_all.add(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT)

    # Files which are optional, these must:
    # - be present in one other category as well
    # - either exist or not exist on all nodes of that category (mc, vm all)
    files_opt = set([
        pathutils.RAPI_USERS_FILE,
    ])

    # Files which should only be on master candidates
    files_mc = set()

    if not redist:
        files_mc.add(pathutils.CLUSTER_CONF_FILE)

    # File storage
    uses_file_storage = (not redist and
                         (cluster.IsFileStorageEnabled() or
                          cluster.IsSharedFileStorageEnabled()))
    if uses_file_storage:
        files_all.add(pathutils.FILE_STORAGE_PATHS_FILE)
        files_opt.add(pathutils.FILE_STORAGE_PATHS_FILE)

    # Files which should only be on VM-capable nodes
    files_vm = set()
    for hv_name in cluster.enabled_hypervisors:
        hv_class = hypervisor.GetHypervisorClass(hv_name)
        files_vm.update(hv_class.GetAncillaryFiles()[0])

    # The second element of each hypervisor's answer lists optional files
    for hv_name in cluster.enabled_hypervisors:
        hv_class = hypervisor.GetHypervisorClass(hv_name)
        files_opt.update(hv_class.GetAncillaryFiles()[1])

    # Filenames in each category must be unique
    all_files_set = files_all | files_mc | files_vm
    category_total = len(files_all) + len(files_mc) + len(files_vm)
    assert len(all_files_set) == category_total, \
        "Found file listed in more than one file list"

    # Optional files must be present in one other category
    assert all_files_set.issuperset(files_opt), \
        "Optional file not in a different required list"

    # This one file should never ever be re-distributed via RPC
    assert not (redist and
                pathutils.FILE_STORAGE_PATHS_FILE in all_files_set)

    return (files_all, files_opt, files_mc, files_vm)
def MasterFailover(no_voting=False):
    """Failover the master node.

    This checks that we are not already the master, and will cause the
    current master to cease being master, and the non-master to become new
    master.

    @type no_voting: boolean
    @param no_voting: force the operation without remote nodes agreement
                      (dangerous)

    @returns: the pair of an exit code and warnings to display
    @raise errors.OpPrereqError: if this node is already the master, is not
        a master candidate, the vote disagrees with the local view, the
        configuration daemon cannot be started, or a master node is missing
        from the configuration

    """
    # NOTE(review): only the configuration-error path returns the
    # documented pair; the success path falls off the end and returns
    # None, and "rcode" is never read. Confirm whether code is missing
    # after the try block.
    sstore = ssconf.SimpleStore()

    old_master, new_master = ssconf.GetMasterAndMyself(sstore)
    node_names = sstore.GetNodeList()
    mc_list = sstore.GetMasterCandidates()

    if old_master == new_master:
        raise errors.OpPrereqError(
            "This commands must be run on the node"
            " where you want the new master to be."
            " %s is already the master" % old_master, errors.ECODE_INVAL)

    if new_master not in mc_list:
        mc_no_master = [name for name in mc_list if name != old_master]
        raise errors.OpPrereqError(
            "This node is not among the nodes marked"
            " as master candidates. Only these nodes"
            " can become masters. Current list of"
            " master candidates is:\n"
            "%s" % ("\n".join(mc_no_master)), errors.ECODE_STATE)

    if not no_voting:
        vote_list = GatherMasterVotes(node_names)

        if vote_list:
            # The first entry's first element is taken as the vote result
            voted_master = vote_list[0][0]
            if voted_master is None:
                raise errors.OpPrereqError(
                    "Cluster is inconsistent, most nodes did"
                    " not respond.", errors.ECODE_ENVIRON)
            elif voted_master != old_master:
                raise errors.OpPrereqError(
                    "I have a wrong configuration, I believe"
                    " the master is %s but the other nodes"
                    " voted %s. Please resync the configuration"
                    " of this node." % (old_master, voted_master),
                    errors.ECODE_STATE)
    # end checks

    rcode = 0
    warnings = []

    logging.info("Setting master to %s, old master: %s", new_master,
                 old_master)

    try:
        # Forcefully start WConfd so that we can access the configuration
        result = utils.RunCmd([
            pathutils.DAEMON_UTIL, "start", constants.WCONFD,
            "--force-node", "--no-voting", "--yes-do-it"
        ])
        if result.failed:
            raise errors.OpPrereqError(
                "Could not start the configuration daemon,"
                " command %s had exitcode %s and error %s" %
                (result.cmd, result.exit_code, result.output),
                errors.ECODE_NOENT)

        # instantiate a real config writer, as we now know we have the
        # configuration data
        livelock = utils.livelock.LiveLock("bootstrap_failover")
        cfg = config.GetConfig(None, livelock, accept_foreign=True)

        old_master_node = cfg.GetNodeInfoByName(old_master)
        if old_master_node is None:
            raise errors.OpPrereqError(
                "Could not find old master node '%s' in"
                " cluster configuration." % old_master, errors.ECODE_NOENT)

        cluster_info = cfg.GetClusterInfo()
        new_master_node = cfg.GetNodeInfoByName(new_master)
        if new_master_node is None:
            raise errors.OpPrereqError(
                "Could not find new master node '%s' in"
                " cluster configuration." % new_master, errors.ECODE_NOENT)

        cluster_info.master_node = new_master_node.uuid
        # this will also regenerate the ssconf files, since we updated the
        # cluster info
        cfg.Update(cluster_info, logging.error)

        # if cfg.Update worked, then it means the old master daemon won't
        # be able now to write its own config file (we rely on locking in
        # both backend.UploadFile() and ConfigWriter._Write(); hence the
        # next step is to kill the old master

        logging.info("Stopping the master daemon on node %s", old_master)

        runner = rpc.BootstrapRunner()
        master_params = cfg.GetMasterNetworkParameters()
        master_params.uuid = old_master_node.uuid
        ems = cfg.GetUseExternalMipScript()
        result = runner.call_node_deactivate_master_ip(old_master,
                                                       master_params, ems)

        # Failures on the old master are collected as warnings only
        msg = result.fail_msg
        if msg:
            warning = "Could not disable the master IP: %s" % (msg, )
            logging.warning("%s", warning)
            warnings.append(warning)

        result = runner.call_node_stop_master(old_master)
        msg = result.fail_msg
        if msg:
            warning = ("Could not disable the master role on the old master"
                       " %s, please disable manually: %s" %
                       (old_master, msg))
            logging.error("%s", warning)
            warnings.append(warning)
    except errors.ConfigurationError as err:
        logging.error("Error while trying to set the new master: %s",
                      str(err))
        return 1, warnings
def GetPaths():
    """Returns a tuple of path objects to process.

    Every entry starts with C{(path, type, mode, uid, gid)}. C{FILE}
    entries carry one more boolean element.
    NOTE(review): the meaning of that trailing boolean is defined by the
    consumer of this list and is not visible here.

    Permission modes use the C{0o} octal notation, which both Python 2.6+
    and Python 3 accept.

    @rtype: list of tuples
    @return: the path entries, parent directories before their files

    """
    getent = runtime.GetEnts()
    masterd_log = constants.DAEMONS_LOGFILES[constants.MASTERD]
    noded_log = constants.DAEMONS_LOGFILES[constants.NODED]
    confd_log = constants.DAEMONS_LOGFILES[constants.CONFD]
    luxid_log = constants.DAEMONS_LOGFILES[constants.LUXID]
    rapi_log = constants.DAEMONS_LOGFILES[constants.RAPI]
    mond_log = constants.DAEMONS_LOGFILES[constants.MOND]

    rapi_dir = os.path.join(pathutils.DATA_DIR, "rapi")
    cleaner_log_dir = os.path.join(pathutils.LOG_DIR, "cleaner")
    master_cleaner_log_dir = os.path.join(pathutils.LOG_DIR,
                                          "master-cleaner")

    # A note on the ordering: The parent directory (type C{DIR}) must
    # always be listed before files (type C{FILE}) in that directory. Once
    # the directory is set, only files directly in that directory can be
    # listed.
    paths = [
        (pathutils.DATA_DIR, DIR, 0o755,
         getent.masterd_uid, getent.masterd_gid),
        (pathutils.CLUSTER_DOMAIN_SECRET_FILE, FILE, 0o640,
         getent.masterd_uid, getent.masterd_gid, False),
        (pathutils.CLUSTER_CONF_FILE, FILE, 0o640,
         getent.masterd_uid, getent.confd_gid, False),
        (pathutils.CONFD_HMAC_KEY, FILE, 0o440,
         getent.confd_uid, getent.masterd_gid, False),
        (pathutils.SSH_KNOWN_HOSTS_FILE, FILE, 0o644,
         getent.masterd_uid, getent.masterd_gid, False),
        (pathutils.RAPI_CERT_FILE, FILE, 0o440,
         getent.rapi_uid, getent.masterd_gid, False),
        (pathutils.SPICE_CERT_FILE, FILE, 0o440,
         getent.noded_uid, getent.masterd_gid, False),
        (pathutils.SPICE_CACERT_FILE, FILE, 0o440,
         getent.noded_uid, getent.masterd_gid, False),
        (pathutils.NODED_CERT_FILE, FILE, pathutils.NODED_CERT_MODE,
         getent.masterd_uid, getent.masterd_gid, False),
        (pathutils.NODED_CLIENT_CERT_FILE, FILE, pathutils.NODED_CERT_MODE,
         getent.masterd_uid, getent.masterd_gid, False),
        (pathutils.WATCHER_PAUSEFILE, FILE, 0o644,
         getent.masterd_uid, getent.masterd_gid, False),
    ]

    # Every ssconf file gets the same mode and owner
    ss = ssconf.SimpleStore()
    for ss_path in ss.GetFileList():
        paths.append((ss_path, FILE, constants.SS_FILE_PERMS,
                      getent.noded_uid, getent.noded_gid, False))

    paths.extend([
        (pathutils.QUEUE_DIR, DIR, 0o750,
         getent.masterd_uid, getent.daemons_gid),
        (pathutils.QUEUE_DIR, QUEUE_DIR, constants.JOB_QUEUE_FILES_PERMS,
         getent.masterd_uid, getent.daemons_gid),
        (pathutils.JOB_QUEUE_DRAIN_FILE, FILE, 0o644,
         getent.masterd_uid, getent.daemons_gid, False),
        (pathutils.JOB_QUEUE_LOCK_FILE, FILE,
         constants.JOB_QUEUE_FILES_PERMS,
         getent.masterd_uid, getent.daemons_gid, False),
        (pathutils.JOB_QUEUE_SERIAL_FILE, FILE,
         constants.JOB_QUEUE_FILES_PERMS,
         getent.masterd_uid, getent.daemons_gid, False),
        (pathutils.JOB_QUEUE_VERSION_FILE, FILE,
         constants.JOB_QUEUE_FILES_PERMS,
         getent.masterd_uid, getent.daemons_gid, False),
        (pathutils.JOB_QUEUE_ARCHIVE_DIR, DIR, 0o750,
         getent.masterd_uid, getent.daemons_gid),
        (rapi_dir, DIR, 0o750, getent.rapi_uid, getent.masterd_gid),
        (pathutils.RAPI_USERS_FILE, FILE, 0o640,
         getent.rapi_uid, getent.masterd_gid, False),
        (pathutils.RUN_DIR, DIR, 0o775,
         getent.masterd_uid, getent.daemons_gid),
        (pathutils.SOCKET_DIR, DIR, 0o770,
         getent.masterd_uid, getent.daemons_gid),
        (pathutils.MASTER_SOCKET, FILE, 0o660,
         getent.masterd_uid, getent.daemons_gid, False),
        (pathutils.QUERY_SOCKET, FILE, 0o660,
         getent.luxid_uid, getent.daemons_gid, False),
        (pathutils.BDEV_CACHE_DIR, DIR, 0o755,
         getent.noded_uid, getent.masterd_gid),
        (pathutils.UIDPOOL_LOCKDIR, DIR, 0o750,
         getent.noded_uid, getent.masterd_gid),
        (pathutils.DISK_LINKS_DIR, DIR, 0o755,
         getent.noded_uid, getent.masterd_gid),
        (pathutils.CRYPTO_KEYS_DIR, DIR, 0o700,
         getent.noded_uid, getent.masterd_gid),
        (pathutils.IMPORT_EXPORT_DIR, DIR, 0o755,
         getent.noded_uid, getent.masterd_gid),
        (pathutils.LOG_DIR, DIR, 0o770,
         getent.masterd_uid, getent.daemons_gid),
        (masterd_log, FILE, 0o600,
         getent.masterd_uid, getent.masterd_gid, False),
        (confd_log, FILE, 0o600,
         getent.confd_uid, getent.masterd_gid, False),
        (luxid_log, FILE, 0o600,
         getent.luxid_uid, getent.masterd_gid, False),
        (noded_log, FILE, 0o600,
         getent.noded_uid, getent.masterd_gid, False),
        (rapi_log, FILE, 0o600,
         getent.rapi_uid, getent.masterd_gid, False),
        (mond_log, FILE, 0o600,
         getent.mond_uid, getent.masterd_gid, False),
        (pathutils.LOG_OS_DIR, DIR, 0o750,
         getent.noded_uid, getent.daemons_gid),
        (pathutils.LOG_XEN_DIR, DIR, 0o750,
         getent.noded_uid, getent.daemons_gid),
        (cleaner_log_dir, DIR, 0o750,
         getent.noded_uid, getent.noded_gid),
        (master_cleaner_log_dir, DIR, 0o750,
         getent.masterd_uid, getent.masterd_gid),
        (pathutils.INSTANCE_REASON_DIR, DIR, 0o755,
         getent.noded_uid, getent.noded_gid),
        (pathutils.LIVELOCK_DIR, DIR, 0o750,
         getent.masterd_uid, getent.daemons_gid),
        (pathutils.LUXID_MESSAGE_DIR, DIR, 0o750,
         getent.masterd_uid, getent.daemons_gid),
    ])

    return paths
def MasterFailover(no_voting=False):
    """Failover the master node.

    This checks that we are not already the master, and will cause the
    current master to cease being master, and the non-master to become new
    master.

    The steps are, in order: sanity checks against ssconf and (unless
    disabled) the master vote; starting WConfd and updating the master in
    the configuration; deactivating the master IP and stopping the master
    role on the old master; stopping WConfd again; waiting for the master
    IP to disappear; undraining the job queue if it is drained; starting
    the master daemons on this node; and finally waiting for the master IP
    to come up.

    Note: The call to MasterFailover from lib/client/gnt_cluster.py checks
    that a majority of nodes are healthy and responding before calling
    this. If this function is called from somewhere else, the caller should
    also verify that a majority of nodes are healthy.

    @type no_voting: boolean
    @param no_voting: force the operation without remote nodes agreement
                      (dangerous)

    @returns: the pair of an exit code and warnings to display
    @raise errors.OpPrereqError: if this node is already the master, is not
        a master candidate, the vote disagrees with the local view, the
        configuration daemon cannot be started, or a master node is missing
        from the configuration

    """
    sstore = ssconf.SimpleStore()

    # "new_master" is this node: the failover must run on the node that
    # should take over.
    old_master, new_master = ssconf.GetMasterAndMyself(sstore)
    node_names = sstore.GetNodeList()
    mc_list = sstore.GetMasterCandidates()

    if old_master == new_master:
        raise errors.OpPrereqError(
            "This commands must be run on the node"
            " where you want the new master to be."
            " %s is already the master" % old_master, errors.ECODE_INVAL)

    if new_master not in mc_list:
        # The current master is left out of the list shown to the user
        mc_no_master = [name for name in mc_list if name != old_master]
        raise errors.OpPrereqError(
            "This node is not among the nodes marked"
            " as master candidates. Only these nodes"
            " can become masters. Current list of"
            " master candidates is:\n"
            "%s" % ("\n".join(mc_no_master)), errors.ECODE_STATE)

    if not no_voting:
        vote_list = _GatherMasterVotes(node_names)

        if vote_list:
            # The first entry's first element is taken as the vote result
            voted_master = vote_list[0][0]
            if voted_master != old_master:
                raise errors.OpPrereqError(
                    "I have a wrong configuration, I believe"
                    " the master is %s but the other nodes"
                    " voted %s. Please resync the configuration"
                    " of this node." % (old_master, voted_master), errors.ECODE_STATE)
    # end checks

    # From here on problems are mostly reported through "rcode" and
    # "warnings" instead of exceptions.
    rcode = 0
    warnings = []

    logging.info("Setting master to %s, old master: %s", new_master, old_master)

    try:
        # Forcefully start WConfd so that we can access the configuration
        result = utils.RunCmd([
            pathutils.DAEMON_UTIL, "start", constants.WCONFD, "--force-node", "--no-voting", "--yes-do-it"
        ])
        if result.failed:
            raise errors.OpPrereqError(
                "Could not start the configuration daemon,"
                " command %s had exitcode %s and error %s" % (result.cmd, result.exit_code, result.output), errors.ECODE_NOENT)

        # instantiate a real config writer, as we now know we have the
        # configuration data
        livelock = utils.livelock.LiveLock("bootstrap_failover")
        cfg = config.GetConfig(None, livelock, accept_foreign=True)

        old_master_node = cfg.GetNodeInfoByName(old_master)
        if old_master_node is None:
            raise errors.OpPrereqError(
                "Could not find old master node '%s' in"
                " cluster configuration." % old_master, errors.ECODE_NOENT)

        cluster_info = cfg.GetClusterInfo()
        new_master_node = cfg.GetNodeInfoByName(new_master)
        if new_master_node is None:
            raise errors.OpPrereqError(
                "Could not find new master node '%s' in"
                " cluster configuration." % new_master, errors.ECODE_NOENT)

        cluster_info.master_node = new_master_node.uuid
        # this will also regenerate the ssconf files, since we updated the
        # cluster info
        cfg.Update(cluster_info, logging.error)

        # if cfg.Update worked, then it means the old master daemon won't be
        # able now to write its own config file (we rely on locking in both
        # backend.UploadFile() and ConfigWriter._Write(); hence the next
        # step is to kill the old master

        logging.info("Stopping the master daemon on node %s", old_master)

        runner = rpc.BootstrapRunner()
        master_params = cfg.GetMasterNetworkParameters()
        # The deactivation request is addressed to the old master
        master_params.uuid = old_master_node.uuid
        ems = cfg.GetUseExternalMipScript()
        result = runner.call_node_deactivate_master_ip(old_master, master_params, ems)

        # Failures on the old master are collected as warnings only; the
        # failover goes on.
        msg = result.fail_msg
        if msg:
            warning = "Could not disable the master IP: %s" % (msg, )
            logging.warning("%s", warning)
            warnings.append(warning)

        result = runner.call_node_stop_master(old_master)
        msg = result.fail_msg
        if msg:
            warning = ("Could not disable the master role on the old master"
                       " %s, please disable manually: %s" % (old_master, msg))
            logging.error("%s", warning)
            warnings.append(warning)
    except errors.ConfigurationError as err:
        logging.error("Error while trying to set the new master: %s", str(err))
        return 1, warnings
    finally:
        # stop WConfd again:
        # (this also runs when the try block raises or returns early)
        result = utils.RunCmd(
            [pathutils.DAEMON_UTIL, "stop", constants.WCONFD])
        if result.failed:
            # Logged only; this warning is not added to "warnings"
            warning = ("Could not stop the configuration daemon,"
                       " command %s had exitcode %s and error %s" % (result.cmd, result.exit_code, result.output))
            logging.error("%s", warning)
            rcode = 1

    logging.info("Checking master IP non-reachability...")

    master_ip = sstore.GetMasterIP()
    total_timeout = 30

    # Here we have a phase where no master should be running
    def _check_ip(expected):
        # Asks for a retry until the ping result equals "expected"
        if netutils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT) != expected:
            raise utils.RetryAgain()

    try:
        # Wait until the master IP no longer answers
        utils.Retry(_check_ip, (1, 1.5, 5), total_timeout, args=[False])
    except utils.RetryTimeout:
        warning = ("The master IP is still reachable after %s seconds,"
                   " continuing but activating the master IP on the current"
                   " node will probably fail" % total_timeout)
        logging.warning("%s", warning)
        warnings.append(warning)
        rcode = 1

    if jstore.CheckDrainFlag():
        logging.info("Undraining job queue")
        jstore.SetDrainFlag(False)

    logging.info("Starting the master daemons on the new master")

    result = rpc.BootstrapRunner().call_node_start_master_daemons(
        new_master, no_voting)
    msg = result.fail_msg
    if msg:
        logging.error(
            "Could not start the master role on the new master"
            " %s, please check: %s", new_master, msg)
        rcode = 1

    # Finally verify that the new master managed to set up the master IP
    # and warn if it didn't.
    try:
        utils.Retry(_check_ip, (1, 1.5, 5), total_timeout, args=[True])
    except utils.RetryTimeout:
        warning = ("The master IP did not come up within %s seconds; the"
                   " cluster should still be working and reachable via %s,"
                   " but not via the master IP address" % (total_timeout, new_master))
        logging.warning("%s", warning)
        warnings.append(warning)
        rcode = 1

    logging.info("Master failed over from %s to %s", old_master, new_master)
    return rcode, warnings
def MasterFailover(no_voting=False):
    """Failover the master node.

    This checks that we are not already the master, and will cause the
    current master to cease being master, and the non-master to become
    new master.

    The function must be run on the node that is to become the new master;
    it validates the request and then rewrites the cluster configuration so
    that it points at the local node.

    @type no_voting: boolean
    @param no_voting: force the operation without remote nodes agreement
                      (dangerous)
    @rtype: int
    @return: 0 if the configuration was updated, 1 if updating it raised
             a configuration error
    @raise errors.OpPrereqError: if this node already is the master, is not
        a master candidate, disagrees with the other nodes about who the
        master is, or if either node is missing from the configuration

    """
    sstore = ssconf.SimpleStore()

    old_master, new_master = ssconf.GetMasterAndMyself(sstore)
    node_names = sstore.GetNodeList()
    mc_list = sstore.GetMasterCandidates()

    if old_master == new_master:
        raise errors.OpPrereqError("This commands must be run on the node"
                                   " where you want the new master to be."
                                   " %s is already the master" %
                                   old_master, errors.ECODE_INVAL)

    if new_master not in mc_list:
        # Do not offer the current master as a failover target
        mc_no_master = [name for name in mc_list if name != old_master]
        raise errors.OpPrereqError("This node is not among the nodes marked"
                                   " as master candidates. Only these nodes"
                                   " can become masters. Current list of"
                                   " master candidates is:\n"
                                   "%s" % ("\n".join(mc_no_master)),
                                   errors.ECODE_STATE)

    if not no_voting:
        vote_list = GatherMasterVotes(node_names)

        if vote_list:
            # NOTE(review): presumably the first entry is the top-voted
            # master; confirm against GatherMasterVotes
            voted_master = vote_list[0][0]
            if voted_master is None:
                raise errors.OpPrereqError("Cluster is inconsistent, most"
                                           " nodes did not respond.",
                                           errors.ECODE_ENVIRON)
            elif voted_master != old_master:
                raise errors.OpPrereqError("I have a wrong configuration, I"
                                           " believe the master is %s but the"
                                           " other nodes voted %s. Please"
                                           " resync the configuration of this"
                                           " node." %
                                           (old_master, voted_master),
                                           errors.ECODE_STATE)
    # end checks

    rcode = 0

    logging.info("Setting master to %s, old master: %s",
                 new_master, old_master)

    try:
        # instantiate a real config writer, as we now know we have the
        # configuration data
        cfg = config.ConfigWriter(accept_foreign=True)

        old_master_node = cfg.GetNodeInfoByName(old_master)
        if old_master_node is None:
            raise errors.OpPrereqError("Could not find old master node '%s' in"
                                       " cluster configuration." % old_master,
                                       errors.ECODE_NOENT)

        cluster_info = cfg.GetClusterInfo()
        new_master_node = cfg.GetNodeInfoByName(new_master)
        if new_master_node is None:
            raise errors.OpPrereqError("Could not find new master node '%s' in"
                                       " cluster configuration." % new_master,
                                       errors.ECODE_NOENT)

        cluster_info.master_node = new_master_node.uuid
        # this will also regenerate the ssconf files, since we updated the
        # cluster info
        cfg.Update(cluster_info, logging.error)
    except errors.ConfigurationError as err:
        logging.error("Error while trying to set the new master: %s",
                      str(err))
        return 1

    # Report success explicitly instead of falling off the end with None
    return rcode
def SSLVerifyPeer(conn, cert, errnum, errdepth, ok):
    """Verification callback deciding whether a peer certificate is accepted.

    Certificates further up the chain (C{errdepth > 0}) are accepted only
    if their SHA1 digest equals the digest of the node daemon's server
    certificate.

    The peer's own certificate (C{errdepth == 0}) is accepted only if its
    SHA1 digest is among the master candidate certificate digests read from
    ssconf.

    There is a chicken-and-egg problem during cluster init and upgrade:
    RPC calls are made to the master node itself before the candidate
    certificate map has been established and the configuration written.
    An initialized 2.11 or higher cluster has at least one entry (for the
    master node) in the map, so an empty or unreadable map is taken to mean
    that we are still in the bootstrap/upgrade phase. In that case the
    server certificate digest is used as the only accepted digest. Once
    client certificates have been generated with
    ``gnt-cluster renew-crypto --new-node-certificates``, the map is no
    longer empty and the server-certificate fallback stops applying.

    @type conn: C{OpenSSL.SSL.Connection}
    @param conn: the OpenSSL connection object (unused)
    @type cert: C{OpenSSL.X509}
    @param cert: the peer's SSL certificate
    @param errnum: unused, part of the callback API
    @type errdepth: integer
    @param errdepth: number of the step in the certificate chain starting
                     at 0 for the actual client certificate.
    @param ok: unused, part of the callback API
    @rtype: boolean
    @return: whether the certificate is accepted; C{False} for an
             C{errdepth} that is neither positive nor zero

    """
    # some parameters are unused, but this is the API
    # pylint: disable=W0613

    # Anything above the lowest element of the chain has to be the server
    # certificate itself.
    if errdepth > 0:
        expected_digest = utils.GetCertificateDigest(
            cert_filename=pathutils.NODED_CERT_FILE)
        peer_digest = cert.digest("sha1")
        accepted = peer_digest == expected_digest
        if not accepted:
            logging.debug("Received certificate from the certificate chain,"
                          " which does not match the server certficate."
                          " Digest of the received certificate: %s. Digest of"
                          " the server certificate: %s.",
                          peer_digest, expected_digest)
        return accepted

    if errdepth != 0:
        logging.error("Invalid errdepth value: %s.", errdepth)
        return False

    # errdepth == 0: this is the client certificate proper
    store = ssconf.SimpleStore()
    try:
        known_certs = store.GetMasterCandidatesCertMap()
    except errors.ConfigurationError:
        logging.info("No candidate certificates found. Switching to "
                     "bootstrap/update mode.")
        known_certs = None

    if not known_certs:
        # Bootstrap/upgrade phase: only the server certificate is trusted
        bootstrap_digest = utils.GetCertificateDigest(
            cert_filename=pathutils.NODED_CERT_FILE)
        known_certs = {constants.CRYPTO_BOOTSTRAP: bootstrap_digest}

    peer_digest = cert.digest("sha1")
    accepted = peer_digest in known_certs.values()
    if not accepted:
        logging.debug("Received certificate which is not a certificate of a"
                      " master candidate. Certificate digest: %s. List of"
                      " master candidate certificate digests: %s.",
                      peer_digest, str(known_certs))
    return accepted
def GenericMain(daemon_name, optionparser,
                check_fn, prepare_fn, exec_fn,
                multithreaded=False, console_logging=False,
                default_ssl_cert=None, default_ssl_key=None,
                warn_breach=False):
    """Shared main function for daemons.

    Registers the common command line options, parses the command line,
    validates bind/SSL settings and the running user, optionally
    daemonizes, sets up logging and the PID file, and finally runs
    C{prepare_fn} and C{exec_fn}. The PID file is removed when C{exec_fn}
    returns or raises.

    @type daemon_name: string
    @param daemon_name: daemon name
    @type optionparser: optparse.OptionParser
    @param optionparser: initialized optionparser with daemon-specific
        options (common -f -d options will be handled by this module)
    @type check_fn: function which accepts (options, args)
    @param check_fn: function that checks start conditions and exits if
        they're not met
    @type prepare_fn: function which accepts (options, args)
    @param prepare_fn: function that is run before forking, or None; its
        result will be passed as the third parameter to exec_fn, or if None
        was passed in, we will just pass None to exec_fn
    @type exec_fn: function which accepts (options, args, prepare_results)
    @param exec_fn: function that's executed with the daemon's pid file
        held, and runs the daemon itself.
    @type multithreaded: bool
    @param multithreaded: Whether the daemon uses threads
    @type console_logging: boolean
    @param console_logging: if True, the daemon will fall back to the
        system console if logging fails
    @type default_ssl_cert: string
    @param default_ssl_cert: Default SSL certificate path
    @type default_ssl_key: string
    @param default_ssl_key: Default SSL key path
    @type warn_breach: bool
    @param warn_breach: issue a warning at daemon launch time, before
        daemonizing, about the possibility of breaking parameter privacy
        invariants through the otherwise helpful debug logging.

    """
    optionparser.add_option("-f", "--foreground", dest="fork",
                            help="Don't detach from the current terminal",
                            default=True, action="store_false")
    optionparser.add_option("-d", "--debug", dest="debug",
                            help="Enable some debug messages",
                            default=False, action="store_true")
    optionparser.add_option("--syslog", dest="syslog",
                            help="Enable logging to syslog (except debug"
                            " messages); one of 'no', 'yes' or 'only' [%s]" %
                            constants.SYSLOG_USAGE,
                            default=constants.SYSLOG_USAGE,
                            choices=["no", "yes", "only"])

    family = ssconf.SimpleStore().GetPrimaryIPFamily()
    # family will default to AF_INET if there is no ssconf file (e.g. when
    # upgrading a cluster from 2.2 -> 2.3. This is intended, as Ganeti
    # clusters <= 2.2 can not be AF_INET6

    if daemon_name in constants.DAEMONS_PORTS:
        default_bind_address = constants.IP4_ADDRESS_ANY
        if family == netutils.IP6Address.family:
            default_bind_address = constants.IP6_ADDRESS_ANY

        default_port = netutils.GetDaemonPort(daemon_name)

        # For networked daemons we allow choosing the port and bind address
        optionparser.add_option("-p", "--port", dest="port",
                                help="Network port (default: %s)" %
                                default_port,
                                default=default_port, type="int")
        optionparser.add_option("-b", "--bind", dest="bind_address",
                                help=("Bind address (default: '%s')" %
                                      default_bind_address),
                                default=default_bind_address,
                                metavar="ADDRESS")
        optionparser.add_option("-i", "--interface", dest="bind_interface",
                                help=("Bind interface"), metavar="INTERFACE")

    if default_ssl_key is not None and default_ssl_cert is not None:
        optionparser.add_option("--no-ssl", dest="ssl",
                                help="Do not secure HTTP protocol with SSL",
                                default=True, action="store_false")
        optionparser.add_option("-K", "--ssl-key", dest="ssl_key",
                                help=("SSL key path (default: %s)" %
                                      default_ssl_key),
                                default=default_ssl_key, type="string",
                                metavar="SSL_KEY_PATH")
        optionparser.add_option("-C", "--ssl-cert", dest="ssl_cert",
                                help=("SSL certificate path (default: %s)" %
                                      default_ssl_cert),
                                default=default_ssl_cert, type="string",
                                metavar="SSL_CERT_PATH")

    # Disable the use of fork(2) if the daemon uses threads
    if multithreaded:
        utils.DisableFork()

    options, args = optionparser.parse_args()

    # "bind_interface" only exists as an option for networked daemons, hence
    # the getattr() guard
    if getattr(options, "bind_interface", None) is not None:
        if options.bind_address != default_bind_address:
            msg = ("Can't specify both, bind address (%s) and bind interface"
                   " (%s)" % (options.bind_address, options.bind_interface))
            print(msg, file=sys.stderr)
            sys.exit(constants.EXIT_FAILURE)
        interface_ip_addresses = \
            netutils.GetInterfaceIpAddresses(options.bind_interface)
        if family == netutils.IP6Address.family:
            if_addresses = interface_ip_addresses[constants.IP6_VERSION]
        else:
            if_addresses = interface_ip_addresses[constants.IP4_VERSION]
        if not if_addresses:
            # The attribute is "bind_interface"; the previous misspelling
            # raised AttributeError instead of reporting the problem
            msg = ("Failed to find IP for interface %s" %
                   options.bind_interface)
            print(msg, file=sys.stderr)
            sys.exit(constants.EXIT_FAILURE)
        options.bind_address = if_addresses[0]

    if getattr(options, "ssl", False):
        ssl_paths = {
            "certificate": options.ssl_cert,
            "key": options.ssl_key,
        }

        for name, path in ssl_paths.items():
            if not os.path.isfile(path):
                print("SSL %s file '%s' was not found" % (name, path),
                      file=sys.stderr)
                sys.exit(constants.EXIT_FAILURE)

        # TODO: By initiating http.HttpSslParams here we would only read the
        # files once and have a proper validation (isfile returns False on
        # directories) at the same time.

    result, running_uid, expected_uid = _VerifyDaemonUser(daemon_name)
    if not result:
        msg = ("%s started using wrong user ID (%d), expected %d" %
               (daemon_name, running_uid, expected_uid))
        print(msg, file=sys.stderr)
        sys.exit(constants.EXIT_FAILURE)

    if check_fn is not None:
        check_fn(options, args)

    log_filename = constants.DAEMONS_LOGFILES[daemon_name]

    # node-daemon logging in lib/http/server.py, _HandleServerRequestInner
    if options.debug and warn_breach:
        sys.stderr.write(constants.DEBUG_MODE_CONFIDENTIALITY_WARNING %
                         daemon_name)

    if options.fork:
        # Newer GnuTLS versions (>= 3.3.0) use a library constructor for
        # initialization and open /dev/urandom on library load time, way
        # before we fork(). Closing /dev/urandom causes subsequent
        # ganeti.http.client requests to fail and the process to receive a
        # SIGABRT. As we cannot reliably detect GnuTLS's socket, we work our
        # way around this by keeping all fds referring to /dev/urandom open.
        noclose_fds = []
        for fd in os.listdir("/proc/self/fd"):
            try:
                fd_target = os.readlink(os.path.join("/proc/self/fd", fd))
                if fd_target == "/dev/urandom":
                    noclose_fds.append(int(fd))
            except EnvironmentError:
                # The fd might have disappeared (although it shouldn't as
                # we're running single-threaded).
                continue

        utils.CloseFDs(noclose_fds=noclose_fds)
        (wpipe, stdio_reopen_fn) = utils.Daemonize(logfile=log_filename)
    else:
        (wpipe, stdio_reopen_fn) = (None, None)

    log_reopen_fn = \
        utils.SetupLogging(log_filename, daemon_name,
                           debug=options.debug,
                           stderr_logging=not options.fork,
                           multithreaded=multithreaded,
                           syslog=options.syslog,
                           console_logging=console_logging)

    # Reopen log file(s) on SIGHUP
    signal.signal(signal.SIGHUP,
                  compat.partial(_HandleSigHup,
                                 [log_reopen_fn, stdio_reopen_fn]))

    try:
        utils.WritePidFile(utils.DaemonPidFileName(daemon_name))
    except errors.PidFileLockError as err:
        print("Error while locking PID file:\n%s" % err, file=sys.stderr)
        sys.exit(constants.EXIT_FAILURE)

    try:
        try:
            logging.info("%s daemon startup", daemon_name)
            if callable(prepare_fn):
                prep_results = prepare_fn(options, args)
            else:
                prep_results = None
        except Exception as err:
            # Report the preparation failure through the pipe before
            # re-raising
            utils.WriteErrorToFD(wpipe, _BeautifyError(err))
            raise

        if wpipe is not None:
            # we're done with the preparation phase, we close the pipe to
            # let the parent know it's safe to exit
            os.close(wpipe)

        exec_fn(options, args, prep_results)
    finally:
        utils.RemoveFile(utils.DaemonPidFileName(daemon_name))
def RunNodeSetupCmd(cluster_name, node, basecmd, debug, verbose,
                    use_cluster_key, ask_key, strict_host_check,
                    port, data):
    """Run a configuration script on a remote machine over SSH.

    Before the script itself, the remote side checks that the versioned
    library directory exists and re-points the C{ganeti/lib} and
    C{ganeti/share} symlinks below the sysconf directory at the current
    version. C{data} is serialized to JSON and fed to the remote command
    on standard input. Afterwards the function waits for the SSH daemon
    on the node.

    @type cluster_name: string
    @param cluster_name: Cluster name
    @type node: string
    @param node: Node name
    @type basecmd: string
    @param basecmd: Base command (path on the remote machine)
    @type debug: bool
    @param debug: Enable debug output
    @type verbose: bool
    @param verbose: Enable verbose output
    @type use_cluster_key: bool
    @param use_cluster_key: See L{ssh.SshRunner.BuildCmd}
    @type ask_key: bool
    @param ask_key: See L{ssh.SshRunner.BuildCmd}
    @type strict_host_check: bool
    @param strict_host_check: See L{ssh.SshRunner.BuildCmd}
    @type port: int
    @param port: The SSH port of the remote machine or None for the default
    @param data: JSON-serializable input data for script (passed to stdin)
    @raise errors.OpExecError: if the remote command fails

    """
    cmd = [basecmd]

    # Forward our own --debug/--verbose settings to the external script
    if debug:
        cmd.append("--debug")
    if verbose:
        cmd.append("--verbose")

    logging.debug("Node setup command: %s", cmd)

    version = constants.DIR_VERSION
    lib_target = os.path.join(pathutils.PKGLIBDIR, version)
    share_target = os.path.join(pathutils.SHAREDIR, version)
    lib_link = os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")
    share_link = os.path.join(pathutils.SYSCONFDIR, "ganeti/share")
    symlinks = ((lib_target, lib_link), (share_target, share_link))

    all_cmds = [["test", "-d", lib_target]]
    for (target, link) in symlinks:
        if constants.HAS_GNU_LN:
            # GNU ln can replace an existing link in a single step
            all_cmds.append(["ln", "-s", "-f", "-T", target, link])
        else:
            # Without "-T" the old link has to be removed first
            all_cmds.append(["rm", "-f", link])
            all_cmds.append(["ln", "-s", "-f", target, link])
    all_cmds.append(cmd)

    if port is None:
        port = netutils.GetDaemonPort(constants.SSH)

    family = ssconf.SimpleStore().GetPrimaryIPFamily()
    use_ipv6 = (family == netutils.IP6Address.family)
    runner = ssh.SshRunner(cluster_name, ipv6=use_ipv6)

    remote_shell_cmd = utils.ShellQuoteArgs(
        utils.ShellCombineCommands(all_cmds))
    scmd = runner.BuildCmd(node, constants.SSH_LOGIN_USER, remote_shell_cmd,
                           batch=False, ask_key=ask_key, quiet=False,
                           strict_host_check=strict_host_check,
                           use_cluster_key=use_cluster_key,
                           port=port)

    # The input data travels through an anonymous temporary file which is
    # handed to the SSH process as its standard input
    with tempfile.TemporaryFile() as stdin_file:
        stdin_file.write(serializer.DumpJson(data))
        stdin_file.seek(0)

        result = utils.RunCmd(scmd, interactive=True, input_fd=stdin_file)

    if result.failed:
        raise errors.OpExecError("Command '%s' failed: %s" %
                                 (result.cmd, result.fail_reason))

    _WaitForSshDaemon(node, port, family)
class Transport:
    """Low-level transport class.

    This is used on the client side.

    This could be replaced by any other class that provides the same
    semantics to the Client. This means:
      - can send messages and receive messages
      - safe for multithreading

    """

    def __init__(self, address, timeouts=None):
        """Constructor for the Transport class.

        Creates a UNIX stream socket and connects it to C{address},
        retrying until the connect timeout expires.

        Arguments:
          - address: a valid address for the used transport class
          - timeouts: a pair of timeouts (connect, read/write)

        There are two timeouts used since we might want to wait for a long
        time for a response, but the connect timeout should be lower.

        If not passed, the module defaults C{DEF_CTMO} and C{DEF_RWTO} are
        used.

        Note that on reading data, since the timeout applies to an
        individual receive, it might be that the total duration is longer
        than the timeout value passed (we make a hard limit at twice the
        read timeout).

        @raise errors.TimeoutError: if connecting did not succeed within
            the connect timeout

        """
        self.address = address
        if timeouts is None:
            self._ctimeout, self._rwtimeout = DEF_CTMO, DEF_RWTO
        else:
            self._ctimeout, self._rwtimeout = timeouts

        self.socket = None
        self._buffer = ""
        self._msgs = collections.deque()

        try:
            self.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)

            # Try to connect
            try:
                utils.Retry(self._Connect, 1.0, self._ctimeout,
                            args=(self.socket, address, self._ctimeout))
            except utils.RetryTimeout:
                raise errors.TimeoutError("Connect timed out")

            self.socket.settimeout(self._rwtimeout)
        except (socket.error, errors.NoMasterError):
            # Do not keep a half-initialized socket around
            if self.socket is not None:
                self.socket.close()
            self.socket = None
            raise

    @staticmethod
    def _Connect(sock, address, timeout):
        """Make a single connection attempt, used as a retry callback.

        @param sock: the socket to connect
        @param address: the address to connect to
        @param timeout: timeout set on the socket before connecting
        @raise errors.TimeoutError: if the connect itself timed out
        @raise errors.NoMasterError: if the socket is missing or refuses
            connections and this node is not the master (or the master
            cannot be determined)
        @raise errors.PermissionError: if access to the socket is denied
        @raise utils.RetryAgain: if the attempt should be repeated

        """
        sock.settimeout(timeout)
        try:
            sock.connect(address)
        except socket.timeout as err:
            raise errors.TimeoutError("Connect timed out: %s" % str(err))
        except socket.error as err:
            error_code = err.args[0]
            if error_code in (errno.ENOENT, errno.ECONNREFUSED):
                # Verify if we're actually on the master node before trying
                # again.
                ss = ssconf.SimpleStore()
                try:
                    master, myself = ssconf.GetMasterAndMyself(ss=ss)
                except ganeti.errors.ConfigurationError:
                    raise errors.NoMasterError(address)
                if master != myself:
                    raise errors.NoMasterError(address)
                raise utils.RetryAgain()
            elif error_code in (errno.EPERM, errno.EACCES):
                raise errors.PermissionError(address)
            elif error_code == errno.EAGAIN:
                # Server's socket backlog is full at the moment
                raise utils.RetryAgain()
            # Any other socket error is passed on unchanged
            raise
def GenericMain(daemon_name, optionparser, check_fn, prepare_fn, exec_fn, multithreaded=False, console_logging=False, default_ssl_cert=None, default_ssl_key=None): """Shared main function for daemons. @type daemon_name: string @param daemon_name: daemon name @type optionparser: optparse.OptionParser @param optionparser: initialized optionparser with daemon-specific options (common -f -d options will be handled by this module) @type check_fn: function which accepts (options, args) @param check_fn: function that checks start conditions and exits if they're not met @type prepare_fn: function which accepts (options, args) @param prepare_fn: function that is run before forking, or None; it's result will be passed as the third parameter to exec_fn, or if None was passed in, we will just pass None to exec_fn @type exec_fn: function which accepts (options, args, prepare_results) @param exec_fn: function that's executed with the daemon's pid file held, and runs the daemon itself. @type multithreaded: bool @param multithreaded: Whether the daemon uses threads @type console_logging: boolean @param console_logging: if True, the daemon will fall back to the system console if logging fails @type default_ssl_cert: string @param default_ssl_cert: Default SSL certificate path @type default_ssl_key: string @param default_ssl_key: Default SSL key path """ optionparser.add_option("-f", "--foreground", dest="fork", help="Don't detach from the current terminal", default=True, action="store_false") optionparser.add_option("-d", "--debug", dest="debug", help="Enable some debug messages", default=False, action="store_true") optionparser.add_option("--syslog", dest="syslog", help="Enable logging to syslog (except debug" " messages); one of 'no', 'yes' or 'only' [%s]" % constants.SYSLOG_USAGE, default=constants.SYSLOG_USAGE, choices=["no", "yes", "only"]) family = ssconf.SimpleStore().GetPrimaryIPFamily() # family will default to AF_INET if there is no ssconf file (e.g. 
when # upgrading a cluster from 2.2 -> 2.3. This is intended, as Ganeti clusters # <= 2.2 can not be AF_INET6 if daemon_name in constants.DAEMONS_PORTS: default_bind_address = constants.IP4_ADDRESS_ANY if family == netutils.IP6Address.family: default_bind_address = constants.IP6_ADDRESS_ANY default_port = netutils.GetDaemonPort(daemon_name) # For networked daemons we allow choosing the port and bind address optionparser.add_option("-p", "--port", dest="port", help="Network port (default: %s)" % default_port, default=default_port, type="int") optionparser.add_option("-b", "--bind", dest="bind_address", help=("Bind address (default: '%s')" % default_bind_address), default=default_bind_address, metavar="ADDRESS") optionparser.add_option("-i", "--interface", dest="bind_interface", help=("Bind interface"), metavar="INTERFACE") if default_ssl_key is not None and default_ssl_cert is not None: optionparser.add_option("--no-ssl", dest="ssl", help="Do not secure HTTP protocol with SSL", default=True, action="store_false") optionparser.add_option("-K", "--ssl-key", dest="ssl_key", help=("SSL key path (default: %s)" % default_ssl_key), default=default_ssl_key, type="string", metavar="SSL_KEY_PATH") optionparser.add_option("-C", "--ssl-cert", dest="ssl_cert", help=("SSL certificate path (default: %s)" % default_ssl_cert), default=default_ssl_cert, type="string", metavar="SSL_CERT_PATH") # Disable the use of fork(2) if the daemon uses threads if multithreaded: utils.DisableFork() options, args = optionparser.parse_args() if getattr(options, "bind_interface", None) is not None: if options.bind_address != default_bind_address: msg = ( "Can't specify both, bind address (%s) and bind interface (%s)" % (options.bind_address, options.bind_interface)) print >> sys.stderr, msg sys.exit(constants.EXIT_FAILURE) interface_ip_addresses = \ netutils.GetInterfaceIpAddresses(options.bind_interface) if family == netutils.IP6Address.family: if_addresses = 
interface_ip_addresses[constants.IP6_VERSION] else: if_addresses = interface_ip_addresses[constants.IP4_VERSION] if len(if_addresses) < 1: msg = "Failed to find IP for interface %s" % options.bind_interace print >> sys.stderr, msg sys.exit(constants.EXIT_FAILURE) options.bind_address = if_addresses[0] if getattr(options, "ssl", False): ssl_paths = { "certificate": options.ssl_cert, "key": options.ssl_key, } for name, path in ssl_paths.iteritems(): if not os.path.isfile(path): print >> sys.stderr, "SSL %s file '%s' was not found" % (name, path) sys.exit(constants.EXIT_FAILURE) # TODO: By initiating http.HttpSslParams here we would only read the files # once and have a proper validation (isfile returns False on directories) # at the same time. result, running_uid, expected_uid = _VerifyDaemonUser(daemon_name) if not result: msg = ("%s started using wrong user ID (%d), expected %d" % (daemon_name, running_uid, expected_uid)) print >> sys.stderr, msg sys.exit(constants.EXIT_FAILURE) if check_fn is not None: check_fn(options, args) log_filename = constants.DAEMONS_LOGFILES[daemon_name] if options.fork: utils.CloseFDs() (wpipe, stdio_reopen_fn) = utils.Daemonize(logfile=log_filename) else: (wpipe, stdio_reopen_fn) = (None, None) log_reopen_fn = \ utils.SetupLogging(log_filename, daemon_name, debug=options.debug, stderr_logging=not options.fork, multithreaded=multithreaded, syslog=options.syslog, console_logging=console_logging) # Reopen log file(s) on SIGHUP signal.signal( signal.SIGHUP, compat.partial(_HandleSigHup, [log_reopen_fn, stdio_reopen_fn])) try: utils.WritePidFile(utils.DaemonPidFileName(daemon_name)) except errors.PidFileLockError, err: print >> sys.stderr, "Error while locking PID file:\n%s" % err sys.exit(constants.EXIT_FAILURE)