Exemplo n.º 1
0
def SetupNodeDaemon(opts, cluster_name, node, ssh_port):
  """Prepare a new node so that it can be added to the cluster.

  Has to be called before the actual opcode: it runs
  C{pathutils.NODE_DAEMON_SETUP} on the remote node through
  C{ssh.RunSshCmdWithStdin}, handing over the setup data, and then waits
  for the SSH daemon and the node daemon of that node, so that the master
  can do the rest via normal rpc calls.

  @param opts: parsed options; C{debug}, C{verbose} and C{ssh_key_check}
      are read
  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_port: the SSH port of the new node

  """
  noded_cert = utils.ReadFile(pathutils.NODED_CERT_FILE)
  hmac_key = utils.ReadFile(pathutils.CONFD_HMAC_KEY)
  ssconf_values = ssconf.SimpleStore().ReadAll()

  data = {}
  data[constants.NDS_CLUSTER_NAME] = cluster_name
  data[constants.NDS_NODE_DAEMON_CERTIFICATE] = noded_cert
  data[constants.NDS_HMAC] = hmac_key
  data[constants.NDS_SSCONF] = ssconf_values
  data[constants.NDS_START_NODE_DAEMON] = True
  data[constants.NDS_NODE_NAME] = node

  # The same option drives both the interactive key question and the
  # strict host key check
  ssh_kwargs = dict(debug=opts.debug,
                    verbose=opts.verbose,
                    use_cluster_key=True,
                    ask_key=opts.ssh_key_check,
                    strict_host_check=opts.ssh_key_check,
                    ensure_version=True)

  ssh.RunSshCmdWithStdin(cluster_name, node, pathutils.NODE_DAEMON_SETUP,
                         ssh_port, data, **ssh_kwargs)

  # Do not return before both daemons on the new node answer
  _WaitForSshDaemon(node, ssh_port)
  _WaitForNodeDaemon(node)
Exemplo n.º 2
0
def SetupNodeDaemon(opts, cluster_name, node, ssh_port):
  """Prepare a new node so that it can be added to the cluster.

  Has to be called before the actual opcode: it runs
  C{pathutils.NODE_DAEMON_SETUP} for the remote node through
  C{RunNodeSetupCmd}, handing over the setup data, and then waits for the
  node daemon, so that the master can do the rest via normal rpc calls.

  @param opts: parsed options; C{debug}, C{verbose} and C{ssh_key_check}
      are read
  @param cluster_name: the cluster name
  @param node: the name of the new node
  @param ssh_port: the SSH port of the new node

  """
  noded_cert = utils.ReadFile(pathutils.NODED_CERT_FILE)
  ssconf_values = ssconf.SimpleStore().ReadAll()

  data = {}
  data[constants.NDS_CLUSTER_NAME] = cluster_name
  data[constants.NDS_NODE_DAEMON_CERTIFICATE] = noded_cert
  data[constants.NDS_SSCONF] = ssconf_values
  data[constants.NDS_START_NODE_DAEMON] = True

  # All arguments are positional; the ssh_key_check option is passed twice
  # NOTE(review): meaning of the literal True and of the two key-check
  # slots is defined by RunNodeSetupCmd, which is not visible here
  setup_args = (cluster_name, node, pathutils.NODE_DAEMON_SETUP,
                opts.debug, opts.verbose,
                True, opts.ssh_key_check, opts.ssh_key_check,
                ssh_port, data)
  RunNodeSetupCmd(*setup_args)

  _WaitForNodeDaemon(node)
Exemplo n.º 3
0
    def __init__(self,
                 server_addr,
                 port,
                 volume,
                 _run_cmd=utils.RunCmd,
                 _mount_point=None):
        """Creates a Gluster volume object.

        The server address is resolved through C{netutils.Hostname.GetIP}
        and the port is validated with C{netutils.ValidatePortNumber}.
        Unless a mount point is injected, it is taken from ssconf.

        @type server_addr: str
        @param server_addr: The address to connect to

        @type port: int
        @param port: The port to connect to (Gluster standard is 24007)

        @type volume: str
        @param volume: The gluster volume to use for storage.

        @param _run_cmd: command runner, replaceable for tests
        @param _mount_point: mount point override, used by tests

        """
        self.server_addr = server_addr
        self._server_ip = netutils.Hostname.GetIP(self.server_addr)
        self._port = netutils.ValidatePortNumber(port)
        self._volume = volume

        # A truthy override (tests) wins; otherwise ask ssconf
        self.mount_point = (_mount_point if _mount_point else
                            ssconf.SimpleStore().GetGlusterStorageDir())

        self._run_cmd = _run_cmd
Exemplo n.º 4
0
 def _Connect(sock, address, timeout, allow_non_master):
   """Connect a socket and translate low-level failures.

   @param sock: socket object; its timeout is set before connecting
   @param address: address passed to C{sock.connect}
   @param timeout: timeout applied to the socket
   @param allow_non_master: when false, a refused or missing endpoint is
       only retried if ssconf says that this node is the master
   @raise errors.TimeoutError: if the connect attempt timed out
   @raise errors.NoMasterError: if the endpoint is absent/refusing and this
       node is not the master (or ssconf cannot be read)
   @raise errors.PermissionError: on C{EPERM} or C{EACCES}
   @raise utils.RetryAgain: when the caller should try again

   """
   sock.settimeout(timeout)
   try:
     sock.connect(address)
   except socket.timeout as err:
     raise errors.TimeoutError("Connect timed out: %s" % str(err))
   except socket.error as err:
     error_code = err.args[0]

     if error_code not in (errno.ENOENT, errno.ECONNREFUSED):
       if error_code in (errno.EPERM, errno.EACCES):
         raise errors.PermissionError(address)
       if error_code == errno.EAGAIN:
         # Server's socket backlog is full at the moment
         raise utils.RetryAgain()
       # Anything else is passed on unchanged
       raise

     if not allow_non_master:
       # Only keep retrying when this really is the master node
       store = ssconf.SimpleStore()
       try:
         master, myself = ssconf.GetMasterAndMyself(ss=store)
       except ganeti.errors.ConfigurationError:
         raise errors.NoMasterError(address)
       if master != myself:
         raise errors.NoMasterError(address)

     raise utils.RetryAgain()
Exemplo n.º 5
0
    def GET(self):
        """Returns a list of tags.

        For instance, node group, node and network tags the object name is
        mandatory and the tags are queried through the client; cluster tags
        are read from ssconf.

        Example: ["tag1", "tag2", "tag3"]

        """
        kind = self.TAG_LEVEL
        named_kinds = (constants.TAG_INSTANCE, constants.TAG_NODEGROUP,
                       constants.TAG_NODE, constants.TAG_NETWORK)

        if kind in named_kinds:
            if not self.name:
                raise http.HttpBadRequest("Missing name on tag request")

            client = self.GetClient()
            return list(client.QueryTags(kind, self.name))

        if kind == constants.TAG_CLUSTER:
            assert not self.name
            # TODO: Use query API?
            return list(ssconf.SimpleStore().GetClusterTags())

        raise http.HttpBadRequest("Unhandled tag type!")
Exemplo n.º 6
0
def GetClient():
    """Connects to the luxi query socket and returns a client.

    When no master answers, ssconf is consulted to give a more helpful
    error: either the machine is not part of a cluster, or it is not the
    master node. If this node is the master, the original error is raised
    again.

    @raise errors.OpPrereqError: if ssconf cannot be read or this node is
        not the master

    """
    try:
        return luxi.Client(address=pathutils.QUERY_SOCKET)
    except NoMasterError:
        store = ssconf.SimpleStore()

        # Check whether the ssconf file can be read at all
        try:
            store.GetMasterNode()
        except errors.ConfigurationError:
            not_in_cluster = ("Cluster not initialized or this machine is"
                              " not part of a cluster")
            raise errors.OpPrereqError(not_in_cluster, errors.ECODE_INVAL)

        master, myself = ssconf.GetMasterAndMyself(ss=store)
        if master != myself:
            not_master = ("This is not the master node, please connect"
                          " to node '%s' and rerun the command" % master)
            raise errors.OpPrereqError(not_master, errors.ECODE_INVAL)

        # We are the master, so the failure is a real one
        raise
Exemplo n.º 7
0
    def setUp(self):
        """Create a temporary ssconf directory and a store working on it.

        The store is pointed at the "files" subdirectory of a fresh
        temporary directory and uses a lock file next to it.

        """
        tmpdir = tempfile.mkdtemp()
        self._tmpdir = tmpdir
        self.ssdir = utils.PathJoin(tmpdir, "files")
        lock_path = utils.PathJoin(tmpdir, "lock")

        os.mkdir(self.ssdir)

        self.sstore = ssconf.SimpleStore(_lockfile=lock_path,
                                         cfg_location=self.ssdir)
Exemplo n.º 8
0
    def GetRunningInstances():
        """Compute list of hypervisor/running instances.

        Every hypervisor listed in ssconf is asked for its instances; a
        hypervisor that fails is logged and skipped.

        @return: list of (instance name, hypervisor name) tuples

        """
        hypervisor_names = ssconf.SimpleStore().GetHypervisorList()
        hvparams = ssconf.SimpleStore().GetHvparams()

        found = []
        for hv_name in hypervisor_names:
            try:
                instances = hypervisor.GetHypervisor(hv_name).ListInstances(
                    hvparams=hvparams)
                pairs = [(iname, hv_name) for iname in instances]
                found += pairs
            except:  # pylint: disable=W0702
                # Best effort: one broken hypervisor must not hide the rest
                logging.error(
                    "Error while listing instances for hypervisor %s",
                    hv_name,
                    exc_info=True)
        return found
Exemplo n.º 9
0
    def __init__(self, cluster_name):
        """Initializes this class.

        Besides storing the cluster name, the primary IP family is read
        from ssconf to record whether it is the IPv6 one.

        @type cluster_name: str
        @param cluster_name: name of the cluster

        """
        self.cluster_name = cluster_name
        primary_family = ssconf.SimpleStore().GetPrimaryIPFamily()
        self.ipv6 = primary_family == netutils.IP6Address.family
Exemplo n.º 10
0
def Main():
    """Main routine.

    Stops all daemons (C{pathutils.DAEMON_UTIL stop-all}) and removes the
    confd HMAC key, the cluster configuration, the cluster domain secret,
    all certificate files and the ssconf files from this node. Files
    flagged for backup are backed up first when C{opts.backup} is set.
    Nothing is stopped or removed unless C{opts.yes_do_it} is set.

    @return: C{constants.EXIT_FAILURE} if the confirmation option is
        missing, or the exit code computed by C{cli.FormatError} if an
        exception was caught
    NOTE(review): the success path visible here has no explicit return
    value - confirm against the full file

    """
    opts = ParseOptions()

    tool_name = os.path.splitext(os.path.basename(__file__))[0]
    utils.SetupToolLogging(opts.debug, opts.verbose, toolname=tool_name)

    try:
        # List of files to delete. Contains tuples consisting of the absolute
        # path and a boolean denoting whether a backup copy should be created
        # before deleting.
        clean_files = [
            (pathutils.CONFD_HMAC_KEY, True),
            (pathutils.CLUSTER_CONF_FILE, True),
            (pathutils.CLUSTER_DOMAIN_SECRET_FILE, True),
        ]
        clean_files.extend((name, True) for name in pathutils.ALL_CERT_FILES)
        clean_files.extend(
            (name, False) for name in ssconf.SimpleStore().GetFileList())

        if not opts.yes_do_it:
            cli.ToStderr(
                "Cleaning a node is irreversible. If you really want to"
                " clean this node, supply the --yes-do-it option.")
            return constants.EXIT_FAILURE

        logging.info("Stopping daemons")
        result = utils.RunCmd([pathutils.DAEMON_UTIL, "stop-all"],
                              interactive=True)
        if result.failed:
            raise Exception("Could not stop daemons, command '%s' failed: %s" %
                            (result.cmd, result.fail_reason))

        for (filename, backup) in clean_files:
            # Files that are already gone are silently skipped
            if os.path.exists(filename):
                if opts.backup and backup:
                    logging.info("Backing up %s", filename)
                    utils.CreateBackup(filename)

                logging.info("Removing %s", filename)
                utils.RemoveFile(filename)

        logging.info("Node successfully cleaned")
    except Exception as err:  # pylint: disable=W0703
        # Top-level boundary: turn any failure into an exit code
        logging.debug("Caught unhandled exception", exc_info=True)

        (retcode, message) = cli.FormatError(err)
        logging.error(message)

        return retcode
Exemplo n.º 11
0
    def ShouldRun():
        """Checks whether node maintenance should run.

        @return: the "maintain node health" value from ssconf, or C{False}
            if reading it raises a configuration error

        """
        try:
            maintain = ssconf.SimpleStore().GetMaintainNodeHealth()
        except errors.ConfigurationError as err:
            logging.error(
                "Configuration error, not activating node maintenance: %s",
                err)
            return False

        return maintain
Exemplo n.º 12
0
def _LoadKnownGroups():
    """Returns a list of all node groups known by L{ssconf}.

    Of every non-blank line in the ssconf node group list only the first
    whitespace-separated field is kept.

    @raise errors.GenericError: if one of those fields does not match
        C{utils.UUID_RE}

    """
    uuids = []
    for line in ssconf.SimpleStore().GetNodegroupList():
        if line.strip():
            uuids.append(line.split(None, 1)[0])

    if not compat.all(utils.UUID_RE.match(uuid) for uuid in uuids):
        raise errors.GenericError("Ssconf contains invalid group UUID")

    return uuids
Exemplo n.º 13
0
def MajorityHealthy(ignore_offline_nodes=False):
    """Check if the majority of nodes is healthy

    Gather master votes from all nodes known to this node;
    return True if a strict majority of nodes is reachable and
    has some opinion on which node is master. Note that this will
    not guarantee any node to win an election but it ensures that
    a standard master-failover is still possible.

    @param ignore_offline_nodes: if true, only the online node list from
      ssconf is considered instead of the full node list
    @return: tuple of (boolean, [str]); the first is if a majority of nodes
      are healthy, the second is a list of the node names that are not
      considered healthy.
    """
    store = ssconf.SimpleStore()
    if ignore_offline_nodes:
        node_names = store.GetOnlineNodeList()
    else:
        node_names = store.GetNodeList()

    node_count = len(node_names)
    vote_list = _GatherMasterVotes(node_names)

    if not vote_list:
        logging.warning(
            ('Voting list was None; cannot determine if a majority of '
             'nodes are healthy'))
        return (False, node_names)

    # Entries whose node is None do not count as votes
    total_votes = 0
    for (node, count) in vote_list:
        if node is not None:
            total_votes += count

    # The list of nodes that did not vote is calculated to provide useful
    # debugging information to the client.
    voting_nodes = [entry[0] for entry in vote_list]
    nonvoting_nodes = [name for name in node_names
                       if name not in voting_nodes]

    logging.info("Total %d nodes, %d votes: %s", node_count, total_votes,
                 vote_list)

    # Strict majority: more than half of the considered nodes
    return (2 * total_votes > node_count, nonvoting_nodes)
Exemplo n.º 14
0
def GetMaster():
    """Returns the current master node.

    This is a separate function in bootstrap since it's needed by
    gnt-cluster, and instead of importing directly ssconf, it's better
    to abstract it in bootstrap, where we do use ssconf in other
    functions too.

    @return: the first element of the pair returned by
        C{ssconf.GetMasterAndMyself}

    """
    (master_name, _) = ssconf.GetMasterAndMyself(ssconf.SimpleStore())
    return master_name
Exemplo n.º 15
0
def GetConfdClient(callback):
    """Return a client configured using the given callback.

    This is handy to abstract the MC list and HMAC key reading: the master
    candidate IPs are read line by line from the corresponding ssconf file
    and the HMAC key from C{pathutils.CONFD_HMAC_KEY}.

    @attention: This should only be called on nodes which are part of a
        cluster, since it depends on a valid (ganeti) data directory;
        for code running outside of a cluster, you need to create the
        client manually

    @param callback: callback handed to the L{ConfdClient} constructor

    """
    mc_ips_path = ssconf.SimpleStore().KeyToFilename(
        constants.SS_MASTER_CANDIDATES_IPS)
    peers = utils.ReadFile(mc_ips_path).splitlines()
    key = utils.ReadFile(pathutils.CONFD_HMAC_KEY)
    return ConfdClient(key, peers, callback)
Exemplo n.º 16
0
def SSLVerifyPeer(conn, cert, errnum, errdepth, ok):
    """Callback function to verify a peer against the candidate cert map.

    Note that we have a chicken-and-egg problem during cluster init and
    upgrade. This method checks whether the incoming connection comes from
    a master candidate by comparing it to the master certificate map in the
    cluster configuration. However, during cluster init and cluster upgrade
    there are various RPC calls done to the master node itself, before the
    candidate certificate list is established and the cluster configuration
    is written. In this case, we cannot check against the master candidate
    map.

    This problem is solved by checking whether the candidate map is empty.
    An initialized 2.11 or higher cluster has at least one entry for the
    master node in the candidate map. If the map is empty, we know that we
    are still in the bootstrap/upgrade phase. In this case, we read the
    server certificate digest and compare it to the incoming request.

    This means that after an upgrade of Ganeti, the system continues to
    operate like before, using server certificates only. After the client
    certificates are generated with
    ``gnt-cluster renew-crypto --new-node-certificates``, RPC communication
    is switched to using client certificates and the trick of using server
    certificates does not work anymore.

    @type conn: C{OpenSSL.SSL.Connection}
    @param conn: the OpenSSL connection object
    @type cert: C{OpenSSL.X509}
    @param cert: the peer's SSL certificate
    @return: whether the SHA1 digest of the peer certificate is among the
        accepted digests

    """
    # some parameters are unused, but this is the API
    # pylint: disable=W0613
    store = ssconf.SimpleStore()
    try:
        cert_map = store.GetMasterCandidatesCertMap()
    except errors.ConfigurationError:
        logging.info("No candidate certificates found. Switching to "
                     "bootstrap/update mode.")
        cert_map = None

    if cert_map:
        accepted_digests = cert_map.values()
    else:
        # Missing or empty map: fall back to the server certificate
        accepted_digests = [
            utils.GetCertificateDigest(
                cert_filename=pathutils.NODED_CERT_FILE)
        ]

    return cert.digest("sha1") in accepted_digests
Exemplo n.º 17
0
def GetClient(query=True):
    """Connects to the a luxi socket and returns a client.

    The environment variable named by C{constants.LUXI_OVERRIDE}, when
    non-empty, takes precedence over C{query}: its two special values
    select the master or the query socket, any other value is used as the
    socket address itself.

    @type query: boolean
    @param query: this signifies that the client will only be
        used for queries; if the build-time parameter
        enable-split-queries is enabled, then the client will be
        connected to the query socket instead of the masterd socket
    @raise errors.OpPrereqError: if no master answers and ssconf cannot be
        read or says that this node is not the master

    """
    address = None
    override_socket = os.getenv(constants.LUXI_OVERRIDE, "")
    if not override_socket:
        if query:
            address = pathutils.QUERY_SOCKET
    elif override_socket == constants.LUXI_OVERRIDE_MASTER:
        address = pathutils.MASTER_SOCKET
    elif override_socket == constants.LUXI_OVERRIDE_QUERY:
        address = pathutils.QUERY_SOCKET
    else:
        address = override_socket

    # TODO: Cache object?
    try:
        return luxi.Client(address=address)
    except NoMasterError:
        store = ssconf.SimpleStore()

        # Check whether the ssconf file can be read at all
        try:
            store.GetMasterNode()
        except errors.ConfigurationError:
            not_in_cluster = ("Cluster not initialized or this machine is"
                              " not part of a cluster")
            raise errors.OpPrereqError(not_in_cluster, errors.ECODE_INVAL)

        master, myself = ssconf.GetMasterAndMyself(ss=store)
        if master != myself:
            not_master = ("This is not the master node, please connect"
                          " to node '%s' and rerun the command" % master)
            raise errors.OpPrereqError(not_master, errors.ECODE_INVAL)

        # We are the master, so the failure is a real one
        raise
Exemplo n.º 18
0
def MajorityHealthy():
  """Check if the majority of nodes is healthy

  Gather master votes from all nodes known to this node;
  return True if a strict majority of nodes is reachable and
  has some opinion on which node is master. Note that this will
  not guarantee any node to win an election but it ensures that
  a standard master-failover is still possible.

  @return: C{False} if no vote list could be gathered (C{None}), otherwise
      whether twice the number of votes exceeds the number of nodes

  """
  node_names = ssconf.SimpleStore().GetNodeList()
  vote_list = GatherMasterVotes(node_names)
  if vote_list is None:
    return False

  node_count = len(node_names)

  # Entries whose node is None do not count as votes
  total_votes = 0
  for (node, count) in vote_list:
    if node is not None:
      total_votes += count

  logging.info("Total %d nodes, %d votes: %s", node_count, total_votes,
               vote_list)
  return node_count < 2 * total_votes
Exemplo n.º 19
0
def _WaitForSshDaemon(hostname, port):
  """Wait for SSH daemon to become responsive.

  The host name is resolved once, using the primary IP family from ssconf,
  and the port is then pinged repeatedly until it answers or
  C{_DAEMON_READY_TIMEOUT} expires.

  @param hostname: name of the host running the SSH daemon
  @param port: TCP port of the SSH daemon
  @raise errors.OpExecError: if the daemon did not answer in time

  """
  ip_family = ssconf.SimpleStore().GetPrimaryIPFamily()
  hostip = netutils.GetHostname(name=hostname, family=ip_family).ip

  def _CheckSshDaemon():
    """Single probe; asks for a retry while the port is not live."""
    alive = netutils.TcpPing(hostip, port, timeout=1.0,
                             live_port_needed=True)
    if not alive:
      raise utils.RetryAgain()
    logging.debug("SSH daemon on %s:%s (IP address %s) has become"
                  " responsive", hostname, port, hostip)

  try:
    utils.Retry(_CheckSshDaemon, 1.0, _DAEMON_READY_TIMEOUT)
  except utils.RetryTimeout:
    details = (hostname, port, hostip, _DAEMON_READY_TIMEOUT)
    raise errors.OpExecError("SSH daemon on %s:%s (IP address %s) didn't"
                             " become responsive within %s seconds" %
                             details)
Exemplo n.º 20
0
def ComputeAncillaryFiles(cluster, redist):
    """Compute files external to Ganeti which need to be consistent.

    @param cluster: cluster object; its C{modify_etc_hosts},
        C{use_external_mip_script} and C{enabled_hypervisors} attributes
        and its file storage predicates are consulted
    @type redist: boolean
    @param redist: Whether to include files which need to be redistributed
    @return: tuple of four sets: files for all nodes, optional files, files
        for master candidates only, files for VM-capable nodes only

    """
    # Files for all nodes
    files_all = set((
        pathutils.SSH_KNOWN_HOSTS_FILE,
        pathutils.CONFD_HMAC_KEY,
        pathutils.CLUSTER_DOMAIN_SECRET_FILE,
        pathutils.SPICE_CERT_FILE,
        pathutils.SPICE_CACERT_FILE,
        pathutils.RAPI_USERS_FILE,
    ))

    if not redist:
        files_all.update(pathutils.ALL_CERT_FILES)
        files_all.update(ssconf.SimpleStore().GetFileList())
    else:
        # we need to ship at least the RAPI certificate
        files_all.add(pathutils.RAPI_CERT_FILE)

    if cluster.modify_etc_hosts:
        files_all.add(pathutils.ETC_HOSTS)

    if cluster.use_external_mip_script:
        files_all.add(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT)

    # Files which are optional, these must:
    # - be present in one other category as well
    # - either exist or not exist on all nodes of that category (mc, vm all)
    files_opt = set((pathutils.RAPI_USERS_FILE, ))

    # Files which should only be on master candidates
    files_mc = set()
    if not redist:
        files_mc.add(pathutils.CLUSTER_CONF_FILE)

    # File storage
    if not redist and (cluster.IsFileStorageEnabled()
                       or cluster.IsSharedFileStorageEnabled()):
        for files in (files_all, files_opt):
            files.add(pathutils.FILE_STORAGE_PATHS_FILE)

    # Files which should only be on VM-capable nodes: first element of
    # each enabled hypervisor's ancillary files
    files_vm = set()
    for hv_name in cluster.enabled_hypervisors:
        hv_class = hypervisor.GetHypervisorClass(hv_name)
        files_vm.update(hv_class.GetAncillaryFiles()[0])

    # The second element of the ancillary files is added to the optional
    # files
    hv_optional = set()
    for hv_name in cluster.enabled_hypervisors:
        hv_class = hypervisor.GetHypervisorClass(hv_name)
        hv_optional.update(hv_class.GetAncillaryFiles()[1])
    files_opt |= hv_optional

    # Filenames in each category must be unique
    all_files_set = files_all | files_mc | files_vm
    category_sizes = [len(files_all), len(files_mc), len(files_vm)]
    assert len(all_files_set) == sum(category_sizes), \
      "Found file listed in more than one file list"

    # Optional files must be present in one other category
    assert all_files_set.issuperset(files_opt), \
      "Optional file not in a different required list"

    # This one file should never ever be re-distributed via RPC
    assert not (redist and pathutils.FILE_STORAGE_PATHS_FILE in all_files_set)

    return (files_all, files_opt, files_mc, files_vm)
Exemplo n.º 21
0
def MasterFailover(no_voting=False):
    """Failover the master node.

    This checks that we are not already the master, and will cause the
    current master to cease being master, and the non-master to become
    new master.

    The sequence is: sanity checks against ssconf (and, unless disabled,
    against the votes of the other nodes), forced start of WConfd, update
    of the master node in the cluster configuration, then deactivation of
    the master IP and stop of the master role on the old master. Failures
    of the last two steps are only collected as warnings.

    @type no_voting: boolean
    @param no_voting: force the operation without remote nodes agreement
                        (dangerous)

    @returns: the pair of an exit code and warnings to display
    @raise errors.OpPrereqError: if this node already is the master, is not
        a master candidate, disagrees with the other nodes about the
        current master, if WConfd cannot be started, or if the old or new
        master is missing from the configuration
    NOTE(review): the code visible here only returns explicitly from the
    configuration error handler; confirm the success-path return against
    the full file
    """
    sstore = ssconf.SimpleStore()

    old_master, new_master = ssconf.GetMasterAndMyself(sstore)
    node_names = sstore.GetNodeList()
    mc_list = sstore.GetMasterCandidates()

    if old_master == new_master:
        raise errors.OpPrereqError(
            "This commands must be run on the node"
            " where you want the new master to be."
            " %s is already the master" % old_master, errors.ECODE_INVAL)

    if new_master not in mc_list:
        # The old master is left out of the list shown to the user
        mc_no_master = [name for name in mc_list if name != old_master]
        raise errors.OpPrereqError(
            "This node is not among the nodes marked"
            " as master candidates. Only these nodes"
            " can become masters. Current list of"
            " master candidates is:\n"
            "%s" % ("\n".join(mc_no_master)), errors.ECODE_STATE)

    if not no_voting:
        vote_list = GatherMasterVotes(node_names)

        if vote_list:
            # Only the first entry of the vote list is inspected
            voted_master = vote_list[0][0]
            if voted_master is None:
                raise errors.OpPrereqError(
                    "Cluster is inconsistent, most nodes did"
                    " not respond.", errors.ECODE_ENVIRON)
            elif voted_master != old_master:
                raise errors.OpPrereqError(
                    "I have a wrong configuration, I believe"
                    " the master is %s but the other nodes"
                    " voted %s. Please resync the configuration"
                    " of this node." % (old_master, voted_master),
                    errors.ECODE_STATE)
    # end checks

    rcode = 0
    warnings = []

    logging.info("Setting master to %s, old master: %s", new_master,
                 old_master)

    try:
        # Forcefully start WConfd so that we can access the configuration
        result = utils.RunCmd([
            pathutils.DAEMON_UTIL, "start", constants.WCONFD, "--force-node",
            "--no-voting", "--yes-do-it"
        ])
        if result.failed:
            raise errors.OpPrereqError(
                "Could not start the configuration daemon,"
                " command %s had exitcode %s and error %s" %
                (result.cmd, result.exit_code, result.output),
                errors.ECODE_NOENT)

        # instantiate a real config writer, as we now know we have the
        # configuration data
        livelock = utils.livelock.LiveLock("bootstrap_failover")
        cfg = config.GetConfig(None, livelock, accept_foreign=True)

        old_master_node = cfg.GetNodeInfoByName(old_master)
        if old_master_node is None:
            raise errors.OpPrereqError(
                "Could not find old master node '%s' in"
                " cluster configuration." % old_master, errors.ECODE_NOENT)

        cluster_info = cfg.GetClusterInfo()
        new_master_node = cfg.GetNodeInfoByName(new_master)
        if new_master_node is None:
            raise errors.OpPrereqError(
                "Could not find new master node '%s' in"
                " cluster configuration." % new_master, errors.ECODE_NOENT)

        cluster_info.master_node = new_master_node.uuid
        # this will also regenerate the ssconf files, since we updated the
        # cluster info
        cfg.Update(cluster_info, logging.error)

        # if cfg.Update worked, then it means the old master daemon won't be
        # able now to write its own config file (we rely on locking in both
        # backend.UploadFile() and ConfigWriter._Write(); hence the next
        # step is to kill the old master

        logging.info("Stopping the master daemon on node %s", old_master)

        runner = rpc.BootstrapRunner()
        master_params = cfg.GetMasterNetworkParameters()
        # The parameters are pointed at the old master before the call
        master_params.uuid = old_master_node.uuid
        ems = cfg.GetUseExternalMipScript()
        result = runner.call_node_deactivate_master_ip(old_master,
                                                       master_params, ems)

        msg = result.fail_msg
        if msg:
            warning = "Could not disable the master IP: %s" % (msg, )
            logging.warning("%s", warning)
            warnings.append(warning)

        result = runner.call_node_stop_master(old_master)
        msg = result.fail_msg
        if msg:
            warning = ("Could not disable the master role on the old master"
                       " %s, please disable manually: %s" % (old_master, msg))
            logging.error("%s", warning)
            warnings.append(warning)
    except errors.ConfigurationError as err:
        logging.error("Error while trying to set the new master: %s", str(err))
        return 1, warnings
Exemplo n.º 22
0
def GetPaths():
    """Returns a tuple of path objects to process.

    Every entry starts with (path, type, mode, uid, gid). C{DIR} and
    C{QUEUE_DIR} entries have exactly these five fields, C{FILE} entries
    carry one more boolean, which is C{False} for all files listed here.
    NOTE(review): the meaning of that boolean is defined by the consumer of
    this list, which is not visible here.

    Permission modes use the C{0o} octal notation, which is accepted by
    Python 2.6+ as well as Python 3.

    @return: list of tuples as described above; the ssconf files are
        inserted between the data directory files and the queue directory

    """
    getent = runtime.GetEnts()
    masterd_log = constants.DAEMONS_LOGFILES[constants.MASTERD]
    noded_log = constants.DAEMONS_LOGFILES[constants.NODED]
    confd_log = constants.DAEMONS_LOGFILES[constants.CONFD]
    luxid_log = constants.DAEMONS_LOGFILES[constants.LUXID]
    rapi_log = constants.DAEMONS_LOGFILES[constants.RAPI]
    mond_log = constants.DAEMONS_LOGFILES[constants.MOND]

    rapi_dir = os.path.join(pathutils.DATA_DIR, "rapi")
    cleaner_log_dir = os.path.join(pathutils.LOG_DIR, "cleaner")
    master_cleaner_log_dir = os.path.join(pathutils.LOG_DIR, "master-cleaner")

    # A note on the ordering: The parent directory (type C{DIR}) must always be
    # listed before files (type C{FILE}) in that directory. Once the directory is
    # set, only files directly in that directory can be listed.
    paths = [
        (pathutils.DATA_DIR, DIR, 0o755, getent.masterd_uid,
         getent.masterd_gid),
        (pathutils.CLUSTER_DOMAIN_SECRET_FILE, FILE, 0o640, getent.masterd_uid,
         getent.masterd_gid, False),
        (pathutils.CLUSTER_CONF_FILE, FILE, 0o640, getent.masterd_uid,
         getent.confd_gid, False),
        (pathutils.CONFD_HMAC_KEY, FILE, 0o440, getent.confd_uid,
         getent.masterd_gid, False),
        (pathutils.SSH_KNOWN_HOSTS_FILE, FILE, 0o644, getent.masterd_uid,
         getent.masterd_gid, False),
        (pathutils.RAPI_CERT_FILE, FILE, 0o440, getent.rapi_uid,
         getent.masterd_gid, False),
        (pathutils.SPICE_CERT_FILE, FILE, 0o440, getent.noded_uid,
         getent.masterd_gid, False),
        (pathutils.SPICE_CACERT_FILE, FILE, 0o440, getent.noded_uid,
         getent.masterd_gid, False),
        (pathutils.NODED_CERT_FILE, FILE, pathutils.NODED_CERT_MODE,
         getent.masterd_uid, getent.masterd_gid, False),
        (pathutils.NODED_CLIENT_CERT_FILE, FILE, pathutils.NODED_CERT_MODE,
         getent.masterd_uid, getent.masterd_gid, False),
        (pathutils.WATCHER_PAUSEFILE, FILE, 0o644, getent.masterd_uid,
         getent.masterd_gid, False),
    ]

    # All ssconf files share the same mode and the node daemon as owner
    ss = ssconf.SimpleStore()
    for ss_path in ss.GetFileList():
        paths.append((ss_path, FILE, constants.SS_FILE_PERMS, getent.noded_uid,
                      getent.noded_gid, False))

    paths.extend([
        (pathutils.QUEUE_DIR, DIR, 0o750, getent.masterd_uid,
         getent.daemons_gid),
        (pathutils.QUEUE_DIR, QUEUE_DIR, constants.JOB_QUEUE_FILES_PERMS,
         getent.masterd_uid, getent.daemons_gid),
        (pathutils.JOB_QUEUE_DRAIN_FILE, FILE, 0o644, getent.masterd_uid,
         getent.daemons_gid, False),
        (pathutils.JOB_QUEUE_LOCK_FILE, FILE, constants.JOB_QUEUE_FILES_PERMS,
         getent.masterd_uid, getent.daemons_gid, False),
        (pathutils.JOB_QUEUE_SERIAL_FILE, FILE,
         constants.JOB_QUEUE_FILES_PERMS, getent.masterd_uid,
         getent.daemons_gid, False),
        (pathutils.JOB_QUEUE_VERSION_FILE, FILE,
         constants.JOB_QUEUE_FILES_PERMS, getent.masterd_uid,
         getent.daemons_gid, False),
        (pathutils.JOB_QUEUE_ARCHIVE_DIR, DIR, 0o750, getent.masterd_uid,
         getent.daemons_gid),
        (rapi_dir, DIR, 0o750, getent.rapi_uid, getent.masterd_gid),
        (pathutils.RAPI_USERS_FILE, FILE, 0o640, getent.rapi_uid,
         getent.masterd_gid, False),
        (pathutils.RUN_DIR, DIR, 0o775, getent.masterd_uid,
         getent.daemons_gid),
        (pathutils.SOCKET_DIR, DIR, 0o770, getent.masterd_uid,
         getent.daemons_gid),
        (pathutils.MASTER_SOCKET, FILE, 0o660, getent.masterd_uid,
         getent.daemons_gid, False),
        (pathutils.QUERY_SOCKET, FILE, 0o660, getent.luxid_uid,
         getent.daemons_gid, False),
        (pathutils.BDEV_CACHE_DIR, DIR, 0o755, getent.noded_uid,
         getent.masterd_gid),
        (pathutils.UIDPOOL_LOCKDIR, DIR, 0o750, getent.noded_uid,
         getent.masterd_gid),
        (pathutils.DISK_LINKS_DIR, DIR, 0o755, getent.noded_uid,
         getent.masterd_gid),
        (pathutils.CRYPTO_KEYS_DIR, DIR, 0o700, getent.noded_uid,
         getent.masterd_gid),
        (pathutils.IMPORT_EXPORT_DIR, DIR, 0o755, getent.noded_uid,
         getent.masterd_gid),
        (pathutils.LOG_DIR, DIR, 0o770, getent.masterd_uid,
         getent.daemons_gid),
        (masterd_log, FILE, 0o600, getent.masterd_uid, getent.masterd_gid,
         False),
        (confd_log, FILE, 0o600, getent.confd_uid, getent.masterd_gid, False),
        (luxid_log, FILE, 0o600, getent.luxid_uid, getent.masterd_gid, False),
        (noded_log, FILE, 0o600, getent.noded_uid, getent.masterd_gid, False),
        (rapi_log, FILE, 0o600, getent.rapi_uid, getent.masterd_gid, False),
        (mond_log, FILE, 0o600, getent.mond_uid, getent.masterd_gid, False),
        (pathutils.LOG_OS_DIR, DIR, 0o750, getent.noded_uid,
         getent.daemons_gid),
        (pathutils.LOG_XEN_DIR, DIR, 0o750, getent.noded_uid,
         getent.daemons_gid),
        (cleaner_log_dir, DIR, 0o750, getent.noded_uid, getent.noded_gid),
        (master_cleaner_log_dir, DIR, 0o750, getent.masterd_uid,
         getent.masterd_gid),
        (pathutils.INSTANCE_REASON_DIR, DIR, 0o755, getent.noded_uid,
         getent.noded_gid),
        (pathutils.LIVELOCK_DIR, DIR, 0o750, getent.masterd_uid,
         getent.daemons_gid),
        (pathutils.LUXID_MESSAGE_DIR, DIR, 0o750, getent.masterd_uid,
         getent.daemons_gid),
    ])

    return paths
Exemplo n.º 23
0
def MasterFailover(no_voting=False):
    """Turn the local node into the new cluster master.

    The function has to be run on the node that should take over the
    master role. After a couple of sanity checks it rewrites the cluster
    configuration so that it names the local node as master, asks the
    previous master to give up the master IP and the master role, and
    finally starts the master daemons on the new master.

    Note: the call to MasterFailover from lib/client/gnt_cluster.py checks
    that a majority of nodes are healthy and responding before calling this.
    Any other caller should perform the same verification itself.

    @type no_voting: boolean
    @param no_voting: force the operation without remote nodes agreement
        (dangerous)
    @rtype: tuple
    @return: the pair of an exit code and the list of warnings to display
    @raise errors.OpPrereqError: if this node already is the master, is not
        a master candidate, disagrees with the other nodes about the
        current master, or if the configuration daemon or one of the two
        master nodes cannot be found/started

    """
    simple_store = ssconf.SimpleStore()

    (old_master, new_master) = ssconf.GetMasterAndMyself(simple_store)
    all_nodes = simple_store.GetNodeList()
    candidates = simple_store.GetMasterCandidates()

    # Pre-flight checks: refuse to fail over onto the current master or
    # onto a node which is not a master candidate
    if old_master == new_master:
        raise errors.OpPrereqError(
            "This commands must be run on the node"
            " where you want the new master to be."
            " %s is already the master" % old_master, errors.ECODE_INVAL)

    if new_master not in candidates:
        eligible = "\n".join(name for name in candidates
                             if name != old_master)
        raise errors.OpPrereqError(
            "This node is not among the nodes marked"
            " as master candidates. Only these nodes"
            " can become masters. Current list of"
            " master candidates is:\n"
            "%s" % eligible, errors.ECODE_STATE)

    if not no_voting:
        votes = _GatherMasterVotes(all_nodes)
        if votes:
            # The first entry carries the name the vote settled on
            winner = votes[0][0]
            if winner != old_master:
                raise errors.OpPrereqError(
                    "I have a wrong configuration, I believe"
                    " the master is %s but the other nodes"
                    " voted %s. Please resync the configuration"
                    " of this node." % (old_master, winner),
                    errors.ECODE_STATE)
    # end of pre-flight checks

    exit_code = 0
    warning_msgs = []

    logging.info("Setting master to %s, old master: %s", new_master,
                 old_master)

    try:
        # WConfd is started forcefully, otherwise the configuration could
        # not be accessed from this node
        start_wconfd = [
            pathutils.DAEMON_UTIL, "start", constants.WCONFD, "--force-node",
            "--no-voting", "--yes-do-it"
        ]
        wconfd_start = utils.RunCmd(start_wconfd)
        if wconfd_start.failed:
            raise errors.OpPrereqError(
                "Could not start the configuration daemon,"
                " command %s had exitcode %s and error %s" %
                (wconfd_start.cmd, wconfd_start.exit_code,
                 wconfd_start.output),
                errors.ECODE_NOENT)

        # With the configuration data available a real config writer can
        # be instantiated
        bootstrap_lock = utils.livelock.LiveLock("bootstrap_failover")
        cfg = config.GetConfig(None, bootstrap_lock, accept_foreign=True)

        old_master_node = cfg.GetNodeInfoByName(old_master)
        if old_master_node is None:
            raise errors.OpPrereqError(
                "Could not find old master node '%s' in"
                " cluster configuration." % old_master, errors.ECODE_NOENT)

        cluster_info = cfg.GetClusterInfo()
        new_master_node = cfg.GetNodeInfoByName(new_master)
        if new_master_node is None:
            raise errors.OpPrereqError(
                "Could not find new master node '%s' in"
                " cluster configuration." % new_master, errors.ECODE_NOENT)

        cluster_info.master_node = new_master_node.uuid
        # Updating the cluster info also regenerates the ssconf files
        cfg.Update(cluster_info, logging.error)

        # A successful cfg.Update means the old master daemon is no longer
        # able to write its own config file (this relies on the locking in
        # both backend.UploadFile() and ConfigWriter._Write()), hence the
        # old master is stopped next

        logging.info("Stopping the master daemon on node %s", old_master)

        bootstrap_rpc = rpc.BootstrapRunner()
        net_params = cfg.GetMasterNetworkParameters()
        net_params.uuid = old_master_node.uuid
        use_external_script = cfg.GetUseExternalMipScript()
        ip_result = bootstrap_rpc.call_node_deactivate_master_ip(
            old_master, net_params, use_external_script)

        ip_fail_msg = ip_result.fail_msg
        if ip_fail_msg:
            ip_warning = ("Could not disable the master IP: %s" %
                          (ip_fail_msg, ))
            logging.warning("%s", ip_warning)
            warning_msgs.append(ip_warning)

        role_result = bootstrap_rpc.call_node_stop_master(old_master)
        role_fail_msg = role_result.fail_msg
        if role_fail_msg:
            role_warning = (
                "Could not disable the master role on the old master"
                " %s, please disable manually: %s" %
                (old_master, role_fail_msg))
            logging.error("%s", role_warning)
            warning_msgs.append(role_warning)
    except errors.ConfigurationError as err:
        logging.error("Error while trying to set the new master: %s", str(err))
        return 1, warning_msgs
    finally:
        # WConfd is stopped again no matter how the block above ended
        wconfd_stop = utils.RunCmd(
            [pathutils.DAEMON_UTIL, "stop", constants.WCONFD])
        if wconfd_stop.failed:
            stop_warning = ("Could not stop the configuration daemon,"
                            " command %s had exitcode %s and error %s" %
                            (wconfd_stop.cmd, wconfd_stop.exit_code,
                             wconfd_stop.output))
            logging.error("%s", stop_warning)
            exit_code = 1

    logging.info("Checking master IP non-reachability...")

    master_ip = simple_store.GetMasterIP()
    total_timeout = 30
    retry_delay = (1, 1.5, 5)

    def _ExpectReachability(expected):
        """Asks for a retry until pinging the master IP yields C{expected}.

        """
        reachable = netutils.TcpPing(master_ip, constants.DEFAULT_NODED_PORT)
        if reachable != expected:
            raise utils.RetryAgain()

    # At this point no master should be running, hence the master IP is
    # expected to become unreachable
    try:
        utils.Retry(_ExpectReachability, retry_delay, total_timeout,
                    args=[False])
    except utils.RetryTimeout:
        still_up = ("The master IP is still reachable after %s seconds,"
                    " continuing but activating the master IP on the current"
                    " node will probably fail" % total_timeout)
        logging.warning("%s", still_up)
        warning_msgs.append(still_up)
        exit_code = 1

    if jstore.CheckDrainFlag():
        logging.info("Undraining job queue")
        jstore.SetDrainFlag(False)

    logging.info("Starting the master daemons on the new master")

    start_result = rpc.BootstrapRunner().call_node_start_master_daemons(
        new_master, no_voting)
    start_fail_msg = start_result.fail_msg
    if start_fail_msg:
        logging.error(
            "Could not start the master role on the new master"
            " %s, please check: %s", new_master, start_fail_msg)
        exit_code = 1

    # Last step: check that the new master brought up the master IP and
    # warn otherwise
    try:
        utils.Retry(_ExpectReachability, retry_delay, total_timeout,
                    args=[True])
    except utils.RetryTimeout:
        not_up = ("The master IP did not come up within %s seconds; the"
                  " cluster should still be working and reachable via %s,"
                  " but not via the master IP address" %
                  (total_timeout, new_master))
        logging.warning("%s", not_up)
        warning_msgs.append(not_up)
        exit_code = 1

    logging.info("Master failed over from %s to %s", old_master, new_master)
    return exit_code, warning_msgs
Exemplo n.º 24
0
def MasterFailover(no_voting=False):
    """Failover the master node.

    This checks that we are not already the master, and will cause the
    current master to cease being master, and the non-master to become
    new master.

    The visible part of this function validates the request and writes the
    new master into the cluster configuration.

    @type no_voting: boolean
    @param no_voting: force the operation without remote nodes agreement
        (dangerous)
    @return: 1 if updating the configuration raised a ConfigurationError.
        NOTE(review): there is no explicit return on the success path in
        this version of the function, so it yields None there; confirm
        against the callers
    @raise errors.OpPrereqError: if this node already is the master, is not
        a master candidate, if the vote of the other nodes is inconclusive
        or disagrees with the local view, or if one of the two master nodes
        is missing from the configuration

    """
    sstore = ssconf.SimpleStore()

    old_master, new_master = ssconf.GetMasterAndMyself(sstore)
    node_names = sstore.GetNodeList()
    mc_list = sstore.GetMasterCandidates()

    if old_master == new_master:
        raise errors.OpPrereqError(
            "This commands must be run on the node"
            " where you want the new master to be."
            " %s is already the master" % old_master, errors.ECODE_INVAL)

    if new_master not in mc_list:
        # Only the other candidates are listed in the error message
        mc_no_master = [name for name in mc_list if name != old_master]
        raise errors.OpPrereqError(
            "This node is not among the nodes marked"
            " as master candidates. Only these nodes"
            " can become masters. Current list of"
            " master candidates is:\n"
            "%s" % ("\n".join(mc_no_master)), errors.ECODE_STATE)

    if not no_voting:
        vote_list = GatherMasterVotes(node_names)

        if vote_list:
            # The first entry names the master the vote settled on; None is
            # reported as "most nodes did not respond"
            voted_master = vote_list[0][0]
            if voted_master is None:
                raise errors.OpPrereqError(
                    "Cluster is inconsistent, most nodes did"
                    " not respond.", errors.ECODE_ENVIRON)
            elif voted_master != old_master:
                raise errors.OpPrereqError(
                    "I have a wrong configuration, I believe"
                    " the master is %s but the other nodes"
                    " voted %s. Please resync the configuration"
                    " of this node." % (old_master, voted_master),
                    errors.ECODE_STATE)
    # end checks

    # NOTE(review): rcode is not read in the visible part of this function
    rcode = 0

    logging.info("Setting master to %s, old master: %s", new_master,
                 old_master)

    try:
        # instantiate a real config writer, as we now know we have the
        # configuration data
        cfg = config.ConfigWriter(accept_foreign=True)

        old_master_node = cfg.GetNodeInfoByName(old_master)
        if old_master_node is None:
            raise errors.OpPrereqError(
                "Could not find old master node '%s' in"
                " cluster configuration." % old_master, errors.ECODE_NOENT)

        cluster_info = cfg.GetClusterInfo()
        new_master_node = cfg.GetNodeInfoByName(new_master)
        if new_master_node is None:
            raise errors.OpPrereqError(
                "Could not find new master node '%s' in"
                " cluster configuration." % new_master, errors.ECODE_NOENT)

        cluster_info.master_node = new_master_node.uuid
        # this will also regenerate the ssconf files, since we updated the
        # cluster info
        cfg.Update(cluster_info, logging.error)
    # "except ... as" is accepted by Python 2.6+ and Python 3 alike, whereas
    # the comma form is a syntax error on Python 3
    except errors.ConfigurationError as err:
        logging.error("Error while trying to set the new master: %s", str(err))
        return 1
Exemplo n.º 25
0
def SSLVerifyPeer(conn, cert, errnum, errdepth, ok):
    """Verification callback checking a peer against the candidate cert map.

    Certificates further up the chain (C{errdepth > 0}) are compared to the
    digest of the server certificate. The actual peer certificate
    (C{errdepth == 0}) has to be listed in the master candidate certificate
    map taken from ssconf.

    During cluster init and cluster upgrade there is a chicken-and-egg
    problem: RPC calls are made to the master node itself before the
    candidate certificate list is established and the cluster configuration
    is written, so there is no map to check against. An initialized cluster
    of version 2.11 or higher has at least the master node's entry in the
    map, hence an empty (or unreadable) map is taken as a sign of the
    bootstrap/upgrade phase and the incoming certificate is compared to the
    server certificate digest instead.

    As a consequence an upgraded cluster keeps operating on server
    certificates only, until the client certificates are generated with
    ``gnt-cluster renew-crypto --new-node-certificates``; from then on RPC
    uses client certificates and the server certificate fallback no longer
    applies.

    @type conn: C{OpenSSL.SSL.Connection}
    @param conn: the OpenSSL connection object (unused)
    @type cert: C{OpenSSL.X509}
    @param cert: the peer's SSL certificate
    @param errnum: unused, part of the callback API
    @type errdepth: integer
    @param errdepth: number of the step in the certificate chain starting
        at 0 for the actual client certificate
    @param ok: unused, part of the callback API
    @return: whether the certificate is accepted; C{False} for a negative
        C{errdepth}

    """
    # The signature is dictated by the callback API, not all parameters
    # are needed
    # pylint: disable=W0613

    if errdepth > 0:
        # Not the lowest element of the chain: it has to be the server
        # certificate
        expected_digest = utils.GetCertificateDigest(
            cert_filename=pathutils.NODED_CERT_FILE)
        peer_digest = cert.digest("sha1")
        accepted = peer_digest == expected_digest
        if not accepted:
            logging.debug(
                "Received certificate from the certificate chain, which"
                " does not match the server certficate. Digest of the"
                " received certificate: %s. Digest of the server"
                " certificate: %s.", peer_digest, expected_digest)
        return accepted

    if errdepth == 0:
        store = ssconf.SimpleStore()
        try:
            known_certs = store.GetMasterCandidatesCertMap()
        except errors.ConfigurationError:
            logging.info("No candidate certificates found. Switching to "
                         "bootstrap/update mode.")
            known_certs = None

        if not known_certs:
            # Bootstrap/upgrade phase: only the server certificate is
            # trusted
            bootstrap_digest = utils.GetCertificateDigest(
                cert_filename=pathutils.NODED_CERT_FILE)
            known_certs = {constants.CRYPTO_BOOTSTRAP: bootstrap_digest}

        peer_digest = cert.digest("sha1")
        accepted = peer_digest in known_certs.values()
        if not accepted:
            logging.debug(
                "Received certificate which is not a certificate of a"
                " master candidate. Certificate digest: %s. List of master"
                " candidate certificate digests: %s.", peer_digest,
                str(known_certs))
        return accepted

    # Negative depths are not meaningful
    logging.error("Invalid errdepth value: %s.", errdepth)
    return False
Exemplo n.º 26
0
def GenericMain(daemon_name,
                optionparser,
                check_fn,
                prepare_fn,
                exec_fn,
                multithreaded=False,
                console_logging=False,
                default_ssl_cert=None,
                default_ssl_key=None,
                warn_breach=False):
    """Shared main function for daemons.

    Registers the common command line options, parses the command line,
    validates the bind/SSL settings and the user the daemon runs as,
    optionally daemonizes, sets up logging and the PID file and finally
    hands over to C{exec_fn}. The PID file is removed again when C{exec_fn}
    returns or raises.

    @type daemon_name: string
    @param daemon_name: daemon name
    @type optionparser: optparse.OptionParser
    @param optionparser: initialized optionparser with daemon-specific
        options (common -f -d options will be handled by this module)
    @type check_fn: function which accepts (options, args)
    @param check_fn: function that checks start conditions and exits if
        they're not met, or None
    @type prepare_fn: function which accepts (options, args)
    @param prepare_fn: function that is run before forking, or None;
        its result will be passed as the third parameter to exec_fn, or
        if None was passed in, we will just pass None to exec_fn
    @type exec_fn: function which accepts (options, args, prepare_results)
    @param exec_fn: function that's executed with the daemon's pid file
        held, and runs the daemon itself.
    @type multithreaded: bool
    @param multithreaded: Whether the daemon uses threads
    @type console_logging: boolean
    @param console_logging: if True, the daemon will fall back to the system
        console if logging fails
    @type default_ssl_cert: string
    @param default_ssl_cert: Default SSL certificate path
    @type default_ssl_key: string
    @param default_ssl_key: Default SSL key path
    @type warn_breach: bool
    @param warn_breach: issue a warning at daemon launch time, before
        daemonizing, about the possibility of breaking parameter privacy
        invariants through the otherwise helpful debug logging.

    """
    optionparser.add_option("-f",
                            "--foreground",
                            dest="fork",
                            help="Don't detach from the current terminal",
                            default=True,
                            action="store_false")
    optionparser.add_option("-d",
                            "--debug",
                            dest="debug",
                            help="Enable some debug messages",
                            default=False,
                            action="store_true")
    optionparser.add_option("--syslog",
                            dest="syslog",
                            help="Enable logging to syslog (except debug"
                            " messages); one of 'no', 'yes' or 'only' [%s]" %
                            constants.SYSLOG_USAGE,
                            default=constants.SYSLOG_USAGE,
                            choices=["no", "yes", "only"])

    family = ssconf.SimpleStore().GetPrimaryIPFamily()
    # family will default to AF_INET if there is no ssconf file (e.g. when
    # upgrading a cluster from 2.2 -> 2.3. This is intended, as Ganeti clusters
    # <= 2.2 can not be AF_INET6
    if daemon_name in constants.DAEMONS_PORTS:
        default_bind_address = constants.IP4_ADDRESS_ANY
        if family == netutils.IP6Address.family:
            default_bind_address = constants.IP6_ADDRESS_ANY

        default_port = netutils.GetDaemonPort(daemon_name)

        # For networked daemons we allow choosing the port and bind address
        optionparser.add_option("-p",
                                "--port",
                                dest="port",
                                help="Network port (default: %s)" %
                                default_port,
                                default=default_port,
                                type="int")
        optionparser.add_option("-b",
                                "--bind",
                                dest="bind_address",
                                help=("Bind address (default: '%s')" %
                                      default_bind_address),
                                default=default_bind_address,
                                metavar="ADDRESS")
        optionparser.add_option("-i",
                                "--interface",
                                dest="bind_interface",
                                help=("Bind interface"),
                                metavar="INTERFACE")

    # The SSL options are only offered if the caller supplied both defaults
    if default_ssl_key is not None and default_ssl_cert is not None:
        optionparser.add_option("--no-ssl",
                                dest="ssl",
                                help="Do not secure HTTP protocol with SSL",
                                default=True,
                                action="store_false")
        optionparser.add_option("-K",
                                "--ssl-key",
                                dest="ssl_key",
                                help=("SSL key path (default: %s)" %
                                      default_ssl_key),
                                default=default_ssl_key,
                                type="string",
                                metavar="SSL_KEY_PATH")
        optionparser.add_option("-C",
                                "--ssl-cert",
                                dest="ssl_cert",
                                help=("SSL certificate path (default: %s)" %
                                      default_ssl_cert),
                                default=default_ssl_cert,
                                type="string",
                                metavar="SSL_CERT_PATH")

    # Disable the use of fork(2) if the daemon uses threads
    if multithreaded:
        utils.DisableFork()

    options, args = optionparser.parse_args()

    # "bind_interface" only exists for networked daemons (see above), in
    # which case default_bind_address is defined as well. An interface may
    # only be given if the bind address was left at its default; it is then
    # resolved to the interface's first address of the primary IP family.
    if getattr(options, "bind_interface", None) is not None:
        if options.bind_address != default_bind_address:
            msg = (
                "Can't specify both, bind address (%s) and bind interface (%s)"
                % (options.bind_address, options.bind_interface))
            print(msg, file=sys.stderr)
            sys.exit(constants.EXIT_FAILURE)
        interface_ip_addresses = \
          netutils.GetInterfaceIpAddresses(options.bind_interface)
        if family == netutils.IP6Address.family:
            if_addresses = interface_ip_addresses[constants.IP6_VERSION]
        else:
            if_addresses = interface_ip_addresses[constants.IP4_VERSION]
        if len(if_addresses) < 1:
            # The attribute is "bind_interface"; the previous misspelling
            # raised AttributeError instead of reporting the problem
            msg = "Failed to find IP for interface %s" % options.bind_interface
            print(msg, file=sys.stderr)
            sys.exit(constants.EXIT_FAILURE)
        options.bind_address = if_addresses[0]

    # "ssl" is missing for daemons without SSL options and False if
    # --no-ssl was given
    if getattr(options, "ssl", False):
        ssl_paths = {
            "certificate": options.ssl_cert,
            "key": options.ssl_key,
        }

        for name, path in ssl_paths.items():
            if not os.path.isfile(path):
                print("SSL %s file '%s' was not found" % (name, path),
                      file=sys.stderr)
                sys.exit(constants.EXIT_FAILURE)

        # TODO: By initiating http.HttpSslParams here we would only read the files
        # once and have a proper validation (isfile returns False on directories)
        # at the same time.

    # Refuse to start under a user ID other than the expected one
    result, running_uid, expected_uid = _VerifyDaemonUser(daemon_name)
    if not result:
        msg = ("%s started using wrong user ID (%d), expected %d" %
               (daemon_name, running_uid, expected_uid))
        print(msg, file=sys.stderr)
        sys.exit(constants.EXIT_FAILURE)

    if check_fn is not None:
        check_fn(options, args)

    log_filename = constants.DAEMONS_LOGFILES[daemon_name]

    # node-daemon logging in lib/http/server.py, _HandleServerRequestInner
    if options.debug and warn_breach:
        sys.stderr.write(constants.DEBUG_MODE_CONFIDENTIALITY_WARNING %
                         daemon_name)

    if options.fork:
        # Newer GnuTLS versions (>= 3.3.0) use a library constructor for
        # initialization and open /dev/urandom on library load time, way before we
        # fork(). Closing /dev/urandom causes subsequent ganeti.http.client
        # requests to fail and the process to receive a SIGABRT. As we cannot
        # reliably detect GnuTLS's socket, we work our way around this by keeping
        # all fds referring to /dev/urandom open.
        noclose_fds = []
        for fd in os.listdir("/proc/self/fd"):
            try:
                if os.readlink(os.path.join("/proc/self/fd",
                                            fd)) == "/dev/urandom":
                    noclose_fds.append(int(fd))
            except EnvironmentError:
                # The fd might have disappeared (although it shouldn't as we're running
                # single-threaded).
                continue

        utils.CloseFDs(noclose_fds=noclose_fds)
        (wpipe, stdio_reopen_fn) = utils.Daemonize(logfile=log_filename)
    else:
        (wpipe, stdio_reopen_fn) = (None, None)

    log_reopen_fn = \
      utils.SetupLogging(log_filename, daemon_name,
                         debug=options.debug,
                         stderr_logging=not options.fork,
                         multithreaded=multithreaded,
                         syslog=options.syslog,
                         console_logging=console_logging)

    # Reopen log file(s) on SIGHUP
    signal.signal(
        signal.SIGHUP,
        compat.partial(_HandleSigHup, [log_reopen_fn, stdio_reopen_fn]))

    try:
        utils.WritePidFile(utils.DaemonPidFileName(daemon_name))
    except errors.PidFileLockError as err:
        print("Error while locking PID file:\n%s" % err, file=sys.stderr)
        sys.exit(constants.EXIT_FAILURE)

    try:
        try:
            logging.info("%s daemon startup", daemon_name)
            if callable(prepare_fn):
                prep_results = prepare_fn(options, args)
            else:
                prep_results = None
        except Exception as err:
            # Pass the error on through the pipe before re-raising it
            utils.WriteErrorToFD(wpipe, _BeautifyError(err))
            raise

        if wpipe is not None:
            # we're done with the preparation phase, we close the pipe to
            # let the parent know it's safe to exit
            os.close(wpipe)

        exec_fn(options, args, prep_results)
    finally:
        utils.RemoveFile(utils.DaemonPidFileName(daemon_name))
Exemplo n.º 27
0
def RunNodeSetupCmd(cluster_name, node, basecmd, debug, verbose,
                    use_cluster_key, ask_key, strict_host_check,
                    port, data):
  """Executes a setup script on a remote machine through SSH.

  The script is preceded by a test for the version-specific library
  directory and by commands pointing the "ganeti/lib" and "ganeti/share"
  links below the configuration directory at the directories of this
  version. C{data} is serialized as JSON and handed to the remote command
  on standard input. Afterwards the function waits for the SSH daemon of
  the node.

  @type cluster_name: string
  @param cluster_name: Cluster name
  @type node: string
  @param node: Node name
  @type basecmd: string
  @param basecmd: Base command (path on the remote machine)
  @type debug: bool
  @param debug: Enable debug output
  @type verbose: bool
  @param verbose: Enable verbose output
  @type use_cluster_key: bool
  @param use_cluster_key: See L{ssh.SshRunner.BuildCmd}
  @type ask_key: bool
  @param ask_key: See L{ssh.SshRunner.BuildCmd}
  @type strict_host_check: bool
  @param strict_host_check: See L{ssh.SshRunner.BuildCmd}
  @type port: int
  @param port: The SSH port of the remote machine or None for the default
  @param data: JSON-serializable input data for script (passed to stdin)
  @raise errors.OpExecError: if the SSH command failed

  """
  # Our own --debug/--verbose settings are forwarded to the script
  setup_cmd = [basecmd]
  for (enabled, flag) in [(debug, "--debug"), (verbose, "--verbose")]:
    if enabled:
      setup_cmd.append(flag)

  logging.debug("Node setup command: %s", setup_cmd)

  version = constants.DIR_VERSION
  lib_target = os.path.join(pathutils.PKGLIBDIR, version)
  share_target = os.path.join(pathutils.SHAREDIR, version)
  links = [
    (lib_target, os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")),
    (share_target, os.path.join(pathutils.SYSCONFDIR, "ganeti/share")),
    ]

  all_cmds = [["test", "-d", lib_target]]
  gnu_ln = constants.HAS_GNU_LN
  for (target, link_name) in links:
    if gnu_ln:
      all_cmds.append(["ln", "-s", "-f", "-T", target, link_name])
    else:
      # Without GNU ln (no "-T") the old link is removed first
      all_cmds.append(["rm", "-f", link_name])
      all_cmds.append(["ln", "-s", "-f", target, link_name])
  all_cmds.append(setup_cmd)

  if port is None:
    port = netutils.GetDaemonPort(constants.SSH)

  family = ssconf.SimpleStore().GetPrimaryIPFamily()
  use_ipv6 = (family == netutils.IP6Address.family)
  runner = ssh.SshRunner(cluster_name, ipv6=use_ipv6)
  remote_cmdline = \
    utils.ShellQuoteArgs(utils.ShellCombineCommands(all_cmds))
  ssh_cmd = runner.BuildCmd(node, constants.SSH_LOGIN_USER, remote_cmdline,
                            batch=False, ask_key=ask_key, quiet=False,
                            strict_host_check=strict_host_check,
                            use_cluster_key=use_cluster_key,
                            port=port)

  # The input data travels through a temporary file used as stdin
  with tempfile.TemporaryFile() as stdin_file:
    stdin_file.write(serializer.DumpJson(data))
    stdin_file.seek(0)

    result = utils.RunCmd(ssh_cmd, interactive=True, input_fd=stdin_file)

  if result.failed:
    raise errors.OpExecError("Command '%s' failed: %s" %
                             (result.cmd, result.fail_reason))

  _WaitForSshDaemon(node, port, family)
Exemplo n.º 28
0
class Transport:
    """Low-level transport class.

    This is used on the client side.

    This could be replaced by any other class that provides the same
    semantics to the Client. This means:
      - can send messages and receive messages
      - safe for multithreading

    """
    def __init__(self, address, timeouts=None):
        """Constructor for the Transport class.

        Creates a UNIX stream socket and connects it to C{address},
        retrying until the connect timeout expires.

        Arguments:
          - address: a valid address for the used transport class
          - timeouts: a pair of timeouts, to be used on connect and
            read/write respectively

        There are two timeouts used since we might want to wait for a long
        time for a response, but the connect timeout should be lower.

        If not passed, the defaults DEF_CTMO and DEF_RWTO are used
        (documented as 10 and respectively 60 seconds).

        Note that on reading data, since the timeout applies to an
        individual receive, it might be that the total duration is longer
        than timeout value passed (we make a hard limit at twice the read
        timeout).

        Raises errors.TimeoutError if the connection could not be
        established in time; errors.NoMasterError, errors.PermissionError
        and socket.error are passed on from the connection attempt.

        """
        self.address = address
        if timeouts is None:
            self._ctimeout, self._rwtimeout = DEF_CTMO, DEF_RWTO
        else:
            self._ctimeout, self._rwtimeout = timeouts

        self.socket = None
        self._buffer = ""
        self._msgs = collections.deque()

        try:
            self.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)

            # Try to connect
            try:
                utils.Retry(self._Connect,
                            1.0,
                            self._ctimeout,
                            args=(self.socket, address, self._ctimeout))
            except utils.RetryTimeout:
                raise errors.TimeoutError("Connect timed out")

            self.socket.settimeout(self._rwtimeout)
        except (socket.error, errors.NoMasterError, errors.TimeoutError,
                errors.PermissionError):
            # Timeouts and permission errors are raised by the very same
            # connection attempt, so the socket is released for them too
            # instead of being left open
            if self.socket is not None:
                self.socket.close()
            self.socket = None
            raise

    @staticmethod
    def _Connect(sock, address, timeout):
        """Makes a single connection attempt, for use with utils.Retry.

        Arguments:
          - sock: the socket to connect
          - address: the address to connect to
          - timeout: the timeout set on the socket for this attempt

        Raises utils.RetryAgain if the attempt should be repeated, that is
        if the socket does not exist or refuses connections although the
        master is not known to be another node, or if the connect fails
        with EAGAIN. Raises errors.TimeoutError on a socket timeout,
        errors.NoMasterError if this node is not the master or the master
        cannot be determined, and errors.PermissionError on EPERM/EACCES.
        Any other socket error is re-raised unchanged.

        """
        sock.settimeout(timeout)
        try:
            sock.connect(address)
        # "except ... as" works on Python 2.6+ and Python 3; the comma form
        # is a syntax error on Python 3
        except socket.timeout as err:
            raise errors.TimeoutError("Connect timed out: %s" % str(err))
        except socket.error as err:
            error_code = err.args[0]
            if error_code in (errno.ENOENT, errno.ECONNREFUSED):
                # Verify if we're actually on the master node before trying
                # again.
                ss = ssconf.SimpleStore()
                try:
                    master, myself = ssconf.GetMasterAndMyself(ss=ss)
                except ganeti.errors.ConfigurationError:
                    raise errors.NoMasterError(address)
                if master != myself:
                    raise errors.NoMasterError(address)
                raise utils.RetryAgain()
            elif error_code in (errno.EPERM, errno.EACCES):
                raise errors.PermissionError(address)
            elif error_code == errno.EAGAIN:
                # Server's socket backlog is full at the moment
                raise utils.RetryAgain()
            raise
Exemplo n.º 29
0
def GenericMain(daemon_name,
                optionparser,
                check_fn,
                prepare_fn,
                exec_fn,
                multithreaded=False,
                console_logging=False,
                default_ssl_cert=None,
                default_ssl_key=None):
    """Shared main function for daemons.

  @type daemon_name: string
  @param daemon_name: daemon name
  @type optionparser: optparse.OptionParser
  @param optionparser: initialized optionparser with daemon-specific options
                       (common -f -d options will be handled by this module)
  @type check_fn: function which accepts (options, args)
  @param check_fn: function that checks start conditions and exits if they're
                   not met
  @type prepare_fn: function which accepts (options, args)
  @param prepare_fn: function that is run before forking, or None;
      its result will be passed as the third parameter to exec_fn, or
      if None was passed in, we will just pass None to exec_fn
  @type exec_fn: function which accepts (options, args, prepare_results)
  @param exec_fn: function that's executed with the daemon's pid file held, and
                  runs the daemon itself.
  @type multithreaded: bool
  @param multithreaded: Whether the daemon uses threads
  @type console_logging: boolean
  @param console_logging: if True, the daemon will fall back to the system
                          console if logging fails
  @type default_ssl_cert: string
  @param default_ssl_cert: Default SSL certificate path
  @type default_ssl_key: string
  @param default_ssl_key: Default SSL key path

  """
    optionparser.add_option("-f",
                            "--foreground",
                            dest="fork",
                            help="Don't detach from the current terminal",
                            default=True,
                            action="store_false")
    optionparser.add_option("-d",
                            "--debug",
                            dest="debug",
                            help="Enable some debug messages",
                            default=False,
                            action="store_true")
    optionparser.add_option("--syslog",
                            dest="syslog",
                            help="Enable logging to syslog (except debug"
                            " messages); one of 'no', 'yes' or 'only' [%s]" %
                            constants.SYSLOG_USAGE,
                            default=constants.SYSLOG_USAGE,
                            choices=["no", "yes", "only"])

    family = ssconf.SimpleStore().GetPrimaryIPFamily()
    # family will default to AF_INET if there is no ssconf file (e.g. when
    # upgrading a cluster from 2.2 -> 2.3). This is intended, as Ganeti
    # clusters <= 2.2 cannot be AF_INET6.
    if daemon_name in constants.DAEMONS_PORTS:
        default_bind_address = constants.IP4_ADDRESS_ANY
        if family == netutils.IP6Address.family:
            default_bind_address = constants.IP6_ADDRESS_ANY

        default_port = netutils.GetDaemonPort(daemon_name)

        # For networked daemons we allow choosing the port and bind address
        optionparser.add_option("-p",
                                "--port",
                                dest="port",
                                help="Network port (default: %s)" %
                                default_port,
                                default=default_port,
                                type="int")
        optionparser.add_option("-b",
                                "--bind",
                                dest="bind_address",
                                help=("Bind address (default: '%s')" %
                                      default_bind_address),
                                default=default_bind_address,
                                metavar="ADDRESS")
        optionparser.add_option("-i",
                                "--interface",
                                dest="bind_interface",
                                help=("Bind interface"),
                                metavar="INTERFACE")

    if default_ssl_key is not None and default_ssl_cert is not None:
        optionparser.add_option("--no-ssl",
                                dest="ssl",
                                help="Do not secure HTTP protocol with SSL",
                                default=True,
                                action="store_false")
        optionparser.add_option("-K",
                                "--ssl-key",
                                dest="ssl_key",
                                help=("SSL key path (default: %s)" %
                                      default_ssl_key),
                                default=default_ssl_key,
                                type="string",
                                metavar="SSL_KEY_PATH")
        optionparser.add_option("-C",
                                "--ssl-cert",
                                dest="ssl_cert",
                                help=("SSL certificate path (default: %s)" %
                                      default_ssl_cert),
                                default=default_ssl_cert,
                                type="string",
                                metavar="SSL_CERT_PATH")

    # Disable the use of fork(2) if the daemon uses threads
    if multithreaded:
        utils.DisableFork()

    options, args = optionparser.parse_args()

    if getattr(options, "bind_interface", None) is not None:
        if options.bind_address != default_bind_address:
            msg = (
                "Can't specify both, bind address (%s) and bind interface (%s)"
                % (options.bind_address, options.bind_interface))
            print >> sys.stderr, msg
            sys.exit(constants.EXIT_FAILURE)
        interface_ip_addresses = \
          netutils.GetInterfaceIpAddresses(options.bind_interface)
        if family == netutils.IP6Address.family:
            if_addresses = interface_ip_addresses[constants.IP6_VERSION]
        else:
            if_addresses = interface_ip_addresses[constants.IP4_VERSION]
        if len(if_addresses) < 1:
            msg = "Failed to find IP for interface %s" % options.bind_interace
            print >> sys.stderr, msg
            sys.exit(constants.EXIT_FAILURE)
        options.bind_address = if_addresses[0]

    if getattr(options, "ssl", False):
        ssl_paths = {
            "certificate": options.ssl_cert,
            "key": options.ssl_key,
        }

        for name, path in ssl_paths.iteritems():
            if not os.path.isfile(path):
                print >> sys.stderr, "SSL %s file '%s' was not found" % (name,
                                                                         path)
                sys.exit(constants.EXIT_FAILURE)

        # TODO: By initiating http.HttpSslParams here we would only read the files
        # once and have a proper validation (isfile returns False on directories)
        # at the same time.

    result, running_uid, expected_uid = _VerifyDaemonUser(daemon_name)
    if not result:
        msg = ("%s started using wrong user ID (%d), expected %d" %
               (daemon_name, running_uid, expected_uid))
        print >> sys.stderr, msg
        sys.exit(constants.EXIT_FAILURE)

    if check_fn is not None:
        check_fn(options, args)

    log_filename = constants.DAEMONS_LOGFILES[daemon_name]

    if options.fork:
        utils.CloseFDs()
        (wpipe, stdio_reopen_fn) = utils.Daemonize(logfile=log_filename)
    else:
        (wpipe, stdio_reopen_fn) = (None, None)

    log_reopen_fn = \
      utils.SetupLogging(log_filename, daemon_name,
                         debug=options.debug,
                         stderr_logging=not options.fork,
                         multithreaded=multithreaded,
                         syslog=options.syslog,
                         console_logging=console_logging)

    # Reopen log file(s) on SIGHUP
    signal.signal(
        signal.SIGHUP,
        compat.partial(_HandleSigHup, [log_reopen_fn, stdio_reopen_fn]))

    try:
        utils.WritePidFile(utils.DaemonPidFileName(daemon_name))
    except errors.PidFileLockError, err:
        print >> sys.stderr, "Error while locking PID file:\n%s" % err
        sys.exit(constants.EXIT_FAILURE)