Example #1
    def perspective_jobqueue_set_drain_flag(params):
        """Set job queue's drain flag.

    """
        (flag, ) = params

        return jstore.SetDrainFlag(flag)
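The handler receives its arguments packed in a single sequence and unpacks the lone flag before forwarding it to jstore.SetDrainFlag. Below is a minimal, self-contained sketch of that calling convention; the stub jstore object and the direct call are assumptions for illustration only, not Ganeti's actual RPC dispatch.

class _StubJstore:
    # Stand-in for ganeti.jstore, used here only to make the sketch runnable.
    def SetDrainFlag(self, flag):
        print("job queue drain flag ->", flag)
        return True

jstore = _StubJstore()

def perspective_jobqueue_set_drain_flag(params):
    """Set job queue's drain flag."""
    (flag, ) = params          # params is a one-element sequence
    return jstore.SetDrainFlag(flag)

# The node daemon would pass the decoded parameter list; here we call directly.
perspective_jobqueue_set_drain_flag([True])    # drain the queue
perspective_jobqueue_set_drain_flag([False])   # undrain it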
Example #2
File: bootstrap.py  Project: vali-um/ganeti
def MasterFailover(no_voting=False):
    """Failover the master node.

    This checks that we are not already the master, and will cause the
    current master to cease being master, and the non-master to become
    new master.

    Note: The call to MasterFailover from lib/client/gnt_cluster.py checks that
    a majority of nodes are healthy and responding before calling this. If this
    function is called from somewhere else, the caller should also verify that a
    majority of nodes are healthy.

    @type no_voting: boolean
    @param no_voting: force the operation without remote nodes agreement
                        (dangerous)

    @returns: the pair of an exit code and warnings to display
    """
    sstore = ssconf.SimpleStore()

    old_master, new_master = ssconf.GetMasterAndMyself(sstore)
    node_names = sstore.GetNodeList()
    mc_list = sstore.GetMasterCandidates()

    if old_master == new_master:
        raise errors.OpPrereqError(
            "This commands must be run on the node"
            " where you want the new master to be."
            " %s is already the master" % old_master, errors.ECODE_INVAL)

    if new_master not in mc_list:
        mc_no_master = [name for name in mc_list if name != old_master]
        raise errors.OpPrereqError(
            "This node is not among the nodes marked"
            " as master candidates. Only these nodes"
            " can become masters. Current list of"
            " master candidates is:\n"
            "%s" % ("\n".join(mc_no_master)), errors.ECODE_STATE)

    if not no_voting:
        vote_list = _GatherMasterVotes(node_names)
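        # _GatherMasterVotes tallies which node each responder considers the
        # master and returns (master_name, vote_count) pairs, most-voted first,
        # so vote_list[0][0] is the consensus master.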
        if vote_list:
            voted_master = vote_list[0][0]
            if voted_master != old_master:
                raise errors.OpPrereqError(
                    "I have a wrong configuration, I believe"
                    " the master is %s but the other nodes"
                    " voted %s. Please resync the configuration"
                    " of this node." % (old_master, voted_master),
                    errors.ECODE_STATE)
    # end checks

    rcode = 0
    warnings = []

    logging.info("Setting master to %s, old master: %s", new_master,
                 old_master)

    try:
        # Forcefully start WConfd so that we can access the configuration
        result = utils.RunCmd([
            pathutils.DAEMON_UTIL, "start", constants.WCONFD, "--force-node",
            "--no-voting", "--yes-do-it"
        ])
        if result.failed:
            raise errors.OpPrereqError(
                "Could not start the configuration daemon,"
                " command %s had exitcode %s and error %s" %
                (result.cmd, result.exit_code, result.output),
                errors.ECODE_NOENT)

        # instantiate a real config writer, as we now know we have the
        # configuration data
        livelock = utils.livelock.LiveLock("bootstrap_failover")
        cfg = config.GetConfig(None, livelock, accept_foreign=True)

        old_master_node = cfg.GetNodeInfoByName(old_master)
        if old_master_node is None:
            raise errors.OpPrereqError(
                "Could not find old master node '%s' in"
                " cluster configuration." % old_master, errors.ECODE_NOENT)

        cluster_info = cfg.GetClusterInfo()
        new_master_node = cfg.GetNodeInfoByName(new_master)
        if new_master_node is None:
            raise errors.OpPrereqError(
                "Could not find new master node '%s' in"
                " cluster configuration." % new_master, errors.ECODE_NOENT)

        cluster_info.master_node = new_master_node.uuid
        # this will also regenerate the ssconf files, since we updated the
        # cluster info
        cfg.Update(cluster_info, logging.error)

        # if cfg.Update worked, then it means the old master daemon won't be
        # able to write its own config file anymore (we rely on locking in
        # both backend.UploadFile() and ConfigWriter._Write()); hence the
        # next step is to kill the old master

        logging.info("Stopping the master daemon on node %s", old_master)

        runner = rpc.BootstrapRunner()
        master_params = cfg.GetMasterNetworkParameters()
        master_params.uuid = old_master_node.uuid
        ems = cfg.GetUseExternalMipScript()
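        # "ems" tells the deactivate RPC whether the cluster manages the master
        # IP through an external script instead of the built-in handling.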
        result = runner.call_node_deactivate_master_ip(old_master,
                                                       master_params, ems)

        msg = result.fail_msg
        if msg:
            warning = "Could not disable the master IP: %s" % (msg, )
            logging.warning("%s", warning)
            warnings.append(warning)

        result = runner.call_node_stop_master(old_master)
        msg = result.fail_msg
        if msg:
            warning = ("Could not disable the master role on the old master"
                       " %s, please disable manually: %s" % (old_master, msg))
            logging.error("%s", warning)
            warnings.append(warning)
    except errors.ConfigurationError as err:
        logging.error("Error while trying to set the new master: %s", str(err))
        return 1, warnings
    finally:
        # stop WConfd again:
        result = utils.RunCmd(
            [pathutils.DAEMON_UTIL, "stop", constants.WCONFD])
        if result.failed:
            warning = ("Could not stop the configuration daemon,"
                       " command %s had exitcode %s and error %s" %
                       (result.cmd, result.exit_code, result.output))
            logging.error("%s", warning)
            rcode = 1

    logging.info("Checking master IP non-reachability...")

    master_ip = sstore.GetMasterIP()
    total_timeout = 30

    # Here we have a phase where no master should be running
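    # _check_ip raises RetryAgain until TcpPing's result matches `expected`;
    # utils.Retry keeps calling it with a growing delay (start 1s, factor 1.5,
    # capped at 5s) until total_timeout is exceeded.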
    def _check_ip(expected):
        if netutils.TcpPing(master_ip,
                            constants.DEFAULT_NODED_PORT) != expected:
            raise utils.RetryAgain()

    try:
        utils.Retry(_check_ip, (1, 1.5, 5), total_timeout, args=[False])
    except utils.RetryTimeout:
        warning = ("The master IP is still reachable after %s seconds,"
                   " continuing but activating the master IP on the current"
                   " node will probably fail" % total_timeout)
        logging.warning("%s", warning)
        warnings.append(warning)
        rcode = 1

    if jstore.CheckDrainFlag():
        logging.info("Undraining job queue")
        jstore.SetDrainFlag(False)

    logging.info("Starting the master daemons on the new master")

    result = rpc.BootstrapRunner().call_node_start_master_daemons(
        new_master, no_voting)
    msg = result.fail_msg
    if msg:
        logging.error(
            "Could not start the master role on the new master"
            " %s, please check: %s", new_master, msg)
        rcode = 1

    # Finally verify that the new master managed to set up the master IP
    # and warn if it didn't.
    try:
        utils.Retry(_check_ip, (1, 1.5, 5), total_timeout, args=[True])
    except utils.RetryTimeout:
        warning = ("The master IP did not come up within %s seconds; the"
                   " cluster should still be working and reachable via %s,"
                   " but not via the master IP address" %
                   (total_timeout, new_master))
        logging.warning("%s", warning)
        warnings.append(warning)
        rcode = 1

    logging.info("Master failed over from %s to %s", old_master, new_master)
    return rcode, warnings
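The docstring above notes that lib/client/gnt_cluster.py verifies that a majority of nodes are healthy before invoking this function. A hedged sketch of such a caller follows; the wrapper name is made up and the majority-health check is deliberately elided, so this is not Ganeti's actual client code.

from ganeti import bootstrap

def run_master_failover(no_voting=False):
    # A real caller should first confirm that a majority of nodes respond
    # (as gnt_cluster does before calling MasterFailover); elided here.
    rcode, warnings = bootstrap.MasterFailover(no_voting=no_voting)
    for warning in warnings:
        print("Warning: %s" % warning)
    return rcode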