Example #1
def config_sync(env, skip_offline_nodes=False):
    """
    Send the specified local booth configuration to all nodes in the cluster.

    env -- LibraryEnvironment
    skip_offline_nodes -- if True, offline nodes will be skipped
    """
    config = env.booth.get_config_content()
    authfile_path = config_structure.get_authfile(parse(config))
    authfile_content = config_files.read_authfile(
        env.report_processor, authfile_path
    )

    cluster_nodes_names, report_list = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    if not cluster_nodes_names:
        report_list.append(reports.corosync_config_no_nodes_defined())
    env.report_processor.process_list(report_list)

    com_cmd = BoothSendConfig(
        env.report_processor,
        env.booth.name,
        config,
        authfile=authfile_path,
        authfile_data=authfile_content,
        skip_offline_targets=skip_offline_nodes
    )
    com_cmd.set_targets(
        env.get_node_target_factory().get_target_list(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        )
    )
    run_and_raise(env.get_node_communicator(), com_cmd)
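
All examples on this page share the same calling convention. Below is a minimal, inferred sketch of that contract (this is not the real pcs implementation; the parameter names are taken from the call sites in the examples):

# Inferred sketch only -- see pcs for the actual implementation.
# Returns a pair: (list of node names, list of report items).
# corosync_conf -- corosync.conf facade to read cluster nodes from
# cib -- optional CIB, presumably used to pick up additional node names
#     (passed in Examples #14 and #19)
# error_on_missing_name -- if True, nodes missing a name are reported as an
#     error (see the comment in Example #3)
def get_existing_nodes_names(
    corosync_conf=None, cib=None, error_on_missing_name=False
):
    ...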
Example #2
def get_cluster_sbd_config(lib_env):
    """
    Return a list of SBD configs from all nodes in the cluster. Structure
    of the data:
    [
        {
            "node": <NodeAddress>,
            "config": <sbd_config_dict> or None if there was a failure,
        },
        ...
    ]
    If an error occurs while obtaining the config from some node, its config
    will be None. If obtaining the config fails on all nodes, an empty list is
    returned.

    lib_env -- LibraryEnvironment
    """
    node_list, get_nodes_report_list = get_existing_nodes_names(
        lib_env.get_corosync_conf()
    )
    if not node_list:
        get_nodes_report_list.append(reports.corosync_config_no_nodes_defined())
    lib_env.report_processor.process_list(get_nodes_report_list)

    com_cmd = GetSbdConfig(lib_env.report_processor)
    com_cmd.set_targets(
        lib_env.get_node_target_factory().get_target_list(
            node_list,
            skip_non_existing=True
        )
    )
    return run_com(lib_env.get_node_communicator(), com_cmd)
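
A hedged consumption sketch for the structure documented in the docstring above (assuming the command returns the list of per-node entries exactly as described; lib_env is a LibraryEnvironment as in the example):

for entry in get_cluster_sbd_config(lib_env):
    if entry["config"] is None:
        print("{0}: failed to obtain SBD config".format(entry["node"]))
    else:
        print("{0}: {1}".format(entry["node"], entry["config"]))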
Example #3
    def push_corosync_conf(
        self, corosync_conf_facade, skip_offline_nodes=False
    ):
        corosync_conf_data = corosync_conf_facade.config.export()
        if self.is_corosync_conf_live:
            node_name_list, report_list = get_existing_nodes_names(
                corosync_conf_facade,
                # Pcs is unable to communicate with nodes missing names. It
                # cannot send new corosync.conf to them. That might break the
                # cluster. Hence we error out.
                error_on_missing_name=True
            )
            self.report_processor.process_list(report_list)

            self._push_corosync_conf_live(
                self.get_node_target_factory().get_target_list(
                    node_name_list,
                    skip_non_existing=skip_offline_nodes,
                ),
                corosync_conf_data,
                corosync_conf_facade.need_stopped_cluster,
                corosync_conf_facade.need_qdevice_reload,
                skip_offline_nodes,
            )
        else:
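            # not live: keep the exported data; presumably it is later
            # written into the requested (ghost) file instead of being
            # pushed to the cluster nodes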
            self._corosync_conf_data = corosync_conf_data
Example #4
def get_cluster_sbd_status(lib_env):
    """
    Return the status of the SBD service on each node in the cluster as a
    dictionary in the format:
    {
        <NodeAddress>: {
            "installed": <boolean>,
            "enabled": <boolean>,
            "running": <boolean>
        },
        ...
    }

    lib_env -- LibraryEnvironment
    """
    node_list, get_nodes_report_list = get_existing_nodes_names(
        lib_env.get_corosync_conf()
    )
    if not node_list:
        get_nodes_report_list.append(reports.corosync_config_no_nodes_defined())
    lib_env.report_processor.process_list(get_nodes_report_list)

    com_cmd = GetSbdStatus(lib_env.report_processor)
    com_cmd.set_targets(
        lib_env.get_node_target_factory().get_target_list(
            node_list,
            skip_non_existing=True
        )
    )
    return run_com(lib_env.get_node_communicator(), com_cmd)
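
Likewise, a hedged sketch of consuming the mapping documented above (assuming the command returns the dictionary exactly as described):

for node, flags in get_cluster_sbd_status(lib_env).items():
    summary = ", ".join(
        name for name in ("installed", "enabled", "running") if flags[name]
    )
    print("{0}: {1}".format(node, summary or "not installed"))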
Example #5
def quorum_unblock_cmd(lib, argv, modifiers):
    """
    Options:
      * --force - do not ask for confirmation before fencing the unjoined
        nodes
    """
    modifiers.ensure_only_supported("--force")
    if argv:
        raise CmdLineInputError()

    output, retval = utils.run(
        ["corosync-cmapctl", "-g", "runtime.votequorum.wait_for_all_status"]
    )
    if retval != 0:
        utils.err("unable to check quorum status")
    if output.split("=")[-1].strip() != "1":
        utils.err("cluster is not waiting for nodes to establish quorum")

    all_nodes, report_list = get_existing_nodes_names(
        utils.get_corosync_conf_facade()
    )
    if report_list:
        utils.process_library_reports(report_list)

    unjoined_nodes = set(all_nodes) - set(utils.getCorosyncActiveNodes())
    if not unjoined_nodes:
        utils.err("no unjoined nodes found")
    if not modifiers.get("--force"):
        answer = utils.get_terminal_input(
            (
                "WARNING: If node(s) {nodes} are not powered off or they do"
                + " have access to shared resources, data corruption and/or"
                + " cluster failure may occur. Are you sure you want to"
                + " continue? [y/N] "
            ).format(nodes=", ".join(unjoined_nodes))
        )
        if answer.lower() not in ["y", "yes"]:
            print("Canceled")
            return
    for node in unjoined_nodes:
        # pass --force so no warning will be displayed
        stonith.stonith_confirm(
            lib, [node], parse_args.InputModifiers({"--force": ""})
        )

    output, retval = utils.run(
        ["corosync-cmapctl", "-s", "quorum.cancel_wait_for_all", "u8", "1"]
    )
    if retval != 0:
        utils.err("unable to cancel waiting for nodes")
    print("Quorum unblocked")

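    # Write startup-fencing twice: first flipped, then the original value.
    # Presumably this deliberately touches the CIB so the property change is
    # registered even though its effective value stays the same.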
    startup_fencing = utils.get_set_properties().get("startup-fencing", "")
    utils.set_cib_property(
        "startup-fencing",
        "false" if startup_fencing.lower() != "false" else "true"
    )
    utils.set_cib_property("startup-fencing", startup_fencing)
    print("Waiting for nodes canceled")
Example #6
def disable_sbd(lib_env, ignore_offline_nodes=False):
    """
    Disable SBD on all nodes in cluster.

    lib_env -- LibraryEnvironment
    ignore_offline_nodes -- if True, omit offline nodes
    """
    node_list, get_nodes_report_list = get_existing_nodes_names(
        lib_env.get_corosync_conf()
    )
    if not node_list:
        get_nodes_report_list.append(reports.corosync_config_no_nodes_defined())
    lib_env.report_processor.process_list(get_nodes_report_list)

    com_cmd = GetOnlineTargets(
        lib_env.report_processor, ignore_offline_targets=ignore_offline_nodes,
    )
    com_cmd.set_targets(
        lib_env.get_node_target_factory().get_target_list(
            node_list,
            skip_non_existing=ignore_offline_nodes,
        )
    )
    online_nodes = run_and_raise(lib_env.get_node_communicator(), com_cmd)

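    # Presumably setting stonith-watchdog-timeout to zero tells the cluster
    # to stop expecting watchdog fencing once SBD is disabled.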
    com_cmd = SetStonithWatchdogTimeoutToZero(lib_env.report_processor)
    com_cmd.set_targets(online_nodes)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    com_cmd = DisableSbdService(lib_env.report_processor)
    com_cmd.set_targets(online_nodes)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    lib_env.report_processor.process(
        reports.cluster_restart_required_to_apply_changes()
    )
Example #7
def nodes_status(lib, argv, modifiers):
    """
    Options:
      * -f - CIB file - only for the config subcommand, not for corosync or
        both
      * --corosync_conf - only for the config subcommand

    NOTE: modifiers check is in subcommand
    """
    del lib
    if len(argv) == 1 and (argv[0] == "config"):
        modifiers.ensure_only_supported("-f", "--corosync_conf")
        if utils.hasCorosyncConf():
            corosync_nodes, report_list = get_existing_nodes_names(
                utils.get_corosync_conf_facade())
            if report_list:
                process_library_reports(report_list)
        else:
            corosync_nodes = []
        try:
            pacemaker_nodes = sorted([
                node.attrs.name for node in ClusterState(
                    get_cluster_status_dom(
                        utils.cmd_runner())).node_section.nodes
                if node.attrs.type != "remote"
            ])
        except LibraryError as e:
            process_library_reports(e.args)
        print("Corosync Nodes:")
        if corosync_nodes:
            print(" " + " ".join(corosync_nodes))
        print("Pacemaker Nodes:")
        if pacemaker_nodes:
            print(" " + " ".join(pacemaker_nodes))

        return

    if len(argv) == 1 and (argv[0] == "corosync" or argv[0] == "both"):
        modifiers.ensure_only_supported()
        all_nodes, report_list = get_existing_nodes_names(
            utils.get_corosync_conf_facade())
        if report_list:
            process_library_reports(report_list)
        online_nodes = utils.getCorosyncActiveNodes()
        offline_nodes = []
        for node in all_nodes:
            if node not in online_nodes:
                offline_nodes.append(node)

        online_nodes.sort()
        offline_nodes.sort()
        print("Corosync Nodes:")
        print(" ".join([" Online:"] + online_nodes))
        print(" ".join([" Offline:"] + offline_nodes))
        if argv[0] != "both":
            sys.exit(0)

    modifiers.ensure_only_supported("-f")
    info_dom = utils.getClusterState()

    nodes = info_dom.getElementsByTagName("nodes")
    if nodes.length == 0:
        utils.err("No nodes section found")

    onlinenodes = []
    offlinenodes = []
    standbynodes = []
    standbynodes_with_resources = []
    maintenancenodes = []
    remote_onlinenodes = []
    remote_offlinenodes = []
    remote_standbynodes = []
    remote_standbynodes_with_resources = []
    remote_maintenancenodes = []
    for node in nodes[0].getElementsByTagName("node"):
        node_name = node.getAttribute("name")
        node_remote = node.getAttribute("type") == "remote"
        if node.getAttribute("online") == "true":
            if node.getAttribute("standby") == "true":
                is_running_resources = (node.getAttribute("resources_running")
                                        != "0")
                if node_remote:
                    if is_running_resources:
                        remote_standbynodes_with_resources.append(node_name)
                    else:
                        remote_standbynodes.append(node_name)
                else:
                    if is_running_resources:
                        standbynodes_with_resources.append(node_name)
                    else:
                        standbynodes.append(node_name)
            if node.getAttribute("maintenance") == "true":
                if node_remote:
                    remote_maintenancenodes.append(node_name)
                else:
                    maintenancenodes.append(node_name)
            if (node.getAttribute("standby") == "false"
                    and node.getAttribute("maintenance") == "false"):
                if node_remote:
                    remote_onlinenodes.append(node_name)
                else:
                    onlinenodes.append(node_name)
        else:
            if node_remote:
                remote_offlinenodes.append(node_name)
            else:
                offlinenodes.append(node_name)

    print("Pacemaker Nodes:")
    print(" ".join([" Online:"] + onlinenodes))
    print(" ".join([" Standby:"] + standbynodes))
    print(" ".join([" Standby with resource(s) running:"] +
                   standbynodes_with_resources))
    print(" ".join([" Maintenance:"] + maintenancenodes))
    print(" ".join([" Offline:"] + offlinenodes))

    print("Pacemaker Remote Nodes:")
    print(" ".join([" Online:"] + remote_onlinenodes))
    print(" ".join([" Standby:"] + remote_standbynodes))
    print(" ".join([" Standby with resource(s) running:"] +
                   remote_standbynodes_with_resources))
    print(" ".join([" Maintenance:"] + remote_maintenancenodes))
    print(" ".join([" Offline:"] + remote_offlinenodes))
Example #8
def status_all_sites_plaintext(
    env: LibraryEnvironment,
    hide_inactive_resources: bool = False,
    verbose: bool = False,
) -> List[Mapping[str, Any]]:
    """
    Return local site's and all remote sites' status as plaintext

    env -- LibraryEnvironment
    hide_inactive_resources -- if True, do not display non-running resources
    verbose -- if True, display more info
    """

    # The command does not provide an option to skip offline / unreachable /
    # misbehaving nodes.
    # The point of such skipping is to stop a command if it is unable to make
    # changes on all nodes. The user can then decide to proceed anyway and
    # make changes on the skipped nodes later manually.
    # This command only reads from nodes so it automatically asks other nodes
    # if one is offline / misbehaving.
    class SiteData():
        def __init__(
            self,
            local: bool,
            role: DrRole,
            target_list: Iterable[RequestTarget],
        ) -> None:
            self.local = local
            self.role = role
            self.target_list = target_list
            self.status_loaded = False
            self.status_plaintext = ""

    if env.ghost_file_codes:
        raise LibraryError(
            reports.live_environment_required(env.ghost_file_codes))

    report_processor = env.report_processor
    report_list, dr_config = _load_dr_config(env.get_dr_env().config)
    report_processor.report_list(report_list)
    if report_processor.has_errors:
        raise LibraryError()

    site_data_list = []
    target_factory = env.get_node_target_factory()

    # get local nodes
    local_nodes, report_list = get_existing_nodes_names(
        env.get_corosync_conf())
    report_processor.report_list(report_list)
    report_list, local_targets = target_factory.get_target_list_with_reports(
        local_nodes,
        skip_non_existing=True,
    )
    report_processor.report_list(report_list)
    site_data_list.append(SiteData(True, dr_config.local_role, local_targets))

    # get remote sites' nodes
    for conf_remote_site in dr_config.get_remote_site_list():
        report_list, remote_targets = (
            target_factory.get_target_list_with_reports(
                conf_remote_site.node_name_list,
                skip_non_existing=True,
            ))
        report_processor.report_list(report_list)
        site_data_list.append(
            SiteData(False, conf_remote_site.role, remote_targets))
    if report_processor.has_errors:
        raise LibraryError()

    # get all statuses
    for site_data in site_data_list:
        com_cmd = GetFullClusterStatusPlaintext(
            report_processor,
            hide_inactive_resources=hide_inactive_resources,
            verbose=verbose,
        )
        com_cmd.set_targets(site_data.target_list)
        site_data.status_loaded, site_data.status_plaintext = run_com_cmd(
            env.get_node_communicator(), com_cmd)

    return [
        dto.to_dict(
            DrSiteStatusDto(
                local_site=site_data.local,
                site_role=site_data.role,
                status_plaintext=site_data.status_plaintext,
                status_successfully_obtained=site_data.status_loaded,
            )) for site_data in site_data_list
    ]
Example #9
def add_device(
    lib_env, model, model_options, generic_options, heuristics_options,
    force_model=False, force_options=False, skip_offline_nodes=False
):
    """
    Add a quorum device to a cluster, distribute and reload configs if live

    string model -- quorum device model
    dict model_options -- model specific options
    dict generic_options -- generic quorum device options
    dict heuristics_options -- heuristics options
    bool force_model -- continue even if the model is not valid
    bool force_options -- continue even if options are not valid
    bool skip_offline_nodes -- continue even if not all nodes are accessible
    """
    cfg = lib_env.get_corosync_conf()
    if cfg.has_quorum_device():
        raise LibraryError(reports.qdevice_already_defined())

    report_processor = SimpleReportProcessor(lib_env.report_processor)
    report_processor.report_list(
        corosync_conf_validators.add_quorum_device(
            model,
            model_options,
            generic_options,
            heuristics_options,
            [node.nodeid for node in cfg.get_nodes()],
            force_model=force_model,
            force_options=force_options
        )
    )

    if lib_env.is_corosync_conf_live:
        cluster_nodes_names, report_list = get_existing_nodes_names(
            cfg,
            # Pcs is unable to communicate with nodes missing names. It cannot
            # send new corosync.conf to them. That might break the cluster.
            # Hence we error out.
            error_on_missing_name=True
        )
        report_processor.report_list(report_list)

    if report_processor.has_errors:
        raise LibraryError()

    cfg.add_quorum_device(
        model,
        model_options,
        generic_options,
        heuristics_options,
    )
    if cfg.is_quorum_device_heuristics_enabled_with_no_exec():
        lib_env.report_processor.process(
            reports.corosync_quorum_heuristics_enabled_with_no_exec()
        )

    # First setup certificates for qdevice, then send corosync.conf to nodes.
    # If anything fails, nodes will not have corosync.conf with qdevice in it,
    # so there is no effect on the cluster.
    if lib_env.is_corosync_conf_live:
        target_factory = lib_env.get_node_target_factory()
        target_list = target_factory.get_target_list(
            cluster_nodes_names, skip_non_existing=skip_offline_nodes,
        )
        # Do model specific configuration.
        # If the model is not known to pcs and was forced, do not configure
        # anything else than corosync.conf, as we do not know what to do
        # anyway.
        if model == "net":
            qdevice_net.set_up_client_certificates(
                lib_env.cmd_runner(),
                lib_env.report_processor,
                lib_env.communicator_factory,
                # We are sure the "host" key is there, it has been validated
                # above.
                target_factory.get_target_from_hostname(model_options["host"]),
                cfg.get_cluster_name(),
                target_list,
                skip_offline_nodes
            )

        lib_env.report_processor.process(
            reports.service_enable_started("corosync-qdevice")
        )
        com_cmd = qdevice_com.Enable(
            lib_env.report_processor, skip_offline_nodes
        )
        com_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # everything set up, it's safe to tell the nodes to use qdevice
    lib_env.push_corosync_conf(cfg, skip_offline_nodes)

    # Now, when corosync.conf has been reloaded, we can start qdevice service.
    if lib_env.is_corosync_conf_live:
        lib_env.report_processor.process(
            reports.service_start_started("corosync-qdevice")
        )
        com_cmd = qdevice_com.Start(
            lib_env.report_processor, skip_offline_nodes
        )
        com_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd)
Example #10
def remove_device(lib_env, skip_offline_nodes=False):
    """
    Stop using quorum device, distribute and reload configs if live
    skip_offline_nodes -- continue even if not all nodes are accessible
    """
    cfg = lib_env.get_corosync_conf()
    if not cfg.has_quorum_device():
        raise LibraryError(reports.qdevice_not_defined())
    model = cfg.get_quorum_device_model()
    cfg.remove_quorum_device()

    if lib_env.is_corosync_conf_live:
        report_processor = SimpleReportProcessor(lib_env.report_processor)
        # get nodes for communication
        cluster_nodes_names, report_list = get_existing_nodes_names(
            cfg,
            # Pcs is unable to communicate with nodes missing names. It cannot
            # send new corosync.conf to them. That might break the cluster.
            # Hence we error out.
            error_on_missing_name=True
        )
        report_processor.report_list(report_list)
        if report_processor.has_errors:
            raise LibraryError()
        target_list = lib_env.get_node_target_factory().get_target_list(
            cluster_nodes_names, skip_non_existing=skip_offline_nodes,
        )
        # fix quorum options for SBD to work properly
        if sbd.atb_has_to_be_enabled(lib_env.cmd_runner(), cfg):
            lib_env.report_processor.process(
                reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
            )
            cfg.set_quorum_options({"auto_tie_breaker": "1"})

        # disable qdevice
        lib_env.report_processor.process(
            reports.service_disable_started("corosync-qdevice")
        )
        com_cmd = qdevice_com.Disable(
            lib_env.report_processor, skip_offline_nodes
        )
        com_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd)
        # stop qdevice
        lib_env.report_processor.process(
            reports.service_stop_started("corosync-qdevice")
        )
        com_cmd = qdevice_com.Stop(
            lib_env.report_processor, skip_offline_nodes
        )
        com_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd)
        # handle model specific configuration
        if model == "net":
            lib_env.report_processor.process(
                reports.qdevice_certificate_removal_started()
            )
            com_cmd = qdevice_net_com.ClientDestroy(
                lib_env.report_processor, skip_offline_nodes
            )
            com_cmd.set_targets(target_list)
            run_and_raise(lib_env.get_node_communicator(), com_cmd)

    lib_env.push_corosync_conf(cfg, skip_offline_nodes)
Example #11
def remove_device(lib_env: LibraryEnvironment, skip_offline_nodes=False):
    """
    Stop using quorum device, distribute and reload configs if live
    skip_offline_nodes -- continue even if not all nodes are accessible
    """
    cfg = lib_env.get_corosync_conf()
    if not cfg.has_quorum_device():
        raise LibraryError(
            ReportItem.error(reports.messages.QdeviceNotDefined()))
    model = cfg.get_quorum_device_model()
    cfg.remove_quorum_device()

    if lib_env.is_corosync_conf_live:
        report_processor = lib_env.report_processor
        # get nodes for communication
        cluster_nodes_names, report_list = get_existing_nodes_names(
            cfg,
            # Pcs is unable to communicate with nodes missing names. It cannot
            # send new corosync.conf to them. That might break the cluster.
            # Hence we error out.
            error_on_missing_name=True,
        )
        if report_processor.report_list(report_list).has_errors:
            raise LibraryError()
        target_list = lib_env.get_node_target_factory().get_target_list(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        )
        # fix quorum options for SBD to work properly
        if sbd.atb_has_to_be_enabled(lib_env.service_manager, cfg):
            lib_env.report_processor.report(
                ReportItem.warning(
                    reports.messages.CorosyncQuorumAtbWillBeEnabledDueToSbd()))
            cfg.set_quorum_options({"auto_tie_breaker": "1"})

        # disable qdevice
        lib_env.report_processor.report(
            ReportItem.info(
                reports.messages.ServiceActionStarted(
                    reports.const.SERVICE_ACTION_DISABLE, "corosync-qdevice")))
        com_cmd_disable = qdevice_com.Disable(lib_env.report_processor,
                                              skip_offline_nodes)
        com_cmd_disable.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd_disable)
        # stop qdevice
        lib_env.report_processor.report(
            ReportItem.info(
                reports.messages.ServiceActionStarted(
                    reports.const.SERVICE_ACTION_STOP, "corosync-qdevice")))
        com_cmd_stop = qdevice_com.Stop(lib_env.report_processor,
                                        skip_offline_nodes)
        com_cmd_stop.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd_stop)
        # handle model specific configuration
        if model == "net":
            lib_env.report_processor.report(
                ReportItem.info(
                    reports.messages.QdeviceCertificateRemovalStarted()))
            com_cmd_client_destroy = qdevice_net_com.ClientDestroy(
                lib_env.report_processor, skip_offline_nodes)
            com_cmd_client_destroy.set_targets(target_list)
            run_and_raise(lib_env.get_node_communicator(),
                          com_cmd_client_destroy)

    lib_env.push_corosync_conf(cfg, skip_offline_nodes)
Example #12
def full_cluster_status_plaintext(
    env: LibraryEnvironment,
    hide_inactive_resources: bool = False,
    verbose: bool = False,
) -> str:
    """
    Return full cluster status as plaintext

    env -- LibraryEnvironment
    hide_inactive_resources -- if True, do not display non-running resources
    verbose -- if True, display more info
    """
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-locals

    # validation
    if not env.is_cib_live and env.is_corosync_conf_live:
        raise LibraryError(
            reports.live_environment_not_consistent(
                [file_type_codes.CIB],
                [file_type_codes.COROSYNC_CONF],
            ))
    if env.is_cib_live and not env.is_corosync_conf_live:
        raise LibraryError(
            reports.live_environment_not_consistent(
                [file_type_codes.COROSYNC_CONF],
                [file_type_codes.CIB],
            ))

    # initialization
    runner = env.cmd_runner()
    report_processor = SimpleReportProcessor(env.report_processor)
    live = env.is_cib_live and env.is_corosync_conf_live
    is_sbd_running = False

    # load status, cib, corosync.conf
    status_text, warning_list = get_cluster_status_text(
        runner, hide_inactive_resources, verbose)
    corosync_conf = env.get_corosync_conf()
    cib = env.get_cib()
    if verbose:
        ticket_status_text, ticket_status_stderr, ticket_status_retval = (
            get_ticket_status_text(runner))
    # get extra info if live
    if live:
        try:
            is_sbd_running = is_service_running(runner, get_sbd_service_name())
        except LibraryError:
            pass
        local_services_status = _get_local_services_status(runner)
        if verbose:
            node_name_list, node_names_report_list = get_existing_nodes_names(
                corosync_conf)
            report_processor.report_list(node_names_report_list)
            node_reachability = _get_node_reachability(
                env.get_node_target_factory(),
                env.get_node_communicator(),
                report_processor,
                node_name_list,
            )

    # check stonith configuration
    warning_list = list(warning_list)
    warning_list.extend(_stonith_warnings(cib, is_sbd_running))

    # put it all together
    if report_processor.has_errors:
        raise LibraryError()

    parts = []
    parts.append(f"Cluster name: {corosync_conf.get_cluster_name()}")
    if warning_list:
        parts.extend(["", "WARNINGS:"] + warning_list + [""])
    parts.append(status_text)
    if verbose:
        parts.extend(["", "Tickets:"])
        if ticket_status_retval != 0:
            ticket_warning_parts = [
                "WARNING: Unable to get information about tickets"
            ]
            if ticket_status_stderr:
                ticket_warning_parts.extend(
                    indent(ticket_status_stderr.splitlines()))
            parts.extend(indent(ticket_warning_parts))
        else:
            parts.extend(indent(ticket_status_text.splitlines()))
    if live:
        if verbose:
            parts.extend(["", "PCSD Status:"])
            parts.extend(
                indent(
                    _format_node_reachability(node_name_list,
                                              node_reachability)))
        parts.extend(["", "Daemon Status:"])
        parts.extend(
            indent(_format_local_services_status(local_services_status)))
    return "\n".join(parts)
Example #13
def set_recovery_site(env: LibraryEnvironment, node_name: str) -> None:
    """
    Set up disaster recovery with the local cluster being the primary site

    env
    node_name -- a known host from the recovery site
    """
    # pylint: disable=too-many-locals
    if env.ghost_file_codes:
        raise LibraryError(
            ReportItem.error(
                reports.messages.LiveEnvironmentRequired(
                    env.ghost_file_codes)))
    report_processor = env.report_processor
    dr_env = env.get_dr_env()
    if dr_env.config.raw_file.exists():
        report_processor.report(
            ReportItem.error(reports.messages.DrConfigAlreadyExist()))
    target_factory = env.get_node_target_factory()

    local_nodes, report_list = get_existing_nodes_names(
        env.get_corosync_conf(), error_on_missing_name=True)
    report_processor.report_list(report_list)

    if node_name in local_nodes:
        report_processor.report(
            ReportItem.error(reports.messages.NodeInLocalCluster(node_name)))

    report_list, local_targets = target_factory.get_target_list_with_reports(
        local_nodes, allow_skip=False, report_none_host_found=False)
    report_processor.report_list(report_list)

    report_list, remote_targets = target_factory.get_target_list_with_reports(
        [node_name], allow_skip=False, report_none_host_found=False)
    report_processor.report_list(report_list)

    if report_processor.has_errors:
        raise LibraryError()

    # TODO The new file framework doesn't support network communication yet.
    com_cmd = GetCorosyncConf(env.report_processor)
    com_cmd.set_targets(remote_targets)
    corosync_conf_instance = FileInstance.for_corosync_conf()
    try:
        remote_cluster_nodes, report_list = get_existing_nodes_names(
            cast(
                CorosyncConfigFacade,
                corosync_conf_instance.raw_to_facade(
                    run_and_raise(env.get_node_communicator(),
                                  com_cmd).encode("utf-8")),
            ),
            error_on_missing_name=True,
        )
    except ParserErrorException as e:
        report_processor.report_list(
            corosync_conf_instance.toolbox.parser.exception_to_report_list(
                e,
                file_type_codes.COROSYNC_CONF,
                None,
                force_code=None,
                is_forced_or_warning=False,
            ))

    if report_processor.report_list(report_list).has_errors:
        raise LibraryError()

    # ensure we have tokens for all nodes of remote cluster
    report_list, remote_targets = target_factory.get_target_list_with_reports(
        remote_cluster_nodes, allow_skip=False, report_none_host_found=False)
    if report_processor.report_list(report_list).has_errors:
        raise LibraryError()
    dr_config_exporter = get_file_toolbox(
        file_type_codes.PCS_DR_CONFIG).exporter
    # create dr config for remote cluster
    remote_dr_cfg = dr_env.create_facade(DrRole.RECOVERY)
    remote_dr_cfg.add_site(DrRole.PRIMARY, local_nodes)
    # send the config to all nodes of the remote cluster
    distribute_file_cmd = DistributeFilesWithoutForces(
        env.report_processor,
        node_communication_format.pcs_dr_config_file(
            dr_config_exporter.export(remote_dr_cfg.config)),
    )
    distribute_file_cmd.set_targets(remote_targets)
    run_and_raise(env.get_node_communicator(), distribute_file_cmd)
    # create new dr config, with local cluster as primary site
    local_dr_cfg = dr_env.create_facade(DrRole.PRIMARY)
    local_dr_cfg.add_site(DrRole.RECOVERY, remote_cluster_nodes)
    distribute_file_cmd = DistributeFilesWithoutForces(
        env.report_processor,
        node_communication_format.pcs_dr_config_file(
            dr_config_exporter.export(local_dr_cfg.config)),
    )
    distribute_file_cmd.set_targets(local_targets)
    run_and_raise(env.get_node_communicator(), distribute_file_cmd)
Example #14
def location_prefer(lib, argv, modifiers):
    """
    Options:
      * --force - allow unknown options, allow constraint for any resource type
      * -f - CIB file
    """
    modifiers.ensure_only_supported("--force", "-f")
    rsc = argv.pop(0)
    prefer_option = argv.pop(0)

    dummy_rsc_type, rsc_value = parse_args.parse_typed_arg(
        rsc, [RESOURCE_TYPE_RESOURCE, RESOURCE_TYPE_REGEXP],
        RESOURCE_TYPE_RESOURCE)

    if prefer_option == "prefers":
        prefer = True
    elif prefer_option == "avoids":
        prefer = False
    else:
        raise CmdLineInputError()

    skip_node_check = False
    if modifiers.is_specified("-f") or modifiers.get("--force"):
        skip_node_check = True
        warn(LOCATION_NODE_VALIDATION_SKIP_MSG)
    else:
        lib_env = utils.get_lib_env()
        existing_nodes, report_list = get_existing_nodes_names(
            corosync_conf=lib_env.get_corosync_conf(),
            cib=lib_env.get_cib(),
        )
        if report_list:
            process_library_reports(report_list)

    report_list = []
    parameters_list = []
    for nodeconf in argv:
        nodeconf_a = nodeconf.split("=", 1)
        node = nodeconf_a[0]
        if not skip_node_check:
            report_list += _verify_node_name(node, existing_nodes)
        if len(nodeconf_a) == 1:
            if prefer:
                score = "INFINITY"
            else:
                score = "-INFINITY"
        else:
            score = nodeconf_a[1]
            _verify_score(score)
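            # "avoids" is expressed as a negative preference, so flip the
            # sign of the given score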
            if not prefer:
                if score[0] == "-":
                    score = score[1:]
                else:
                    score = "-" + score

        parameters_list.append([
            sanitize_id(f"location-{rsc_value}-{node}-{score}"), rsc, node,
            score
        ])

    if report_list:
        process_library_reports(report_list)

    modifiers = modifiers.get_subset("--force", "-f")

    for parameters in parameters_list:
        location_add(lib,
                     parameters,
                     modifiers,
                     skip_score_and_node_check=True)
Example #15
def full_cluster_status_plaintext(
    env: LibraryEnvironment,
    hide_inactive_resources: bool = False,
    verbose: bool = False,
) -> str:
    """
    Return full cluster status as plaintext

    env -- LibraryEnvironment
    hide_inactive_resources -- if True, do not display non-running resources
    verbose -- if True, display more info
    """
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-statements

    # validation
    if not env.is_cib_live and env.is_corosync_conf_live:
        raise LibraryError(
            ReportItem.error(
                reports.messages.LiveEnvironmentNotConsistent(
                    [file_type_codes.CIB],
                    [file_type_codes.COROSYNC_CONF],
                )))
    if env.is_cib_live and not env.is_corosync_conf_live:
        raise LibraryError(
            ReportItem.error(
                reports.messages.LiveEnvironmentNotConsistent(
                    [file_type_codes.COROSYNC_CONF],
                    [file_type_codes.CIB],
                )))

    # initialization
    runner = env.cmd_runner()
    report_processor = env.report_processor
    live = env.is_cib_live and env.is_corosync_conf_live
    is_sbd_running = False

    # load status, cib, corosync.conf
    status_text, warning_list = get_cluster_status_text(
        runner, hide_inactive_resources, verbose)
    corosync_conf = None
    # If we are live on a remote node, we have no corosync.conf.
    # TODO Use the new file framework so the path is not exposed.
    if not live or os.path.exists(settings.corosync_conf_file):
        corosync_conf = env.get_corosync_conf()
    cib = env.get_cib()
    if verbose:
        (
            ticket_status_text,
            ticket_status_stderr,
            ticket_status_retval,
        ) = get_ticket_status_text(runner)
    # get extra info if live
    if live:
        try:
            is_sbd_running = is_service_running(runner, get_sbd_service_name())
        except LibraryError:
            pass
        local_services_status = _get_local_services_status(runner)
        if verbose and corosync_conf:
            node_name_list, node_names_report_list = get_existing_nodes_names(
                corosync_conf)
            report_processor.report_list(node_names_report_list)
            node_reachability = _get_node_reachability(
                env.get_node_target_factory(),
                env.get_node_communicator(),
                report_processor,
                node_name_list,
            )

    # check stonith configuration
    warning_list = list(warning_list)
    warning_list.extend(_stonith_warnings(cib, is_sbd_running))

    # put it all together
    if report_processor.has_errors:
        raise LibraryError()

    cluster_name = (corosync_conf.get_cluster_name() if corosync_conf else
                    nvpair.get_value("cluster_property_set",
                                     get_crm_config(cib), "cluster-name", ""))
    parts = []
    parts.append(f"Cluster name: {cluster_name}")
    if warning_list:
        parts.extend(["", "WARNINGS:"] + warning_list + [""])
    parts.append(status_text)
    if verbose:
        parts.extend(["", "Tickets:"])
        if ticket_status_retval != 0:
            ticket_warning_parts = [
                "WARNING: Unable to get information about tickets"
            ]
            if ticket_status_stderr:
                ticket_warning_parts.extend(
                    indent(ticket_status_stderr.splitlines()))
            parts.extend(indent(ticket_warning_parts))
        else:
            parts.extend(indent(ticket_status_text.splitlines()))
    if live:
        if verbose and corosync_conf:
            parts.extend(["", "PCSD Status:"])
            parts.extend(
                indent(
                    _format_node_reachability(node_name_list,
                                              node_reachability)))
        parts.extend(["", "Daemon Status:"])
        parts.extend(
            indent(_format_local_services_status(local_services_status)))
    return "\n".join(parts)
Example #16
def update_scsi_devices(
    env: LibraryEnvironment,
    stonith_id: str,
    set_device_list: Iterable[str],
    force_flags: Container[reports.types.ForceCode] = (),
) -> None:
    """
    Update scsi fencing devices without restart and affecting other resources.

    env -- provides all for communication with externals
    stonith_id -- id of stonith resource
    set_device_list -- paths to the scsi devices that would be set for stonith
        resource
    force_flags -- list of flags codes
    """
    if not is_getting_resource_digest_supported(env.cmd_runner()):
        raise LibraryError(
            ReportItem.error(
                reports.messages.StonithRestartlessUpdateOfScsiDevicesNotSupported()
            )
        )
    cib = env.get_cib()
    if not set_device_list:
        env.report_processor.report(
            ReportItem.error(
                reports.messages.InvalidOptionValue(
                    "devices", "", None, cannot_be_empty=True
                )
            )
        )
    (
        stonith_el,
        report_list,
    ) = stonith.validate_stonith_restartless_update(cib, stonith_id)
    if env.report_processor.report_list(report_list).has_errors:
        raise LibraryError()
    # for mypy, this should not happen because an exception would be raised
    if stonith_el is None:
        raise AssertionError("stonith element is None")

    stonith.update_scsi_devices_without_restart(
        env.cmd_runner(),
        env.get_cluster_state(),
        stonith_el,
        IdProvider(cib),
        set_device_list,
    )

    # Unfencing
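    # Presumably the updated device list has to be unfenced on all online
    # cluster nodes so the nodes keep access to the shared SCSI devices.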
    cluster_nodes_names, nodes_report_list = get_existing_nodes_names(
        env.get_corosync_conf(),
        error_on_missing_name=True,
    )
    env.report_processor.report_list(nodes_report_list)
    (
        target_report_list,
        cluster_nodes_target_list,
    ) = env.get_node_target_factory().get_target_list_with_reports(
        cluster_nodes_names,
        allow_skip=False,
    )
    env.report_processor.report_list(target_report_list)
    if env.report_processor.has_errors:
        raise LibraryError()
    com_cmd: AllSameDataMixin = GetCorosyncOnlineTargets(
        env.report_processor,
        skip_offline_targets=reports.codes.SKIP_OFFLINE_NODES in force_flags,
    )
    com_cmd.set_targets(cluster_nodes_target_list)
    online_corosync_target_list = run_and_raise(
        env.get_node_communicator(), com_cmd
    )
    com_cmd = Unfence(env.report_processor, sorted(set_device_list))
    com_cmd.set_targets(online_corosync_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    env.push_cib()
Example #17
def config_sync(
    env: LibraryEnvironment, instance_name=None, skip_offline_nodes=False,
):
    """
    Send the specified local booth configuration to all nodes in the local
    cluster.

    env
    string instance_name -- booth instance name
    skip_offline_nodes -- if True, offline nodes will be skipped
    """
    report_processor = env.report_processor
    booth_env = env.get_booth_env(instance_name)
    if not env.is_cib_live:
        raise LibraryError(
            ReportItem.error(
                reports.messages.LiveEnvironmentRequired([file_type_codes.CIB])
            )
        )

    cluster_nodes_names, report_list = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    if not cluster_nodes_names:
        report_list.append(
            ReportItem.error(reports.messages.CorosyncConfigNoNodesDefined())
        )
    report_processor.report_list(report_list)

    try:
        booth_conf_data = booth_env.config.read_raw()
        booth_conf = booth_env.config.raw_to_facade(booth_conf_data)
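        # With a ghost (in-memory) booth config file there is nothing on
        # disk to read; presumably the authfile content comes from the
        # environment (booth_env.key) instead.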
        if isinstance(booth_env.config.raw_file, GhostFile):
            authfile_data = booth_env.key.read_raw()
            authfile_path = booth_conf.get_authfile()
            authfile_name = (
                os.path.basename(authfile_path) if authfile_path else None
            )
        else:
            (
                authfile_name,
                authfile_data,
                authfile_report_list,
            ) = config_files.get_authfile_name_and_data(booth_conf)
            report_processor.report_list(authfile_report_list)
    except RawFileError as e:
        report_processor.report(raw_file_error_report(e))
    except ParserErrorException as e:
        report_processor.report_list(
            booth_env.config.parser_exception_to_report_list(e)
        )
    if report_processor.has_errors:
        raise LibraryError()

    com_cmd = BoothSendConfig(
        env.report_processor,
        booth_env.instance_name,
        booth_conf_data,
        authfile=authfile_name,
        authfile_data=authfile_data,
        skip_offline_targets=skip_offline_nodes,
    )
    com_cmd.set_targets(
        env.get_node_target_factory().get_target_list(
            cluster_nodes_names, skip_non_existing=skip_offline_nodes,
        )
    )
    run_and_raise(env.get_node_communicator(), com_cmd)
Example #18
def synchronize_ssl_certificate(env: LibraryEnvironment, skip_offline=False):
    """
    Send the local pcsd SSL cert and key to all full nodes in the local cluster.

    Consider the case where the pcs Web UI is accessed via an IP running as a
    resource in the cluster. When the IP is moved, the user's browser connects
    to the new node and we want it to get the same certificate to make the
    transition a seamless experience (otherwise the browser displays a warning
    that the certificate has changed).
    Using the pcsd Web UI on remote and guest nodes is not supported (pcs/pcsd
    depends on the corosync.conf file being present on the local node) so we
    send the cert only to corosync (== full stack) nodes.
    """
    report_processor = env.report_processor
    target_factory = env.get_node_target_factory()
    cluster_nodes_names, report_list = get_existing_nodes_names(
        env.get_corosync_conf())
    if not cluster_nodes_names:
        report_list.append(
            ReportItem.error(reports.messages.CorosyncConfigNoNodesDefined()))
    report_processor.report_list(report_list)

    try:
        with open(settings.pcsd_cert_location, "r") as file:
            ssl_cert = file.read()
    except EnvironmentError as e:
        report_processor.report(
            ReportItem.error(
                reports.messages.FileIoError(
                    file_type_codes.PCSD_SSL_CERT,
                    RawFileError.ACTION_READ,
                    format_environment_error(e),
                    file_path=settings.pcsd_cert_location,
                )))
    try:
        with open(settings.pcsd_key_location, "r") as file:
            ssl_key = file.read()
    except EnvironmentError as e:
        report_processor.report(
            ReportItem.error(
                reports.messages.FileIoError(
                    file_type_codes.PCSD_SSL_KEY,
                    RawFileError.ACTION_READ,
                    format_environment_error(e),
                    file_path=settings.pcsd_key_location,
                )))

    (
        target_report_list,
        target_list,
    ) = target_factory.get_target_list_with_reports(
        cluster_nodes_names, skip_non_existing=skip_offline)
    report_processor.report_list(target_report_list)

    if report_processor.has_errors:
        raise LibraryError()

    env.report_processor.report(
        ReportItem.info(
            reports.messages.PcsdSslCertAndKeyDistributionStarted(
                sorted([target.label for target in target_list]))))

    com_cmd = SendPcsdSslCertAndKey(env.report_processor, ssl_cert, ssl_key)
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)
Example #19
def location_add(lib, argv, modifiers, skip_score_and_node_check=False):
    """
    Options:
      * --force - allow unknown options, allow constraint for any resource type
      * -f - CIB file
    """
    del lib
    modifiers.ensure_only_supported("--force", "-f")
    if len(argv) < 4:
        raise CmdLineInputError()

    constraint_id = argv.pop(0)
    rsc_type, rsc_value = parse_args.parse_typed_arg(
        argv.pop(0), [RESOURCE_TYPE_RESOURCE, RESOURCE_TYPE_REGEXP],
        RESOURCE_TYPE_RESOURCE)
    node = argv.pop(0)
    score = argv.pop(0)
    options = []
    # For now we only allow setting resource-discovery
    if argv:
        for arg in argv:
            if '=' in arg:
                options.append(arg.split('=', 1))
            else:
                raise CmdLineInputError(f"bad option '{arg}'")
            if (options[-1][0] != "resource-discovery"
                    and not modifiers.get("--force")):
                utils.err("bad option '%s', use --force to override" %
                          options[-1][0])

    # Verify that specified node exists in the cluster and score is valid
    if not skip_score_and_node_check:
        if modifiers.is_specified("-f") or modifiers.get("--force"):
            warn(LOCATION_NODE_VALIDATION_SKIP_MSG)
        else:
            lib_env = utils.get_lib_env()
            existing_nodes, report_list = get_existing_nodes_names(
                corosync_conf=lib_env.get_corosync_conf(),
                cib=lib_env.get_cib(),
            )
            report_list += _verify_node_name(node, existing_nodes)
            if report_list:
                process_library_reports(report_list)
        _verify_score(score)

    id_valid, id_error = utils.validate_xml_id(constraint_id, 'constraint id')
    if not id_valid:
        utils.err(id_error)

    required_version = None
    if [x for x in options if x[0] == "resource-discovery"]:
        required_version = 2, 2, 0
    if rsc_type == RESOURCE_TYPE_REGEXP:
        required_version = 2, 6, 0

    if required_version:
        dom = utils.cluster_upgrade_to_version(required_version)
    else:
        dom = utils.get_cib_dom()

    if rsc_type == RESOURCE_TYPE_RESOURCE:
        rsc_valid, rsc_error, dummy_correct_id = (
            utils.validate_constraint_resource(dom, rsc_value))
        if not rsc_valid:
            utils.err(rsc_error)

    # Check whether an equivalent constraint already exists.
    # If it does, we replace it with the new constraint.
    dummy_dom, constraintsElement = getCurrentConstraints(dom)
    elementsToRemove = []
    # If the id matches, or the rsc & node match, then we replace/remove
    for rsc_loc in constraintsElement.getElementsByTagName('rsc_location'):
        # pylint: disable=too-many-boolean-expressions
        if (rsc_loc.getAttribute("id") == constraint_id
                or (rsc_loc.getAttribute("node") == node and
                    ((RESOURCE_TYPE_RESOURCE == rsc_type
                      and rsc_loc.getAttribute("rsc") == rsc_value) or
                     (RESOURCE_TYPE_REGEXP == rsc_type
                      and rsc_loc.getAttribute("rsc-pattern") == rsc_value)))):
            elementsToRemove.append(rsc_loc)
    for etr in elementsToRemove:
        constraintsElement.removeChild(etr)

    element = dom.createElement("rsc_location")
    element.setAttribute("id", constraint_id)
    if rsc_type == RESOURCE_TYPE_RESOURCE:
        element.setAttribute("rsc", rsc_value)
    elif rsc_type == RESOURCE_TYPE_REGEXP:
        element.setAttribute("rsc-pattern", rsc_value)
    element.setAttribute("node", node)
    element.setAttribute("score", score)
    for option in options:
        element.setAttribute(option[0], option[1])
    constraintsElement.appendChild(element)

    utils.replace_cib_configuration(dom)
Example #20
def enable_sbd(
    lib_env, default_watchdog, watchdog_dict, sbd_options,
    default_device_list=None, node_device_dict=None, allow_unknown_opts=False,
    ignore_offline_nodes=False, no_watchdog_validation=False,
    allow_invalid_option_values=False,
):
    # pylint: disable=too-many-arguments, too-many-locals
    """
    Enable SBD on all nodes in cluster.

    lib_env -- LibraryEnvironment
    default_watchdog -- watchdog for nodes which are not specified in
        watchdog_dict. Uses default value from settings if None.
    watchdog_dict -- dictionary with node names as keys and watchdog path
        as value
    sbd_options -- dictionary in format: <SBD config option>: <value>
    default_device_list -- list of devices for all nodes
    node_device_dict -- dictionary with node names as keys and list of devices
        as value
    allow_unknown_opts -- if True, also accept unknown options
    ignore_offline_nodes -- if True, omit offline nodes
    no_watchdog_validation -- if True, do not validate the existence of a
        watchdog on the nodes
    allow_invalid_option_values -- if True, invalid values of some options
        will be treated as warnings instead of errors
    """
    using_devices = not (
        default_device_list is None and node_device_dict is None
    )
    if default_device_list is None:
        default_device_list = []
    if node_device_dict is None:
        node_device_dict = {}
    if not default_watchdog:
        default_watchdog = settings.sbd_watchdog_default
    sbd_options = {opt.upper(): val for opt, val in sbd_options.items()}

    corosync_conf = lib_env.get_corosync_conf()

    node_list, get_nodes_report_list = get_existing_nodes_names(corosync_conf)
    if not node_list:
        get_nodes_report_list.append(reports.corosync_config_no_nodes_defined())
    target_list = lib_env.get_node_target_factory().get_target_list(
        node_list, skip_non_existing=ignore_offline_nodes,
    )

    full_watchdog_dict = _get_full_target_dict(
        target_list, watchdog_dict, default_watchdog
    )
    full_device_dict = _get_full_target_dict(
        target_list, node_device_dict, default_device_list
    )

    lib_env.report_processor.process_list(
        get_nodes_report_list
        +
        [
            reports.node_not_found(node)
            for node in (
                set(list(watchdog_dict.keys()) + list(node_device_dict.keys()))
                -
                set(node_list)
            )
        ]
        +
        _validate_watchdog_dict(full_watchdog_dict)
        +
        (sbd.validate_nodes_devices(full_device_dict) if using_devices else [])
        +
        _validate_sbd_options(
            sbd_options, allow_unknown_opts, allow_invalid_option_values
        )
    )

    com_cmd = GetOnlineTargets(
        lib_env.report_processor, ignore_offline_targets=ignore_offline_nodes,
    )
    com_cmd.set_targets(target_list)
    online_targets = run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # check if SBD can be enabled
    if no_watchdog_validation:
        lib_env.report_processor.report(
            reports.sbd_watchdog_validation_inactive()
        )
    com_cmd = CheckSbd(lib_env.report_processor)
    for target in online_targets:
        com_cmd.add_request(
            target,
            (
                # Do not send watchdog if validation is turned off. Listing of
                # available watchdogs in pcsd may restart the machine in some
                # corner cases.
                "" if no_watchdog_validation
                else full_watchdog_dict[target.label]
            ),
            full_device_dict[target.label] if using_devices else [],
        )
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # enable ATB if needed
    if not using_devices:
        if sbd.atb_has_to_be_enabled_pre_enable_check(corosync_conf):
            lib_env.report_processor.process(
                reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
            )
            corosync_conf.set_quorum_options({"auto_tie_breaker": "1"})
            lib_env.push_corosync_conf(corosync_conf, ignore_offline_nodes)

    # distribute SBD configuration
    config = sbd.get_default_sbd_config()
    config.update(sbd_options)
    com_cmd = SetSbdConfig(lib_env.report_processor)
    for target in online_targets:
        com_cmd.add_request(
            target,
            sbd.create_sbd_config(
                config,
                target.label,
                full_watchdog_dict[target.label],
                full_device_dict[target.label]
            )
        )
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # remove cluster prop 'stonith_watchdog_timeout'
    com_cmd = RemoveStonithWatchdogTimeout(lib_env.report_processor)
    com_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # enable SBD service on all nodes
    com_cmd = EnableSbdService(lib_env.report_processor)
    com_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    lib_env.report_processor.process(
        reports.cluster_restart_required_to_apply_changes()
    )
Example #21
def add_device(
    lib_env: LibraryEnvironment,
    model,
    model_options,
    generic_options,
    heuristics_options,
    force_model=False,
    force_options=False,
    skip_offline_nodes=False,
):
    # pylint: disable=too-many-locals
    """
    Add a quorum device to a cluster, distribute and reload configs if live

    string model -- quorum device model
    dict model_options -- model specific options
    dict generic_options -- generic quorum device options
    dict heuristics_options -- heuristics options
    bool force_model -- continue even if the model is not valid
    bool force_options -- continue even if options are not valid
    bool skip_offline_nodes -- continue even if not all nodes are accessible
    """
    cfg = lib_env.get_corosync_conf()
    if cfg.has_quorum_device():
        raise LibraryError(
            ReportItem.error(reports.messages.QdeviceAlreadyDefined()))

    report_processor = lib_env.report_processor
    report_processor.report_list(
        corosync_conf_validators.add_quorum_device(
            model,
            model_options,
            generic_options,
            heuristics_options,
            [node.nodeid for node in cfg.get_nodes()],
            force_model=force_model,
            force_options=force_options,
        ))

    if lib_env.is_corosync_conf_live:
        cluster_nodes_names, report_list = get_existing_nodes_names(
            cfg,
            # Pcs is unable to communicate with nodes missing names. It cannot
            # send new corosync.conf to them. That might break the cluster.
            # Hence we error out.
            error_on_missing_name=True,
        )
        report_processor.report_list(report_list)

    if report_processor.has_errors:
        raise LibraryError()

    cfg.add_quorum_device(
        model,
        model_options,
        generic_options,
        heuristics_options,
    )
    if cfg.is_quorum_device_heuristics_enabled_with_no_exec():
        lib_env.report_processor.report(
            ReportItem.warning(
                reports.messages.CorosyncQuorumHeuristicsEnabledWithNoExec()))

    # First setup certificates for qdevice, then send corosync.conf to nodes.
    # If anything fails, nodes will not have corosync.conf with qdevice in it,
    # so there is no effect on the cluster.
    if lib_env.is_corosync_conf_live:
        target_factory = lib_env.get_node_target_factory()
        target_list = target_factory.get_target_list(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        )
        # Do model specific configuration.
        # If the model is not known to pcs and was forced, do not configure
        # anything else than corosync.conf, as we do not know what to do
        # anyway.
        if model == "net":
            qdevice_net.set_up_client_certificates(
                lib_env.cmd_runner(),
                lib_env.report_processor,
                lib_env.communicator_factory,
                # We are sure the "host" key is there, it has been validated
                # above.
                target_factory.get_target_from_hostname(model_options["host"]),
                cfg.get_cluster_name(),
                target_list,
                skip_offline_nodes,
            )

        lib_env.report_processor.report(
            ReportItem.info(
                reports.messages.ServiceActionStarted(
                    reports.const.SERVICE_ACTION_ENABLE, "corosync-qdevice")))
        com_cmd = qdevice_com.Enable(lib_env.report_processor,
                                     skip_offline_nodes)
        com_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # everything set up, it's safe to tell the nodes to use qdevice
    lib_env.push_corosync_conf(cfg, skip_offline_nodes)

    # Now, when corosync.conf has been reloaded, we can start qdevice service.
    if lib_env.is_corosync_conf_live:
        lib_env.report_processor.report(
            ReportItem.info(
                reports.messages.ServiceActionStarted(
                    reports.const.SERVICE_ACTION_START, "corosync-qdevice")))
        com_cmd_start = qdevice_com.Start(lib_env.report_processor,
                                          skip_offline_nodes)
        com_cmd_start.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), com_cmd_start)
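
A minimal usage sketch for the command above, assuming lib_env is a live LibraryEnvironment; the qnetd host name is made up, and ffsplit is one of the algorithms corosync-qnetd supports:

# Hypothetical call adding a "net" model quorum device; host and
# algorithm values are illustrative, lib_env construction is assumed.
add_device(
    lib_env,
    model="net",
    model_options={"host": "qnetd.example.com", "algorithm": "ffsplit"},
    generic_options={},
    heuristics_options={},
)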
Example #22
def config_restore_remote(infile_name, infile_obj):
    """
    Commandline options:
      * --request-timeout - timeout for HTTP requests
    """
    extracted = {
        "version.txt": "",
        "corosync.conf": "",
    }
    try:
        tarball = tarfile.open(infile_name, "r|*", infile_obj)
        while True:
            # next(tarball) does not work in python2.6
            tar_member_info = tarball.next()
            if tar_member_info is None:
                break
            if tar_member_info.name in extracted:
                tar_member = tarball.extractfile(tar_member_info)
                extracted[tar_member_info.name] = tar_member.read()
                tar_member.close()
        tarball.close()
    except (tarfile.TarError, EnvironmentError) as e:
        utils.err("unable to read the tarball: %s" % e)

    config_backup_check_version(extracted["version.txt"])

    node_list, report_list = get_existing_nodes_names(
        utils.get_corosync_conf_facade(
            conf_text=extracted["corosync.conf"].decode("utf-8")))
    if report_list:
        process_library_reports(report_list)
    if not node_list:
        utils.err("no nodes found in the tarball")

    err_msgs = []
    for node in node_list:
        try:
            retval, output = utils.checkStatus(node)
            if retval != 0:
                err_msgs.append(output)
                continue
            _status = json.loads(output)
            if (_status["corosync"] or _status["pacemaker"] or
                    # not supported by older pcsd, do not fail if not present
                    _status.get("pacemaker_remote", False)):
                err_msgs.append(
                    "Cluster is currently running on node %s. You need to stop "
                    "the cluster in order to restore the configuration." %
                    node)
                continue
        except (ValueError, NameError, LookupError):
            err_msgs.append("unable to determine status of the node %s" % node)
    if err_msgs:
        for msg in err_msgs:
            utils.err(msg, False)
        sys.exit(1)

    # Temporarily disable config files syncing thread in pcsd so it will not
    # rewrite restored files. 10 minutes should be enough time to restore.
    # If node returns HTTP 404 it does not support config syncing at all.
    for node in node_list:
        retval, output = utils.pauseConfigSyncing(node, 10 * 60)
        if not (retval == 0 or "(HTTP error: 404)" in output):
            utils.err(output)

    if infile_obj:
        infile_obj.seek(0)
        tarball_data = infile_obj.read()
    else:
        with open(infile_name, "rb") as tarball:
            tarball_data = tarball.read()

    error_list = []
    for node in node_list:
        retval, error = utils.restoreConfig(node, tarball_data)
        if retval != 0:
            error_list.append(error)
    if error_list:
        utils.err("unable to restore all nodes\n" + "\n".join(error_list))
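
The explicit tarball.next()/None loop above exists only as a python2.6 workaround; on any Python 3 a TarFile is directly iterable, even in stream mode. A minimal sketch of the same member scan (the file name is illustrative):

import tarfile

# Equivalent scan using plain iteration; in stream mode ("r|*") members
# must be read in order, which this loop does naturally.
wanted = {"version.txt": b"", "corosync.conf": b""}
with tarfile.open("backup.tar.bz2", "r|*") as tarball:
    for member in tarball:
        if member.name in wanted:
            extracted_file = tarball.extractfile(member)
            if extracted_file is not None:
                wanted[member.name] = extracted_file.read()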
Example #23
def config_restore_remote(infile_name, infile_obj):
    """
    Commandline options:
      * --request-timeout - timeout for HTTP requests
    """
    extracted = {
        "version.txt": "",
        "corosync.conf": "",
    }
    try:
        tarball = tarfile.open(infile_name, "r|*", infile_obj)
        while True:
            # next(tarball) does not work in python2.6
            tar_member_info = tarball.next()
            if tar_member_info is None:
                break
            if tar_member_info.name in extracted:
                tar_member = tarball.extractfile(tar_member_info)
                extracted[tar_member_info.name] = tar_member.read()
                tar_member.close()
        tarball.close()
    except (tarfile.TarError, EnvironmentError) as e:
        utils.err("unable to read the tarball: %s" % e)

    config_backup_check_version(extracted["version.txt"])

    node_list, report_list = get_existing_nodes_names(
        utils.get_corosync_conf_facade(
            conf_text=extracted["corosync.conf"].decode("utf-8")
        )
    )
    if report_list:
        utils.process_library_reports(report_list)
    if not node_list:
        utils.err("no nodes found in the tarball")

    err_msgs = []
    for node in node_list:
        try:
            retval, output = utils.checkStatus(node)
            if retval != 0:
                err_msgs.append(output)
                continue
            _status = json.loads(output)
            if (
                _status["corosync"]
                or
                _status["pacemaker"]
                or
                # not supported by older pcsd, do not fail if not present
                _status.get("pacemaker_remote", False)
            ):
                err_msgs.append(
                    "Cluster is currently running on node %s. You need to stop "
                        "the cluster in order to restore the configuration."
                    % node
                )
                continue
        except (ValueError, NameError, LookupError):
            err_msgs.append("unable to determine status of the node %s" % node)
    if err_msgs:
        for msg in err_msgs:
            utils.err(msg, False)
        sys.exit(1)

    # Temporarily disable config files syncing thread in pcsd so it will not
    # rewrite restored files. 10 minutes should be enough time to restore.
    # If node returns HTTP 404 it does not support config syncing at all.
    for node in node_list:
        retval, output = utils.pauseConfigSyncing(node, 10 * 60)
        if not (retval == 0 or "(HTTP error: 404)" in output):
            utils.err(output)

    if infile_obj:
        infile_obj.seek(0)
        tarball_data = infile_obj.read()
    else:
        with open(infile_name, "rb") as tarball:
            tarball_data = tarball.read()

    error_list = []
    for node in node_list:
        retval, error = utils.restoreConfig(node, tarball_data)
        if retval != 0:
            error_list.append(error)
    if error_list:
        utils.err("unable to restore all nodes\n" + "\n".join(error_list))
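
Both variants of this function rely on the same guard: refuse to restore while corosync, pacemaker, or pacemaker_remote is reported active on any node. A standalone sketch of that predicate (the helper name is hypothetical; the keys mirror the checks above):

import json

def cluster_running_on_node(status_output):
    # status_output is the JSON text returned by the node status call;
    # pacemaker_remote may be missing on older pcsd, so default to False.
    status = json.loads(status_output)
    return bool(
        status["corosync"]
        or status["pacemaker"]
        or status.get("pacemaker_remote", False)
    )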
Example #24
def set_recovery_site(env: LibraryEnvironment, node_name: str) -> None:
    """
    Set up disaster recovery with the local cluster being the primary site

    env
    node_name -- a known host from the recovery site
    """
    if env.ghost_file_codes:
        raise LibraryError(
            ReportItem.error(
                reports.messages.LiveEnvironmentRequired(
                    env.ghost_file_codes)))
    report_processor = env.report_processor
    dr_env = env.get_dr_env()
    if dr_env.config.raw_file.exists():
        report_processor.report(
            ReportItem.error(reports.messages.DrConfigAlreadyExist()))
    target_factory = env.get_node_target_factory()

    local_nodes, report_list = get_existing_nodes_names(
        env.get_corosync_conf(), error_on_missing_name=True)
    report_processor.report_list(report_list)

    if node_name in local_nodes:
        report_processor.report(
            ReportItem.error(reports.messages.NodeInLocalCluster(node_name)))

    report_list, local_targets = target_factory.get_target_list_with_reports(
        local_nodes, allow_skip=False, report_none_host_found=False)
    report_processor.report_list(report_list)

    report_list, remote_targets = target_factory.get_target_list_with_reports(
        [node_name], allow_skip=False, report_none_host_found=False)
    report_processor.report_list(report_list)

    if report_processor.has_errors:
        raise LibraryError()

    com_cmd = GetCorosyncConf(env.report_processor)
    com_cmd.set_targets(remote_targets)
    remote_cluster_nodes, report_list = get_existing_nodes_names(
        CorosyncConfigFacade.from_string(
            run_and_raise(env.get_node_communicator(), com_cmd)),
        error_on_missing_name=True,
    )
    if report_processor.report_list(report_list).has_errors:
        raise LibraryError()

    # ensure we have tokens for all nodes of remote cluster
    report_list, remote_targets = target_factory.get_target_list_with_reports(
        remote_cluster_nodes, allow_skip=False, report_none_host_found=False)
    if report_processor.report_list(report_list).has_errors:
        raise LibraryError()
    dr_config_exporter = get_file_toolbox(
        file_type_codes.PCS_DR_CONFIG).exporter
    # create dr config for remote cluster
    remote_dr_cfg = dr_env.create_facade(DrRole.RECOVERY)
    remote_dr_cfg.add_site(DrRole.PRIMARY, local_nodes)
    # send config to all node of remote cluster
    distribute_file_cmd = DistributeFilesWithoutForces(
        env.report_processor,
        node_communication_format.pcs_dr_config_file(
            dr_config_exporter.export(remote_dr_cfg.config)),
    )
    distribute_file_cmd.set_targets(remote_targets)
    run_and_raise(env.get_node_communicator(), distribute_file_cmd)
    # create new dr config, with local cluster as primary site
    local_dr_cfg = dr_env.create_facade(DrRole.PRIMARY)
    local_dr_cfg.add_site(DrRole.RECOVERY, remote_cluster_nodes)
    distribute_file_cmd = DistributeFilesWithoutForces(
        env.report_processor,
        node_communication_format.pcs_dr_config_file(
            dr_config_exporter.export(local_dr_cfg.config)),
    )
    distribute_file_cmd.set_targets(local_targets)
    run_and_raise(env.get_node_communicator(), distribute_file_cmd)
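
A minimal usage sketch, assuming env is a live LibraryEnvironment and the node name refers to a known host in the recovery-site cluster:

# Hypothetical call; "dr-node-1" is an illustrative known-host name.
set_recovery_site(env, node_name="dr-node-1")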
Example #25
File: sbd.py Project: nrwahl2/pcs
def enable_sbd(
    lib_env,
    default_watchdog,
    watchdog_dict,
    sbd_options,
    default_device_list=None,
    node_device_dict=None,
    allow_unknown_opts=False,
    ignore_offline_nodes=False,
    no_watchdog_validation=False,
    allow_invalid_option_values=False,
):
    # pylint: disable=too-many-arguments, too-many-locals
    """
    Enable SBD on all nodes in cluster.

    lib_env -- LibraryEnvironment
    default_watchdog -- watchdog for nodes which are not specified in
        watchdog_dict. Uses default value from settings if None.
    watchdog_dict -- dictionary with node names as keys and watchdog path
        as value
    sbd_options -- dictionary in format: <SBD config option>: <value>
    default_device_list -- list of devices for all nodes
    node_device_dict -- dictionary with node names as keys and list of devices
        as value
    allow_unknown_opts -- if True, accept unknown options as well.
    ignore_offline_nodes -- if True, omit offline nodes
    no_watchdog_validation -- if True, do not validate existence of a watchdog
        on the nodes
    allow_invalid_option_values -- if True, invalid values of some options will
        be treated as warnings instead of errors
    """
    using_devices = not (default_device_list is None
                         and node_device_dict is None)
    if default_device_list is None:
        default_device_list = []
    if node_device_dict is None:
        node_device_dict = {}
    if not default_watchdog:
        default_watchdog = settings.sbd_watchdog_default
    sbd_options = {opt.upper(): val for opt, val in sbd_options.items()}

    corosync_conf = lib_env.get_corosync_conf()

    node_list, get_nodes_report_list = get_existing_nodes_names(corosync_conf)
    if not node_list:
        get_nodes_report_list.append(
            ReportItem.error(reports.messages.CorosyncConfigNoNodesDefined()))
    target_list = lib_env.get_node_target_factory().get_target_list(
        node_list,
        skip_non_existing=ignore_offline_nodes,
    )

    full_watchdog_dict = _get_full_target_dict(
        target_list, watchdog_dict, default_watchdog
    )
    full_device_dict = _get_full_target_dict(
        target_list, node_device_dict, default_device_list
    )

    if lib_env.report_processor.report_list(
        get_nodes_report_list
        + [
            ReportItem.error(reports.messages.NodeNotFound(node))
            for node in (
                set(list(watchdog_dict.keys()) + list(node_device_dict.keys()))
                - set(node_list)
            )
        ]
        + _validate_watchdog_dict(full_watchdog_dict)
        + (sbd.validate_nodes_devices(full_device_dict) if using_devices else [])
        + _validate_sbd_options(
            sbd_options, allow_unknown_opts, allow_invalid_option_values
        )
    ).has_errors:
        raise LibraryError()

    com_cmd = GetOnlineTargets(
        lib_env.report_processor,
        ignore_offline_targets=ignore_offline_nodes,
    )
    com_cmd.set_targets(target_list)
    online_targets = run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # check if SBD can be enabled
    if no_watchdog_validation:
        lib_env.report_processor.report(
            ReportItem.warning(
                reports.messages.SbdWatchdogValidationInactive()))
    com_cmd = CheckSbd(lib_env.report_processor)
    for target in online_targets:
        com_cmd.add_request(
            target,
            (
                # Do not send watchdog if validation is turned off. Listing of
                # available watchdogs in pcsd may restart the machine in some
                # corner cases.
                "" if no_watchdog_validation else
                full_watchdog_dict[target.label]),
            full_device_dict[target.label] if using_devices else [],
        )
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # enable ATB if needed
    if not using_devices:
        if sbd.atb_has_to_be_enabled_pre_enable_check(corosync_conf):
            lib_env.report_processor.report(
                ReportItem.warning(
                    reports.messages.CorosyncQuorumAtbWillBeEnabledDueToSbd()))
            corosync_conf.set_quorum_options({"auto_tie_breaker": "1"})
            lib_env.push_corosync_conf(corosync_conf, ignore_offline_nodes)

    # distribute SBD configuration
    config = sbd.get_default_sbd_config()
    config.update(sbd_options)
    com_cmd = SetSbdConfig(lib_env.report_processor)
    for target in online_targets:
        com_cmd.add_request(
            target,
            sbd.create_sbd_config(
                config,
                target.label,
                full_watchdog_dict[target.label],
                full_device_dict[target.label],
            ),
        )
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # remove cluster prop 'stonith_watchdog_timeout'
    com_cmd = RemoveStonithWatchdogTimeout(lib_env.report_processor)
    com_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    # enable SBD service on all nodes
    com_cmd = EnableSbdService(lib_env.report_processor)
    com_cmd.set_targets(online_targets)
    run_and_raise(lib_env.get_node_communicator(), com_cmd)

    lib_env.report_processor.report(
        ReportItem.warning(
            reports.messages.ClusterRestartRequiredToApplyChanges()))
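
The _get_full_target_dict helper used above is not included in this example. Based on how its result is consumed (one value per target label, with a shared default for nodes not listed explicitly), a plausible sketch:

def _get_full_target_dict(target_list, node_specific_dict, default_value):
    # One entry per target label: the node-specific value if provided,
    # otherwise the shared default (a watchdog path or a device list).
    return {
        target.label: node_specific_dict.get(target.label, default_value)
        for target in target_list
    }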