Exemplo n.º 1
0
def synchronize_ssl_certificate(env, skip_offline=False):
    """
    Send the local pcsd SSL cert and key to all full nodes in the local cluster.

    Consider the pcs Web UI being accessed via an IP address which runs as a
    resource in the cluster. When the IP is moved, the user's browser connects
    to the new node and we want it to get the same certificate to make the
    transition a seamless experience (otherwise the browser displays a warning
    that the certificate has changed).
    Using pcsd Web UI on remote and guest nodes is not supported (pcs/pcsd
    depends on the corosync.conf file being present on the local node) so we
    send the cert only to corosync (== full stack) nodes.

    env -- provides the report processor, corosync.conf, the node target
        factory and the node communicator
    skip_offline -- passed as skip_non_existing when the target list is built

    Raises LibraryError when any error has been reported while collecting
    node names, reading the cert / key files or building the target list.
    """
    reporter = SimpleReportProcessor(env.report_processor)
    node_target_factory = env.get_node_target_factory()
    node_names, node_names_reports = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    if not node_names:
        node_names_reports.append(reports.corosync_config_no_nodes_defined())
    reporter.report_list(node_names_reports)

    # Both files are read even if the first read fails, so every read problem
    # is reported before the function bails out below.
    cert_path = settings.pcsd_cert_location
    try:
        with open(cert_path, "r") as cert_file:
            ssl_cert = cert_file.read()
    except EnvironmentError as error:
        reporter.report(
            reports.file_io_error(
                env_file_role_codes.PCSD_SSL_CERT,
                file_path=cert_path,
                reason=format_environment_error(error),
                operation="read",
            )
        )

    key_path = settings.pcsd_key_location
    try:
        with open(key_path, "r") as key_file:
            ssl_key = key_file.read()
    except EnvironmentError as error:
        reporter.report(
            reports.file_io_error(
                env_file_role_codes.PCSD_SSL_KEY,
                file_path=key_path,
                reason=format_environment_error(error),
                operation="read",
            )
        )

    target_reports, targets = node_target_factory.get_target_list_with_reports(
        node_names, skip_non_existing=skip_offline
    )
    reporter.report_list(target_reports)

    # Single exit point for everything reported so far.
    if reporter.has_errors:
        raise LibraryError()

    target_labels = [target.label for target in targets]
    env.report_processor.process(
        reports.pcsd_ssl_cert_and_key_distribution_started(target_labels)
    )

    send_command = SendPcsdSslCertAndKey(
        env.report_processor, ssl_cert, ssl_key
    )
    send_command.set_targets(targets)
    run_and_raise(env.get_node_communicator(), send_command)
Exemplo n.º 2
0
def send_all_config_to_node(
    communicator,
    reporter,
    target_list,
    rewrite_existing=False,
    skip_wrong_config=False
):
    """
    Send all booth configs from default booth config directory and their
    authfiles to specified nodes.

    communicator -- NodeCommunicator
    reporter -- report processor
    target_list list -- list of targets to which configs should be delivered
    rewrite_existing -- if True rewrite existing file
    skip_wrong_config -- if True skip local configs that are unreadable

    Returns None without sending anything when no local config was read.
    Raises LibraryError when an error has been reported by the time the
    files have been sent.
    """
    wrapped_reporter = SimpleReportProcessor(reporter)
    local_configs = booth_conf.read_configs(reporter, skip_wrong_config)
    if not local_configs:
        return

    wrapped_reporter.report(reports.booth_config_distribution_started())

    files_to_send = []
    for config_name, config_content in sorted(local_configs.items()):
        try:
            parsed_config = config_parser.parse(config_content)
            authfile_path = config_structure.get_authfile(parsed_config)
            # The config itself is queued before its authfile is looked at.
            files_to_send.append({
                "name": config_name,
                "data": config_content,
                "is_authfile": False
            })
            if not authfile_path:
                continue
            authfile_content = booth_conf.read_authfile(
                reporter, authfile_path
            )
            if authfile_content:
                # The authfile is sent base64-encoded as text.
                encoded_authfile = base64.b64encode(authfile_content)
                files_to_send.append({
                    "name": os.path.basename(authfile_path),
                    "data": encoded_authfile.decode("utf-8"),
                    "is_authfile": True
                })
        except LibraryError:
            wrapped_reporter.report(
                reports.booth_skipping_config(
                    config_name, "unable to parse config"
                )
            )

    save_command = BoothSaveFiles(
        wrapped_reporter, files_to_send, rewrite_existing=rewrite_existing
    )
    save_command.set_targets(target_list)
    run(communicator, save_command)

    if wrapped_reporter.has_errors:
        raise LibraryError()
Exemplo n.º 3
0
def remove_device(lib_env, skip_offline_nodes=False):
    """
    Stop using quorum device, distribute and reload configs if live

    lib_env -- provides corosync.conf, the report processor, the command
        runner and node communication
    skip_offline_nodes -- continue even if not all nodes are accessible

    Raises LibraryError when no quorum device is defined or when collecting
    the node names reported an error.
    """
    corosync_conf = lib_env.get_corosync_conf()
    if not corosync_conf.has_quorum_device():
        raise LibraryError(reports.qdevice_not_defined())
    # The model is read here because it is needed further down, after the
    # device has already been removed from the config object.
    device_model = corosync_conf.get_quorum_device_model()
    corosync_conf.remove_quorum_device()

    def run_on_targets(started_report, command_class, targets):
        # Announce the step, then run the command on the given targets.
        lib_env.report_processor.process(started_report)
        command = command_class(lib_env.report_processor, skip_offline_nodes)
        command.set_targets(targets)
        run_and_raise(lib_env.get_node_communicator(), command)

    if lib_env.is_corosync_conf_live:
        reporter = SimpleReportProcessor(lib_env.report_processor)
        # get nodes for communication
        node_names, node_names_reports = get_existing_nodes_names(
            corosync_conf,
            # Pcs is unable to communicate with nodes missing names. It cannot
            # send new corosync.conf to them. That might break the cluster.
            # Hence we error out.
            error_on_missing_name=True,
        )
        reporter.report_list(node_names_reports)
        if reporter.has_errors:
            raise LibraryError()
        node_targets = lib_env.get_node_target_factory().get_target_list(
            node_names,
            skip_non_existing=skip_offline_nodes,
        )

        # fix quorum options for SBD to work properly
        if sbd.atb_has_to_be_enabled(lib_env.cmd_runner(), corosync_conf):
            lib_env.report_processor.process(
                reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
            )
            corosync_conf.set_quorum_options({"auto_tie_breaker": "1"})

        # disable qdevice
        run_on_targets(
            reports.service_disable_started("corosync-qdevice"),
            qdevice_com.Disable,
            node_targets,
        )
        # stop qdevice
        run_on_targets(
            reports.service_stop_started("corosync-qdevice"),
            qdevice_com.Stop,
            node_targets,
        )
        # handle model specific configuration
        if device_model == "net":
            run_on_targets(
                reports.qdevice_certificate_removal_started(),
                qdevice_net_com.ClientDestroy,
                node_targets,
            )

    lib_env.push_corosync_conf(corosync_conf, skip_offline_nodes)
Exemplo n.º 4
0
def config_sync(env, instance_name=None, skip_offline_nodes=False):
    """
    Send specified local booth configuration to all nodes in the local cluster.

    LibraryEnvironment env
    string instance_name -- booth instance name
    skip_offline_nodes -- if True offline nodes will be skipped

    Raises LibraryError when the CIB is not live or when an error has been
    reported while collecting node names or reading the booth files.
    """
    reporter = SimpleReportProcessor(env.report_processor)
    booth_env = env.get_booth_env(instance_name)
    if not env.is_cib_live:
        raise LibraryError(
            reports.live_environment_required([file_type_codes.CIB])
        )

    node_names, node_names_reports = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    if not node_names:
        node_names_reports.append(reports.corosync_config_no_nodes_defined())
    reporter.report_list(node_names_reports)

    try:
        raw_config = booth_env.config.read_raw()
        config_facade = booth_env.config.raw_to_facade(raw_config)
        if not isinstance(booth_env.config.raw_file, GhostFile):
            authfile_name, authfile_data, authfile_reports = (
                config_files.get_authfile_name_and_data(config_facade)
            )
            reporter.report_list(authfile_reports)
        else:
            # With a ghost config file the key comes from the booth env;
            # only the file name is taken from the config.
            authfile_data = booth_env.key.read_raw()
            authfile_path = config_facade.get_authfile()
            if authfile_path:
                authfile_name = os.path.basename(authfile_path)
            else:
                authfile_name = None
    except RawFileError as error:
        reporter.report(raw_file_error_report(error))
    except ParserErrorException as error:
        reporter.report_list(
            booth_env.config.parser_exception_to_report_list(error)
        )
    if reporter.has_errors:
        raise LibraryError()

    send_command = BoothSendConfig(
        env.report_processor,
        booth_env.instance_name,
        raw_config,
        authfile=authfile_name,
        authfile_data=authfile_data,
        skip_offline_targets=skip_offline_nodes,
    )
    node_targets = env.get_node_target_factory().get_target_list(
        node_names,
        skip_non_existing=skip_offline_nodes,
    )
    send_command.set_targets(node_targets)
    run_and_raise(env.get_node_communicator(), send_command)
Exemplo n.º 5
0
def config_ticket_remove(env, ticket_name, instance_name=None):
    """
    remove a ticket from booth configuration

    LibraryEnvironment env
    string ticket_name -- the name of the ticket to be removed
    string instance_name -- booth instance name

    Raises LibraryError when validation, reading or writing of the booth
    config reported an error.
    """
    reporter = SimpleReportProcessor(env.report_processor)
    booth_env = env.get_booth_env(instance_name)
    try:
        config_facade = booth_env.config.read_to_facade()
        validation_reports = config_validators.remove_ticket(
            config_facade, ticket_name
        )
        reporter.report_list(validation_reports)
        # Do not modify or write the config when validation has failed.
        if reporter.has_errors:
            raise LibraryError()
        config_facade.remove_ticket(ticket_name)
        booth_env.config.write_facade(config_facade, can_overwrite=True)
    except RawFileError as error:
        reporter.report(raw_file_error_report(error))
    except ParserErrorException as error:
        reporter.report_list(
            booth_env.config.parser_exception_to_report_list(error)
        )
    if reporter.has_errors:
        raise LibraryError()
Exemplo n.º 6
0
def config_ticket_add(env,
                      ticket_name,
                      options,
                      instance_name=None,
                      allow_unknown_options=False):
    """
    add a ticket to booth configuration

    LibraryEnvironment env
    string ticket_name -- the name of the ticket to be created
    dict options -- options for the ticket
    string instance_name -- booth instance name
    bool allow_unknown_options -- allow using options unknown to pcs

    Raises LibraryError when validation, reading or writing of the booth
    config reported an error.
    """
    reporter = SimpleReportProcessor(env.report_processor)
    booth_env = env.get_booth_env(instance_name)
    try:
        config_facade = booth_env.config.read_to_facade()
        validation_reports = config_validators.add_ticket(
            config_facade,
            ticket_name,
            options,
            allow_unknown_options=allow_unknown_options,
        )
        reporter.report_list(validation_reports)
        # Do not modify or write the config when validation has failed.
        if reporter.has_errors:
            raise LibraryError()
        config_facade.add_ticket(ticket_name, options)
        booth_env.config.write_facade(config_facade, can_overwrite=True)
    except RawFileError as error:
        reporter.report(raw_file_error_report(error))
    except ParserErrorException as error:
        reporter.report_list(
            booth_env.config.parser_exception_to_report_list(error)
        )
    if reporter.has_errors:
        raise LibraryError()
Exemplo n.º 7
0
def config_setup(env,
                 site_list,
                 arbitrator_list,
                 instance_name=None,
                 overwrite_existing=False):
    """
    create booth configuration

    LibraryEnvironment env
    list site_list -- site addresses of multisite
    list arbitrator_list -- arbitrator addresses of multisite
    string instance_name -- booth instance name, the default instance name
        is used when not specified
    bool overwrite_existing -- allow overwriting existing files

    Raises LibraryError when validation or writing of the files reported an
    error.
    """
    if not instance_name:
        instance_name = constants.DEFAULT_INSTANCE_NAME
    reporter = SimpleReportProcessor(env.report_processor)

    # Run both validations before bailing out so all problems get reported.
    reporter.report_list(config_validators.check_instance_name(instance_name))
    reporter.report_list(config_validators.create(site_list, arbitrator_list))
    if reporter.has_errors:
        raise LibraryError()

    booth_env = env.get_booth_env(instance_name)

    config_facade = booth_env.create_facade(site_list, arbitrator_list)
    config_facade.set_authfile(booth_env.key_path)

    overwrite_report_creator = reports.get_problem_creator(
        force_code=report_codes.FORCE_FILE_OVERWRITE,
        is_forced=overwrite_existing,
    )
    try:
        # The key file is written first, the config referencing it second.
        booth_env.key.write_raw(
            tools.generate_binary_key(
                random_bytes_count=settings.booth_authkey_bytes
            ),
            can_overwrite=overwrite_existing,
        )
        booth_env.config.write_facade(
            config_facade, can_overwrite=overwrite_existing
        )
    except FileAlreadyExists as error:
        reporter.report(
            overwrite_report_creator(
                reports.file_already_exists,
                error.metadata.file_type_code,
                error.metadata.path,
            )
        )
    except RawFileError as error:
        reporter.report(raw_file_error_report(error))
    if reporter.has_errors:
        raise LibraryError()
Exemplo n.º 8
0
def remove_from_cluster(env,
                        resource_remove,
                        instance_name=None,
                        allow_remove_multiple=False):
    """
    Remove group with ip resource and booth resource

    LibraryEnvironment env -- provides all for communication with externals
    function resource_remove -- provisional hack til resources are moved to lib
    string instance_name -- booth instance name
    bool allow_remove_multiple -- remove all resources if more than one found
    """
    # TODO resource_remove is provisional hack til resources are moved to lib
    reporter = SimpleReportProcessor(env.report_processor)
    booth_env = env.get_booth_env(instance_name)
    # This command does not work with booth config files at all, let's reject
    # them then.
    _ensure_live_booth_env(booth_env)

    # The remover is obtained before the CIB is loaded and searched.
    remover = resource.get_remover(resource_remove)
    booth_elements = _find_resource_elements_for_operation(
        reporter,
        get_resources(env.get_cib()),
        booth_env,
        allow_remove_multiple,
    )
    remover(booth_elements)
Exemplo n.º 9
0
def pull_config(env, node_name, instance_name=None):
    """
    Get config from specified node and save it on local system. It will
    rewrite existing files.

    LibraryEnvironment env
    string node_name -- name of the node from which the config should be fetched
    string instance_name -- booth instance name

    Raises LibraryError when the received authfile name is rejected by the
    instance name check, when a key is missing in the node's response or
    when writing the files reported an error.
    """
    reporter = SimpleReportProcessor(env.report_processor)
    booth_env = env.get_booth_env(instance_name)
    instance_name = booth_env.instance_name
    _ensure_live_env(env, booth_env)

    env.report_processor.process(
        booth_reports.booth_fetching_config_from_node_started(
            node_name, instance_name
        )
    )
    fetch_command = BoothGetConfig(env.report_processor, instance_name)
    source_target = env.get_node_target_factory().get_target_from_hostname(
        node_name
    )
    fetch_command.set_targets([source_target])
    # pylint: disable=unsubscriptable-object
    # In general, pylint is right. And it cannot know in this case code is OK.
    # It is covered by tests.
    response = run_and_raise(env.get_node_communicator(), fetch_command)[0][1]
    try:
        # TODO adapt to new file transfer framework once it is written
        authfile_info = response["authfile"]
        if authfile_info["name"] is not None and authfile_info["data"]:
            authfile_name = authfile_info["name"]
            # The name comes from another node and is used to pick a local
            # file, so it is checked before anything is written.
            name_reports = config_validators.check_instance_name(
                authfile_name
            )
            if name_reports:
                raise LibraryError(*name_reports)
            key_file = FileInstance.for_booth_key(authfile_name)
            decoded_key = base64.b64decode(
                authfile_info["data"].encode("utf-8")
            )
            key_file.write_raw(decoded_key, can_overwrite=True)
        config_bytes = response["config"]["data"].encode("utf-8")
        booth_env.config.write_raw(config_bytes, can_overwrite=True)
        env.report_processor.process(
            booth_reports.booth_config_accepted_by_node(
                name_list=[instance_name]
            )
        )
    except RawFileError as error:
        reporter.report(raw_file_error_report(error))
    except KeyError:
        raise LibraryError(reports.invalid_response_format(node_name))
    if reporter.has_errors:
        raise LibraryError()
Exemplo n.º 10
0
def get_config(env: LibraryEnvironment) -> Mapping[str, Any]:
    """
    Return local disaster recovery config

    env -- LibraryEnvironment

    The result is a DrConfigDto converted by to_dict(). The local site is
    listed with its role and an empty node list, each remote site with its
    role and its node names.
    Raises LibraryError when loading the config reported an error.
    """
    reporter = SimpleReportProcessor(env.report_processor)
    load_reports, dr_config = _load_dr_config(env.get_dr_env().config)
    reporter.report_list(load_reports)
    if reporter.has_errors:
        raise LibraryError()

    local_site_dto = DrConfigSiteDto(dr_config.local_role, [])
    remote_site_dtos = []
    for remote_site in dr_config.get_remote_site_list():
        node_dtos = [
            DrConfigNodeDto(node_name)
            for node_name in remote_site.node_name_list
        ]
        remote_site_dtos.append(DrConfigSiteDto(remote_site.role, node_dtos))
    return DrConfigDto(local_site_dto, remote_site_dtos).to_dict()
Exemplo n.º 11
0
def send_all_config_to_node(communicator,
                            reporter,
                            target_list,
                            rewrite_existing=False,
                            skip_wrong_config=False):
    """
    Send all booth configs from default booth config directory and their
    authfiles to specified nodes.

    communicator -- NodeCommunicator
    reporter -- report processor
    target_list list -- list of targets to which configs should be delivered
    rewrite_existing -- if True rewrite existing file
    skip_wrong_config -- if True skip local configs that are unreadable

    Returns None without sending anything when no local config was read.
    Raises LibraryError when an error has been reported by the time the
    files have been sent.
    """
    simple_reporter = SimpleReportProcessor(reporter)
    configs_by_name = booth_conf.read_configs(reporter, skip_wrong_config)
    if not configs_by_name:
        return

    simple_reporter.report(reports.booth_config_distribution_started())

    outgoing_files = []
    for name, raw_config in sorted(configs_by_name.items()):
        try:
            authfile_path = config_structure.get_authfile(
                config_parser.parse(raw_config))
            # The config itself is queued before its authfile is looked at.
            config_entry = {
                "name": name,
                "data": raw_config,
                "is_authfile": False,
            }
            outgoing_files.append(config_entry)
            if not authfile_path:
                continue
            key_bytes = booth_conf.read_authfile(reporter, authfile_path)
            if not key_bytes:
                continue
            # The authfile is sent base64-encoded as text.
            authfile_entry = {
                "name": os.path.basename(authfile_path),
                "data": base64.b64encode(key_bytes).decode("utf-8"),
                "is_authfile": True,
            }
            outgoing_files.append(authfile_entry)
        except LibraryError:
            skip_report = reports.booth_skipping_config(
                name, "unable to parse config")
            simple_reporter.report(skip_report)

    save_files_cmd = BoothSaveFiles(simple_reporter,
                                    outgoing_files,
                                    rewrite_existing=rewrite_existing)
    save_files_cmd.set_targets(target_list)
    run(communicator, save_files_cmd)

    if simple_reporter.has_errors:
        raise LibraryError()
Exemplo n.º 12
0
def destroy(env: LibraryEnvironment, force_flags: Container[str] = ()) -> None:
    """
    Destroy disaster-recovery configuration on all sites

    env -- LibraryEnvironment
    force_flags -- when it contains report_codes.SKIP_OFFLINE_NODES, that is
        passed as skip_non_existing when the target list is built

    Raises LibraryError when the environment uses ghost files or when loading
    the config, collecting node names or building targets reported an error.
    """
    if env.ghost_file_codes:
        raise LibraryError(
            reports.live_environment_required(env.ghost_file_codes)
        )

    reporter = SimpleReportProcessor(env.report_processor)
    skip_offline = report_codes.SKIP_OFFLINE_NODES in force_flags

    load_reports, dr_config = _load_dr_config(env.get_dr_env().config)
    reporter.report_list(load_reports)
    if reporter.has_errors:
        raise LibraryError()

    local_nodes, local_reports = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    reporter.report_list(local_reports)
    if reporter.has_errors:
        raise LibraryError()

    # Flatten the node names of all remote sites into a single list.
    remote_nodes: List[str] = [
        node_name
        for remote_site in dr_config.get_remote_site_list()
        for node_name in remote_site.node_name_list
    ]

    node_target_factory = env.get_node_target_factory()
    target_reports, all_targets = (
        node_target_factory.get_target_list_with_reports(
            remote_nodes + local_nodes,
            skip_non_existing=skip_offline,
        )
    )
    reporter.report_list(target_reports)
    if reporter.has_errors:
        raise LibraryError()

    files_to_remove = {
        "pcs disaster-recovery config": {
            "type": "pcs_disaster_recovery_conf",
        },
    }
    remove_command = RemoveFilesWithoutForces(
        env.report_processor, files_to_remove
    )
    remove_command.set_targets(all_targets)
    run_and_raise(env.get_node_communicator(), remove_command)
Exemplo n.º 13
0
def config_text(env, instance_name=None, node_name=None):
    """
    get configuration in raw format

    LibraryEnvironment env
    string instance_name -- booth instance name
    string node_name -- get the config from specified node or local host if None

    For the local host the result of reading the raw config file is returned.
    For a remote node the received config text encoded as UTF-8 is returned.
    Raises LibraryError when reading the local file reported an error or when
    the node's response lacks the expected keys.
    """
    reporter = SimpleReportProcessor(env.report_processor)
    booth_env = env.get_booth_env(instance_name)
    instance_name = booth_env.instance_name
    # It does not make any sense for the cli to read a ghost file and send it
    # to lib so that the lib could return it unchanged to cli. Just use 'cat'.
    # When node_name is specified, using ghost files doesn't make any sense
    # either.
    _ensure_live_env(env, booth_env)

    read_locally = node_name is None
    if read_locally:
        try:
            return booth_env.config.read_raw()
        except RawFileError as error:
            reporter.report(raw_file_error_report(error))
        if reporter.has_errors:
            raise LibraryError()
        # NOTE(review): if the report above is not an error, execution
        # continues to the remote fetch below with node_name being None --
        # confirm whether this can happen.

    fetch_command = BoothGetConfig(env.report_processor, instance_name)
    source_target = env.get_node_target_factory().get_target_from_hostname(
        node_name
    )
    fetch_command.set_targets([source_target])
    # pylint: disable=unsubscriptable-object
    # In general, pylint is right. And it cannot know in this case code is OK.
    # It is covered by tests.
    response = run_and_raise(env.get_node_communicator(), fetch_command)[0][1]
    try:
        # TODO switch to new file transfer commands (not implemented yet)
        # which send and receive configs as bytes instead of strings
        return response["config"]["data"].encode("utf-8")
    except KeyError:
        raise LibraryError(reports.invalid_response_format(node_name))
Exemplo n.º 14
0
def restart(env, resource_restart, instance_name=None, allow_multiple=False):
    """
    Restart group with ip resource and booth resource

    LibraryEnvironment env -- provides all for communication with externals
    function resource_restart -- provisional hack til resources are moved to
        lib; called once per found element with a one-item list holding the
        element's id
    string instance_name -- booth instance name
    bool allow_multiple -- passed to the resource lookup; presumably allows
        restarting all resources if more than one is found
    """
    # TODO resource_restart is provisional hack til resources are moved to lib
    reporter = SimpleReportProcessor(env.report_processor)
    booth_env = env.get_booth_env(instance_name)
    _ensure_live_env(env, booth_env)

    booth_elements = _find_resource_elements_for_operation(
        reporter,
        get_resources(env.get_cib()),
        booth_env,
        allow_multiple,
    )
    for element in booth_elements:
        resource_id = element.attrib["id"]
        resource_restart([resource_id])
Exemplo n.º 15
0
def status_all_sites_plaintext(
    env: LibraryEnvironment,
    hide_inactive_resources: bool = False,
    verbose: bool = False,
) -> List[Mapping[str, Any]]:
    """
    Return local site's and all remote sites' status as plaintext

    env -- LibraryEnvironment
    hide_inactive_resources -- if True, do not display non-running resources
    verbose -- if True, display more info

    The result holds one DrSiteStatusDto converted by to_dict() per site,
    the local site first.
    Raises LibraryError when the environment uses ghost files or when loading
    the config, collecting node names or building targets reported an error.
    """

    # The command does not provide an option to skip offline / unreachable /
    # misbehaving nodes.
    # The point of such skipping is to stop a command if it is unable to make
    # changes on all nodes. The user can then decide to proceed anyway and
    # make changes on the skipped nodes later manually.
    # This command only reads from nodes so it automatically asks other nodes
    # if one is offline / misbehaving.
    class SiteData():
        # Per-site working record: where to ask and what came back.
        local: bool
        role: DrRole
        target_list: Iterable[RequestTarget]
        status_loaded: bool
        status_plaintext: str

        def __init__(self, local, role, target_list):
            self.local = local
            self.role = role
            self.target_list = target_list
            self.status_loaded = False
            self.status_plaintext = ""

    if env.ghost_file_codes:
        raise LibraryError(
            reports.live_environment_required(env.ghost_file_codes)
        )

    reporter = SimpleReportProcessor(env.report_processor)
    load_reports, dr_config = _load_dr_config(env.get_dr_env().config)
    reporter.report_list(load_reports)
    if reporter.has_errors:
        raise LibraryError()

    sites = []
    node_target_factory = env.get_node_target_factory()

    # get local nodes
    local_nodes, local_nodes_reports = get_existing_nodes_names(
        env.get_corosync_conf()
    )
    reporter.report_list(local_nodes_reports)
    local_target_reports, local_targets = (
        node_target_factory.get_target_list_with_reports(
            local_nodes,
            skip_non_existing=True,
        )
    )
    reporter.report_list(local_target_reports)
    sites.append(SiteData(True, dr_config.local_role, local_targets))

    # get remote sites' nodes
    for remote_site in dr_config.get_remote_site_list():
        remote_target_reports, remote_targets = (
            node_target_factory.get_target_list_with_reports(
                remote_site.node_name_list,
                skip_non_existing=True,
            )
        )
        reporter.report_list(remote_target_reports)
        sites.append(SiteData(False, remote_site.role, remote_targets))
    if reporter.has_errors:
        raise LibraryError()

    # get all statuses
    for site in sites:
        status_command = GetFullClusterStatusPlaintext(
            reporter,
            hide_inactive_resources=hide_inactive_resources,
            verbose=verbose,
        )
        status_command.set_targets(site.target_list)
        loaded, plaintext = run_com_cmd(
            env.get_node_communicator(), status_command
        )
        site.status_loaded = loaded
        site.status_plaintext = plaintext

    site_status_dicts = []
    for site in sites:
        site_dto = DrSiteStatusDto(
            site.local,
            site.role,
            site.status_plaintext,
            site.status_loaded,
        )
        site_status_dicts.append(site_dto.to_dict())
    return site_status_dicts
Exemplo n.º 16
0
def remove_device(lib_env, skip_offline_nodes=False):
    """
    Stop using quorum device, distribute and reload configs if live

    lib_env -- provides corosync.conf, the report processor, the command
        runner and node communication
    skip_offline_nodes -- continue even if not all nodes are accessible

    Raises LibraryError when no quorum device is defined or when collecting
    the node names reported an error.
    """
    conf = lib_env.get_corosync_conf()
    if not conf.has_quorum_device():
        raise LibraryError(reports.qdevice_not_defined())
    # The model is read here because it is needed further down, after the
    # device has already been removed from the config object.
    qdevice_model = conf.get_quorum_device_model()
    conf.remove_quorum_device()

    def announce_and_run(started_report, command_class, targets):
        # Announce the step, then run the command on the given targets.
        lib_env.report_processor.process(started_report)
        node_command = command_class(
            lib_env.report_processor, skip_offline_nodes
        )
        node_command.set_targets(targets)
        run_and_raise(lib_env.get_node_communicator(), node_command)

    if lib_env.is_corosync_conf_live:
        simple_reporter = SimpleReportProcessor(lib_env.report_processor)
        # get nodes for communication
        names, names_reports = get_existing_nodes_names(
            conf,
            # Pcs is unable to communicate with nodes missing names. It cannot
            # send new corosync.conf to them. That might break the cluster.
            # Hence we error out.
            error_on_missing_name=True
        )
        simple_reporter.report_list(names_reports)
        if simple_reporter.has_errors:
            raise LibraryError()
        targets = lib_env.get_node_target_factory().get_target_list(
            names, skip_non_existing=skip_offline_nodes,
        )

        # fix quorum options for SBD to work properly
        if sbd.atb_has_to_be_enabled(lib_env.cmd_runner(), conf):
            lib_env.report_processor.process(
                reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
            )
            conf.set_quorum_options({"auto_tie_breaker": "1"})

        # disable qdevice
        announce_and_run(
            reports.service_disable_started("corosync-qdevice"),
            qdevice_com.Disable,
            targets,
        )
        # stop qdevice
        announce_and_run(
            reports.service_stop_started("corosync-qdevice"),
            qdevice_com.Stop,
            targets,
        )
        # handle model specific configuration
        if qdevice_model == "net":
            announce_and_run(
                reports.qdevice_certificate_removal_started(),
                qdevice_net_com.ClientDestroy,
                targets,
            )

    lib_env.push_corosync_conf(conf, skip_offline_nodes)
Exemplo n.º 17
0
def add_device(
    lib_env, model, model_options, generic_options, heuristics_options,
    force_model=False, force_options=False, skip_offline_nodes=False
):
    """
    Add a quorum device to a cluster, distribute and reload configs if live

    lib_env -- provides corosync.conf, the report processor, the command
        runner and node communication
    string model -- quorum device model
    dict model_options -- model specific options
    dict generic_options -- generic quorum device options
    dict heuristics_options -- heuristics options
    bool force_model -- continue even if the model is not valid
    bool force_options -- continue even if options are not valid
    bool skip_offline_nodes -- continue even if not all nodes are accessible

    Raises LibraryError when a quorum device is already defined or when
    validation or collecting the node names reported an error.
    """
    corosync_conf = lib_env.get_corosync_conf()
    if corosync_conf.has_quorum_device():
        raise LibraryError(reports.qdevice_already_defined())

    reporter = SimpleReportProcessor(lib_env.report_processor)
    validation_reports = corosync_conf_validators.add_quorum_device(
        model,
        model_options,
        generic_options,
        heuristics_options,
        [node.nodeid for node in corosync_conf.get_nodes()],
        force_model=force_model,
        force_options=force_options,
    )
    reporter.report_list(validation_reports)

    if lib_env.is_corosync_conf_live:
        node_names, node_names_reports = get_existing_nodes_names(
            corosync_conf,
            # Pcs is unable to communicate with nodes missing names. It cannot
            # send new corosync.conf to them. That might break the cluster.
            # Hence we error out.
            error_on_missing_name=True,
        )
        reporter.report_list(node_names_reports)

    # Validation and node name problems are reported together, the config
    # is modified only when there are none.
    if reporter.has_errors:
        raise LibraryError()

    corosync_conf.add_quorum_device(
        model,
        model_options,
        generic_options,
        heuristics_options,
    )
    if corosync_conf.is_quorum_device_heuristics_enabled_with_no_exec():
        lib_env.report_processor.process(
            reports.corosync_quorum_heuristics_enabled_with_no_exec()
        )

    # First setup certificates for qdevice, then send corosync.conf to nodes.
    # If anything fails, nodes will not have corosync.conf with qdevice in it,
    # so there is no effect on the cluster.
    if lib_env.is_corosync_conf_live:
        node_target_factory = lib_env.get_node_target_factory()
        node_targets = node_target_factory.get_target_list(
            node_names, skip_non_existing=skip_offline_nodes,
        )
        # Do model specific configuration.
        # If the model is not known to pcs and was forced, do not configure
        # anything else than corosync.conf, as we do not know what to do
        # anyway.
        if model == "net":
            # We are sure the "host" key is there, it has been validated
            # above.
            qnetd_target = node_target_factory.get_target_from_hostname(
                model_options["host"]
            )
            qdevice_net.set_up_client_certificates(
                lib_env.cmd_runner(),
                lib_env.report_processor,
                lib_env.communicator_factory,
                qnetd_target,
                corosync_conf.get_cluster_name(),
                node_targets,
                skip_offline_nodes,
            )

        lib_env.report_processor.process(
            reports.service_enable_started("corosync-qdevice")
        )
        enable_command = qdevice_com.Enable(
            lib_env.report_processor, skip_offline_nodes
        )
        enable_command.set_targets(node_targets)
        run_and_raise(lib_env.get_node_communicator(), enable_command)

    # everything set up, it's safe to tell the nodes to use qdevice
    lib_env.push_corosync_conf(corosync_conf, skip_offline_nodes)

    # Now, when corosync.conf has been reloaded, we can start qdevice service.
    if lib_env.is_corosync_conf_live:
        lib_env.report_processor.process(
            reports.service_start_started("corosync-qdevice")
        )
        start_command = qdevice_com.Start(
            lib_env.report_processor, skip_offline_nodes
        )
        start_command.set_targets(node_targets)
        run_and_raise(lib_env.get_node_communicator(), start_command)
Exemplo n.º 18
0
def config_destroy(env, instance_name=None, ignore_config_load_problems=False):
    # pylint: disable=too-many-branches
    """
    Remove booth configuration files of a booth instance

    Usage of the config by a cluster resource or by a running / enabled
    systemd booth service is reported before anything is removed. The authfile
    referenced by the config is removed only when it is placed directly in the
    booth config directory; the config file itself is removed last.

    LibraryEnvironment env
    string instance_name -- booth instance name
    bool ignore_config_load_problems -- delete as much as possible when unable
            to read booth configs for the given booth instance
    Raises LibraryError as soon as a step leaves errors in the report
    processor.
    """
    reporter = SimpleReportProcessor(env.report_processor)
    booth_env = env.get_booth_env(instance_name)
    instance_name = booth_env.instance_name
    _ensure_live_env(env, booth_env)

    def report_config_used(detail):
        # TODO use constants in reports
        reporter.report(
            booth_reports.booth_config_is_used(instance_name, detail)
        )

    def raise_on_errors():
        if reporter.has_errors:
            raise LibraryError()

    cib_resources = get_resources(env.get_cib())
    if resource.find_for_config(cib_resources, booth_env.config_path):
        report_config_used("in cluster resource")
    # Only systemd is currently supported. Initd does not support multiple
    # instances (here specified by name).
    if external.is_systemctl():
        service_checks = (
            (external.is_service_running, "(running in systemd)"),
            (external.is_service_enabled, "(enabled in systemd)"),
        )
        for is_in_state, detail in service_checks:
            if is_in_state(env.cmd_runner(), "booth", instance_name):
                report_config_used(detail)
    raise_on_errors()

    # Reports about files which cannot be loaded or removed carry the destroy
    # force code and the caller's "ignore" flag.
    forceable = dict(
        force_code=report_codes.FORCE_BOOTH_DESTROY,
        is_forced_or_warning=ignore_config_load_problems,
    )

    # The authfile path stays None when the config cannot be loaded.
    authfile_path = None
    try:
        authfile_path = booth_env.config.read_to_facade().get_authfile()
    except RawFileError as e:
        reporter.report(raw_file_error_report(e, **forceable))
    except ParserErrorException as e:
        reporter.report_list(
            booth_env.config.parser_exception_to_report_list(e, **forceable)
        )
    raise_on_errors()

    if authfile_path:
        authfile_dir, authfile_name = os.path.split(authfile_path)
        if authfile_dir != settings.booth_config_dir or not authfile_name:
            # Key files outside of the booth config directory are not
            # touched, only reported.
            reporter.report(
                booth_reports.booth_unsupported_file_location(
                    authfile_path,
                    settings.booth_config_dir,
                    file_type_codes.BOOTH_KEY,
                )
            )
        else:
            try:
                key_file = FileInstance.for_booth_key(authfile_name)
                key_file.raw_file.remove(fail_if_file_not_found=False)
            except RawFileError as e:
                reporter.report(raw_file_error_report(e, **forceable))
    raise_on_errors()

    # A failure to remove the config itself is reported without a force code.
    try:
        booth_env.config.raw_file.remove()
    except RawFileError as e:
        reporter.report(raw_file_error_report(e))
    raise_on_errors()
Exemplo n.º 19
0
def send_all_config_to_node(communicator,
                            reporter,
                            target_list,
                            rewrite_existing=False,
                            skip_wrong_config=False):
    """
    Send all booth configs from the default booth config directory and their
    authfiles to the specified nodes.

    Config data is sent as utf-8 text, authfile data is sent base64 encoded.
    Nothing is sent when no config could be collected.

    communicator -- NodeCommunicator
    reporter -- report processor
    target_list list -- list of targets to which configs should be delivered
    rewrite_existing -- if True rewrite existing file
    skip_wrong_config -- if True skip local configs that are unreadable
    Raises LibraryError when errors were reported while reading the local
    files or while sending them.
    """
    # TODO adapt to new file transfer framework once it is written
    # TODO the function is not modular enough - it raises LibraryError
    report_processor = SimpleReportProcessor(reporter)
    skippable = dict(
        force_code=report_codes.SKIP_UNREADABLE_CONFIG,
        is_forced_or_warning=skip_wrong_config,
    )

    files_to_send = []
    for config_name in sorted(config_files.get_all_configs_file_names()):
        config_file = FileInstance.for_booth_config(config_name)
        try:
            raw_config = config_file.raw_file.read()
            facade = config_file.raw_to_facade(raw_config)
            authfile_name, authfile_data, authfile_reports = (
                config_files.get_authfile_name_and_data(facade)
            )
            report_processor.report_list(authfile_reports)
            entries = [{
                "name": config_name,
                "data": raw_config.decode("utf-8"),
                "is_authfile": False
            }]
            # The authfile goes along only when both its name and its
            # content are available.
            if authfile_name and authfile_data:
                encoded_key = base64.b64encode(authfile_data)
                entries.append({
                    "name": authfile_name,
                    "data": encoded_key.decode("utf-8"),
                    "is_authfile": True
                })
            files_to_send.extend(entries)
        except RawFileError as e:
            report_processor.report(raw_file_error_report(e, **skippable))
        except ParserErrorException as e:
            report_processor.report_list(
                config_file.parser_exception_to_report_list(e, **skippable)
            )
    if report_processor.has_errors:
        raise LibraryError()

    if not files_to_send:
        # no booth configs exist, nothing to be synced
        return

    report_processor.report(
        booth_reports.booth_config_distribution_started()
    )
    save_files_cmd = BoothSaveFiles(
        report_processor,
        files_to_send,
        rewrite_existing=rewrite_existing,
    )
    save_files_cmd.set_targets(target_list)
    run(communicator, save_files_cmd)

    if report_processor.has_errors:
        raise LibraryError()
Exemplo n.º 20
0
def node_add_remote(
    env,
    node_name,
    node_addr,
    operations,
    meta_attributes,
    instance_attributes,
    skip_offline_nodes=False,
    allow_incomplete_distribution=False,
    allow_pacemaker_remote_service_fail=False,
    allow_invalid_operation=False,
    allow_invalid_instance_attributes=False,
    use_default_operations=True,
    wait=False,
):
    # pylint: disable=too-many-arguments, too-many-branches, too-many-locals
    """
    Create an ocf:pacemaker:remote resource and use it as a remote node

    LibraryEnvironment env -- provides all for communication with externals
    string node_name -- the name of the new node
    mixed node_addr -- the address of the new node or None for default
    list of dict operations -- attributes for each entered operation
    dict meta_attributes -- attributes for primitive/meta_attributes
    dict instance_attributes -- attributes for primitive/instance_attributes
    bool skip_offline_nodes -- if True, ignore when some nodes are offline
    bool allow_incomplete_distribution -- if True, allow this command to
        finish successfully even if file distribution did not succeed
    bool allow_pacemaker_remote_service_fail -- if True, allow this command to
        finish successfully even if starting/enabling pacemaker_remote did not
        succeed
    bool allow_invalid_operation -- if True, allow to use operations that
        are not listed in a resource agent metadata
    bool allow_invalid_instance_attributes -- if True, allow to use instance
        attributes that are not listed in a resource agent metadata and allow to
        omit required instance_attributes
    bool use_default_operations -- if True, add operations specified in
        a resource agent metadata to the resource
    mixed wait -- a flag for controlling waiting for pacemaker idle mechanism
    Raises LibraryError when validation leaves errors in the report processor.
    """
    env.ensure_wait_satisfiable(wait)

    reporter = SimpleReportProcessor(env.report_processor)
    cib = env.get_cib()
    id_provider = IdProvider(cib)
    corosync_conf = None
    if env.is_cib_live:
        corosync_conf = env.get_corosync_conf()
    else:
        reporter.report(
            reports.corosync_node_conflict_check_skipped("not_live_cib")
        )
    existing_nodes_names, existing_nodes_addrs, nodes_report_list = (
        get_existing_nodes_names_addrs(corosync_conf, cib)
    )
    if env.is_cib_live:
        # With a non-live CIB the skipped corosync check has already been
        # announced, so reports about corosync nodes are processed only here.
        reporter.report_list(nodes_report_list)

    resource_agent = remote_node.get_agent(
        env.report_processor, env.cmd_runner()
    )

    existing_target_list = []
    if env.is_cib_live:
        target_factory = env.get_node_target_factory()
        existing_target_list, new_target_list = _get_targets_for_add(
            target_factory,
            reporter,
            existing_nodes_names,
            [node_name],
            skip_offline_nodes,
        )
        new_target = None
        if new_target_list:
            new_target = new_target_list[0]
        # default node_addr to an address from known-hosts
        if node_addr is None:
            node_addr = node_name
            if new_target:
                node_addr = new_target.first_addr
            reporter.report(
                reports.using_known_host_address_for_host(
                    node_name, node_addr
                )
            )
    elif node_addr is None:
        # default node_addr to an address from known-hosts
        known_hosts = env.get_known_hosts([node_name])
        node_addr = node_name
        if known_hosts:
            node_addr = known_hosts[0].dest.addr
        reporter.report(
            reports.using_known_host_address_for_host(node_name, node_addr)
        )

    # validate inputs
    validation_reports = remote_node.validate_create(
        existing_nodes_names,
        existing_nodes_addrs,
        resource_agent,
        node_name,
        node_addr,
        instance_attributes,
    )
    # validation + cib setup
    # TODO extract the validation to a separate function
    try:
        remote_resource_element = remote_node.create(
            env.report_processor,
            resource_agent,
            get_resources(cib),
            id_provider,
            node_addr,
            node_name,
            operations,
            meta_attributes,
            instance_attributes,
            allow_invalid_operation,
            allow_invalid_instance_attributes,
            use_default_operations,
        )
    except LibraryError as e:
        # The id conflict is checked both against nodes and by the resource
        # create. Until the validation is separated from the create, the
        # reports have to be made unique after both have run.
        codes_to_deduplicate = (
            report_codes.ID_ALREADY_EXISTS,
            report_codes.RESOURCE_INSTANCE_ATTR_VALUE_NOT_UNIQUE,
        )
        reported_ids = []
        merged_reports = []
        for report in validation_reports + list(e.args):
            if report.code in codes_to_deduplicate:
                # Keep only the first such report for each id; reports
                # lacking an id are dropped.
                if (
                    "id" not in report.info
                    or report.info["id"] in reported_ids
                ):
                    continue
                reported_ids.append(report.info["id"])
            merged_reports.append(report)
        validation_reports = merged_reports

    reporter.report_list(validation_reports)
    if reporter.has_errors:
        raise LibraryError()

    # everything validated, let's set it up
    if env.is_cib_live:
        _prepare_pacemaker_remote_environment(
            env,
            reporter,
            existing_target_list,
            new_target,
            node_name,
            skip_offline_nodes,
            allow_incomplete_distribution,
            allow_pacemaker_remote_service_fail,
        )
    else:
        reporter.report_list(
            _reports_skip_new_node(node_name, "not_live_cib")
        )

    env.push_cib(wait=wait)
    if wait:
        _ensure_resource_running(env, remote_resource_element.attrib["id"])
Exemplo n.º 21
0
def node_add_guest(
    env,
    node_name,
    resource_id,
    options,
    skip_offline_nodes=False,
    allow_incomplete_distribution=False,
    allow_pacemaker_remote_service_fail=False,
    wait=False,
):
    # pylint: disable=too-many-locals
    """
    Make a guest node from the specified resource

    LibraryEnvironment env -- provides all for communication with externals
    string node_name -- name of the guest node
    string resource_id -- specifies resource that should become a guest node
    dict options -- guest node options (remote-port, remote-addr,
        remote-connect-timeout); a missing or None remote-addr is filled in
        directly in the passed dict
    bool skip_offline_nodes -- if True, ignore when some nodes are offline
    bool allow_incomplete_distribution -- if True, allow this command to
        finish successfully even if file distribution did not succeed
    bool allow_pacemaker_remote_service_fail -- if True, allow this command to
        finish successfully even if starting/enabling pacemaker_remote did not
        succeed
    mixed wait -- a flag for controlling waiting for pacemaker idle mechanism
    Raises LibraryError when validation leaves errors in the report processor.
    """
    env.ensure_wait_satisfiable(wait)

    reporter = SimpleReportProcessor(env.report_processor)
    cib = env.get_cib()
    id_provider = IdProvider(cib)
    corosync_conf = None
    if env.is_cib_live:
        corosync_conf = env.get_corosync_conf()
    else:
        reporter.report(
            reports.corosync_node_conflict_check_skipped("not_live_cib")
        )
    existing_nodes_names, existing_nodes_addrs, nodes_report_list = (
        get_existing_nodes_names_addrs(corosync_conf, cib)
    )
    if env.is_cib_live:
        # With a non-live CIB the skipped corosync check has already been
        # announced, so reports about corosync nodes are processed only here.
        reporter.report_list(nodes_report_list)

    existing_target_list = []
    if env.is_cib_live:
        target_factory = env.get_node_target_factory()
        existing_target_list, new_target_list = _get_targets_for_add(
            target_factory,
            reporter,
            existing_nodes_names,
            [node_name],
            skip_offline_nodes,
        )
        new_target = None
        if new_target_list:
            new_target = new_target_list[0]
        # default remote-addr to an address from known-hosts
        if options.get("remote-addr") is None:
            new_addr = node_name
            if new_target:
                new_addr = new_target.first_addr
            options["remote-addr"] = new_addr
            reporter.report(
                reports.using_known_host_address_for_host(node_name, new_addr)
            )
    elif options.get("remote-addr") is None:
        # default remote-addr to an address from known-hosts
        known_hosts = env.get_known_hosts([node_name])
        new_addr = node_name
        if known_hosts:
            new_addr = known_hosts[0].dest.addr
        options["remote-addr"] = new_addr
        reporter.report(
            reports.using_known_host_address_for_host(node_name, new_addr)
        )

    # validate inputs
    validation_reports = guest_node.validate_set_as_guest(
        cib,
        existing_nodes_names,
        existing_nodes_addrs,
        node_name,
        options,
    )
    searcher = ElementSearcher(primitive.TAG, resource_id, get_resources(cib))
    if not searcher.element_found():
        validation_reports.extend(searcher.get_errors())
    else:
        resource_element = searcher.get_element()
        validation_reports.extend(
            guest_node.validate_is_not_guest(resource_element)
        )

    reporter.report_list(validation_reports)
    if reporter.has_errors:
        raise LibraryError()

    # everything validated, let's set it up
    guest_node.set_as_guest(
        resource_element,
        id_provider,
        node_name,
        options.get("remote-addr"),
        options.get("remote-port"),
        options.get("remote-connect-timeout"),
    )

    if env.is_cib_live:
        _prepare_pacemaker_remote_environment(
            env,
            reporter,
            existing_target_list,
            new_target,
            node_name,
            skip_offline_nodes,
            allow_incomplete_distribution,
            allow_pacemaker_remote_service_fail,
        )
    else:
        reporter.report_list(
            _reports_skip_new_node(node_name, "not_live_cib")
        )

    env.push_cib(wait=wait)
    if wait:
        _ensure_resource_running(env, resource_id)
Exemplo n.º 22
0
def create_in_cluster(env,
                      ip,
                      instance_name=None,
                      allow_absent_resource_agent=False):
    """
    Create a group with an ip resource and a booth resource

    LibraryEnvironment env -- provides all for communication with externals
    string ip -- float ip address for the operation of the booth
    string instance_name -- booth instance name
    bool allow_absent_resource_agent -- allowing creating booth resource even
        if its agent is not installed
    Raises LibraryError when validation leaves errors in the report processor.
    """
    reporter = SimpleReportProcessor(env.report_processor)
    booth_env = env.get_booth_env(instance_name)
    # The booth config path is stored in the CIB. A mocked booth config would
    # not work correctly, as the path would point to a mock file (and the path
    # to a mock file is unknown to us in the lib anyway).
    # Working with a mocked CIB makes sense, though. Users can do other
    # changes to the CIB and push them to the cluster at once.
    _ensure_live_booth_env(booth_env)
    resources_section = get_resources(env.get_cib())
    id_provider = IdProvider(resources_section)
    instance_name = booth_env.instance_name

    # validate
    if resource.find_for_config(resources_section, booth_env.config_path):
        reporter.report(booth_reports.booth_already_in_cib(instance_name))
    # verify the config exists and is readable
    try:
        booth_env.config.raw_file.read()
    except RawFileError as e:
        reporter.report(raw_file_error_report(e))
    if reporter.has_errors:
        raise LibraryError()
    # validation done

    cmd_runner = env.cmd_runner()
    booth_group = group.provide_group(
        resources_section,
        resource.create_resource_id(resources_section, instance_name, "group"),
    )

    def add_to_booth_group(id_suffix, agent_name, attributes):
        # Create one primitive of the given agent and place it into the
        # booth group.
        primitive_id = resource.create_resource_id(
            resources_section, instance_name, id_suffix
        )
        agent = find_valid_resource_agent_by_name(
            env.report_processor,
            cmd_runner,
            agent_name,
            allowed_absent=allow_absent_resource_agent,
        )
        group.place_resource(
            booth_group,
            primitive.create(
                env.report_processor,
                resources_section,
                id_provider,
                primitive_id,
                agent,
                instance_attributes=attributes,
            ),
        )

    add_to_booth_group("ip", "ocf:heartbeat:IPaddr2", {"ip": ip})
    add_to_booth_group(
        "service",
        "ocf:pacemaker:booth-site",
        {"config": booth_env.config_path},
    )

    env.push_cib()
Exemplo n.º 23
0
def add_device(lib_env,
               model,
               model_options,
               generic_options,
               heuristics_options,
               force_model=False,
               force_options=False,
               skip_offline_nodes=False):
    """
    Add a quorum device to a cluster, distribute and reload configs if live

    string model -- quorum device model
    dict model_options -- model specific options
    dict generic_options -- generic quorum device options
    dict heuristics_options -- heuristics options
    bool force_model -- continue even if the model is not valid
    bool force_options -- continue even if options are not valid
    bool skip_offline_nodes -- continue even if not all nodes are accessible
    Raises LibraryError when a quorum device is already defined or when
    validation leaves errors in the report processor.
    """
    cfg = lib_env.get_corosync_conf()
    if cfg.has_quorum_device():
        raise LibraryError(reports.qdevice_already_defined())

    reporter = SimpleReportProcessor(lib_env.report_processor)
    node_ids = [node.nodeid for node in cfg.get_nodes()]
    reporter.report_list(
        corosync_conf_validators.add_quorum_device(
            model,
            model_options,
            generic_options,
            heuristics_options,
            node_ids,
            force_model=force_model,
            force_options=force_options,
        )
    )

    if lib_env.is_corosync_conf_live:
        # Pcs is unable to communicate with nodes missing names. It cannot
        # send new corosync.conf to them. That might break the cluster.
        # Hence we error out.
        cluster_nodes_names, names_report_list = get_existing_nodes_names(
            cfg, error_on_missing_name=True
        )
        reporter.report_list(names_report_list)

    if reporter.has_errors:
        raise LibraryError()

    cfg.add_quorum_device(
        model,
        model_options,
        generic_options,
        heuristics_options,
    )
    if cfg.is_quorum_device_heuristics_enabled_with_no_exec():
        lib_env.report_processor.process(
            reports.corosync_quorum_heuristics_enabled_with_no_exec()
        )

    def run_qdevice_service_command(started_report, command_class):
        # Announce the service action and run it on all targets.
        lib_env.report_processor.process(started_report("corosync-qdevice"))
        service_cmd = command_class(
            lib_env.report_processor, skip_offline_nodes
        )
        service_cmd.set_targets(target_list)
        run_and_raise(lib_env.get_node_communicator(), service_cmd)

    # First setup certificates for qdevice, then send corosync.conf to nodes.
    # If anything fails, nodes will not have corosync.conf with qdevice in it,
    # so there is no effect on the cluster.
    if lib_env.is_corosync_conf_live:
        target_factory = lib_env.get_node_target_factory()
        target_list = target_factory.get_target_list(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        )
        # Do model specific configuration.
        # If the model is not known to pcs and was forced, do not configure
        # anything else than corosync.conf, as we do not know what to do
        # anyway.
        if model == "net":
            # We are sure the "host" key is there, it has been validated
            # above.
            qdevice_host_target = target_factory.get_target_from_hostname(
                model_options["host"]
            )
            qdevice_net.set_up_client_certificates(
                lib_env.cmd_runner(),
                lib_env.report_processor,
                lib_env.communicator_factory,
                qdevice_host_target,
                cfg.get_cluster_name(),
                target_list,
                skip_offline_nodes,
            )

        run_qdevice_service_command(
            reports.service_enable_started, qdevice_com.Enable
        )

    # everything set up, it's safe to tell the nodes to use qdevice
    lib_env.push_corosync_conf(cfg, skip_offline_nodes)

    # Now, when corosync.conf has been reloaded, we can start qdevice service.
    if lib_env.is_corosync_conf_live:
        run_qdevice_service_command(
            reports.service_start_started, qdevice_com.Start
        )
Exemplo n.º 24
0
def setup(env,
          cluster_name,
          nodes,
          transport_type=None,
          transport_options=None,
          link_list=None,
          compression_options=None,
          crypto_options=None,
          totem_options=None,
          quorum_options=None,
          wait=False,
          start=False,
          enable=False,
          force=False,
          force_unresolvable=False):
    """
    Set up a cluster on the specified nodes.

    The inputs are validated first. Then a possibly existing cluster is
    destroyed on the nodes and known hosts, authkey files for corosync and
    pacemaker, a pcsd SSL certificate with its key and a newly generated
    corosync.conf are distributed to all the nodes.
    Raise LibraryError on any error.

    env LibraryEnvironment
    cluster_name string -- name of a cluster to set up
    nodes list -- list of dicts, each of them represents a node.
        Supported keys are: name (required), addrs
    transport_type string -- transport type of a cluster
    transport_options dict -- transport specific options
    link_list list of dict -- list of links, depends on transport_type
    compression_options dict -- only available for transport_type == 'knet'. In
        corosync.conf they are prefixed 'knet_compression_'
    crypto_options dict -- only available for transport_type == 'knet'. In
        corosync.conf they are prefixed 'crypto_'
    totem_options dict -- options of section 'totem' in corosync.conf
    quorum_options dict -- options of section 'quorum' in corosync.conf
    wait -- specifies if the command should try to wait for the cluster to
        start up. Has no effect if start is False. If set to False the command
        will not wait for the cluster to start. If None the command will wait
        for some default timeout. If int, the timeout is set to that number of
        seconds.
    start bool -- if True start cluster when it is set up
    enable bool -- if True enable cluster when it is set up
    force bool -- if True some validations errors are treated as warnings
    force_unresolvable bool -- if True not resolvable addresses of nodes are
        treated as warnings
    """
    _ensure_live_env(env)  # raises if env is not live

    transport_options = transport_options or {}
    link_list = link_list or []
    compression_options = compression_options or {}
    crypto_options = crypto_options or {}
    totem_options = totem_options or {}
    quorum_options = quorum_options or {}
    nodes = [_normalize_dict(node, {"addrs"}) for node in nodes]

    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()

    # Get targets for all nodes and report unknown (== not-authorized) nodes.
    # If a node doesn't contain the 'name' key, validation of inputs reports it.
    # That means we don't report missing names but cannot rely on them being
    # present either.
    named_nodes = [node["name"] for node in nodes if "name" in node]
    target_report_list, target_list = (
        target_factory.get_target_list_with_reports(
            named_nodes,
            allow_skip=False,
        )
    )
    report_processor.report_list(target_report_list)

    def run_on_all_nodes(command):
        # Run the prepared command on all targets, raise on a failure.
        command.set_targets(target_list)
        run_and_raise(env.get_node_communicator(), command)

    # Use an address defined in known-hosts for each node with no addresses
    # specified. This allows users not to specify node addresses at all which
    # simplifies the whole cluster setup command / form significantly.
    targets_by_label = {target.label: target for target in target_list}
    addrs_defaulter = _get_addrs_defaulter(report_processor, targets_by_label)
    nodes = [
        _set_defaults_in_dict(node, {"addrs": addrs_defaulter})
        for node in nodes
    ]

    # Validate inputs.
    report_processor.report_list(
        config_validators.create(
            cluster_name,
            nodes,
            transport_type,
            force_unresolvable=force_unresolvable,
        )
    )
    if transport_type in corosync_constants.TRANSPORTS_KNET:
        max_link_number = max(
            (len(node["addrs"]) for node in nodes), default=0
        )
        transport_reports = config_validators.create_transport_knet(
            transport_options, compression_options, crypto_options
        )
        link_reports = config_validators.create_link_list_knet(
            link_list, max_link_number
        )
        report_processor.report_list(transport_reports + link_reports)
    elif transport_type in corosync_constants.TRANSPORTS_UDP:
        transport_reports = config_validators.create_transport_udp(
            transport_options, compression_options, crypto_options
        )
        link_reports = config_validators.create_link_list_udp(link_list)
        report_processor.report_list(transport_reports + link_reports)
    totem_reports = config_validators.create_totem(totem_options)
    # We are creating the config and we know there is no qdevice in it.
    quorum_reports = config_validators.create_quorum_options(
        quorum_options, False
    )
    report_processor.report_list(totem_reports + quorum_reports)

    # Validate flags
    wait_timeout = _get_validated_wait_timeout(report_processor, wait, start)

    # Validate the nodes
    host_info_cmd = GetHostInfo(report_processor)
    host_info_cmd.set_targets(target_list)
    host_info = run_com(env.get_node_communicator(), host_info_cmd)
    report_processor.report_list(_host_check_cluster_setup(host_info, force))

    if report_processor.has_errors:
        raise LibraryError()

    # Validation done. If errors occurred, an exception has been raised and we
    # don't get below this line.

    # Destroy cluster on all nodes.
    run_on_all_nodes(cluster.Destroy(env.report_processor))

    # Distribute auth tokens.
    target_labels = [target.label for target in target_list]
    run_on_all_nodes(
        UpdateKnownHosts(
            env.report_processor,
            known_hosts_to_add=env.get_known_hosts(target_labels),
            known_hosts_to_remove=[],
        )
    )

    # Distribute configuration files except corosync.conf. Sending
    # corosync.conf serves as a "commit" as its presence on a node marks the
    # node as a part of a cluster.
    corosync_authkey = generate_binary_key(random_bytes_count=128)
    pcmk_authkey = generate_binary_key(random_bytes_count=128)
    authkey_actions = {}
    authkey_actions.update(
        node_communication_format.corosync_authkey_file(corosync_authkey)
    )
    authkey_actions.update(
        node_communication_format.pcmk_authkey_file(pcmk_authkey)
    )
    run_on_all_nodes(
        DistributeFilesWithoutForces(env.report_processor, authkey_actions)
    )
    # TODO This should be in the previous call but so far we don't have a call
    # which allows to save and delete files at the same time.
    run_on_all_nodes(
        RemoveFilesWithoutForces(
            env.report_processor,
            {"pcsd settings": {"type": "pcsd_settings"}},
        )
    )

    # Distribute and reload pcsd SSL certificate
    report_processor.report(
        reports.pcsd_ssl_cert_and_key_distribution_started(
            [target.label for target in target_list]
        )
    )
    ssl_key_raw = ssl.generate_key()
    ssl_key = ssl.dump_key(ssl_key_raw)
    # The certificate is generated for the label of the first target.
    raw_cert = ssl.generate_cert(ssl_key_raw, target_list[0].label)
    ssl_cert = ssl.dump_cert(raw_cert)
    run_on_all_nodes(
        SendPcsdSslCertAndKey(env.report_processor, ssl_cert, ssl_key)
    )

    # Create and distribute corosync.conf. Once a node saves corosync.conf it
    # is considered to be in a cluster.
    corosync_conf = config_facade.ConfigFacade.create(
        cluster_name, nodes, transport_type
    )
    corosync_conf.set_totem_options(totem_options)
    corosync_conf.set_quorum_options(quorum_options)
    corosync_conf.create_link_list(link_list)
    if transport_type in corosync_constants.TRANSPORTS_KNET:
        corosync_conf.set_transport_knet_options(
            transport_options, compression_options, crypto_options
        )
    elif transport_type in corosync_constants.TRANSPORTS_UDP:
        corosync_conf.set_transport_udp_options(transport_options)

    run_on_all_nodes(
        DistributeFilesWithoutForces(
            env.report_processor,
            node_communication_format.corosync_conf_file(
                corosync_conf.config.export()
            ),
        )
    )

    env.report_processor.process(reports.cluster_setup_success())

    # Optionally enable and start cluster services.
    if enable:
        run_on_all_nodes(EnableCluster(env.report_processor))
    if start:
        _start_cluster(
            env.communicator_factory,
            env.report_processor,
            target_list,
            wait_timeout=wait_timeout,
        )
Exemplo n.º 25
0
def node_add_guest(
    env, node_name, resource_id, options,
    skip_offline_nodes=False,
    allow_incomplete_distribution=False,
    allow_pacemaker_remote_service_fail=False,
    wait=False,
):
    # pylint: disable=too-many-locals
    """
    Turn an existing resource into a guest node

    LibraryEnvironment env -- provides all for communication with externals
    string node_name -- name of the guest node
    string resource_id -- id of the resource which should become a guest node
    dict options -- guest node options (remote-port, remote-addr,
        remote-connect-timeout); when remote-addr is missing or None, it is
        filled in here (the dict is modified in place)
    bool skip_offline_nodes -- if True, ignore when some nodes are offline
    bool allow_incomplete_distribution -- if True, allow this command to
        finish successfully even if file distribution did not succeed
    bool allow_pacemaker_remote_service_fail -- if True, allow this command to
        finish successfully even if starting/enabling pacemaker_remote did not
        succeed
    mixed wait -- a flag for controlling waiting for pacemaker idle mechanism

    Raise LibraryError when any error has been reported during validation.
    """
    env.ensure_wait_satisfiable(wait)

    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()
    cib = env.get_cib()
    id_provider = IdProvider(cib)
    is_live = env.is_cib_live

    corosync_conf = None
    if is_live:
        corosync_conf = env.get_corosync_conf()
    else:
        report_processor.report(
            reports.corosync_node_conflict_check_skipped("not_live_cib")
        )
    existing_nodes_names, existing_nodes_addrs, report_list = (
        get_existing_nodes_names_addrs(corosync_conf, cib)
    )
    if is_live:
        # With a not-live CIB we have already announced that corosync checks
        # are skipped, so corosync related reports are passed on only when
        # working with a live CIB.
        report_processor.report_list(report_list)

    # Figure out which address to fall back to when the caller did not
    # provide remote-addr: an address from known-hosts if there is one, the
    # node name otherwise.
    addr_is_missing = options.get("remote-addr") is None
    existing_target_list = []
    new_target = None
    if is_live:
        existing_target_list, new_target_list = _get_targets_for_add(
            target_factory, report_processor, existing_nodes_names, [node_name],
            skip_offline_nodes
        )
        if new_target_list:
            new_target = new_target_list[0]
        if addr_is_missing:
            fallback_addr = new_target.first_addr if new_target else node_name
    elif addr_is_missing:
        known_hosts = env.get_known_hosts([node_name])
        fallback_addr = known_hosts[0].dest.addr if known_hosts else node_name
    if addr_is_missing:
        options["remote-addr"] = fallback_addr
        report_processor.report(
            reports.using_known_host_address_for_host(node_name, fallback_addr)
        )

    # validate inputs
    report_list = guest_node.validate_set_as_guest(
        cib,
        existing_nodes_names,
        existing_nodes_addrs,
        node_name,
        options
    )
    searcher = ElementSearcher(primitive.TAG, resource_id, get_resources(cib))
    if not searcher.element_found():
        report_list.extend(searcher.get_errors())
    else:
        resource_element = searcher.get_element()
        report_list.extend(guest_node.validate_is_not_guest(resource_element))

    report_processor.report_list(report_list)
    if report_processor.has_errors:
        raise LibraryError()

    # everything validated, let's set it up
    guest_node.set_as_guest(
        resource_element,
        id_provider,
        node_name,
        options.get("remote-addr"),
        options.get("remote-port"),
        options.get("remote-connect-timeout"),
    )

    if not is_live:
        # nodes cannot be contacted when working with a CIB from a file
        report_processor.report_list(
            _reports_skip_new_node(node_name, "not_live_cib")
        )
    else:
        _prepare_pacemaker_remote_environment(
            env,
            report_processor,
            existing_target_list,
            new_target,
            node_name,
            skip_offline_nodes,
            allow_incomplete_distribution,
            allow_pacemaker_remote_service_fail,
        )

    env.push_cib(wait=wait)
    if wait:
        _ensure_resource_running(env, resource_id)
Exemplo n.º 26
0
def add_nodes(
    env,
    nodes,
    wait=False,
    start=False,
    enable=False,
    force=False,
    force_unresolvable=False,
    skip_offline_nodes=False,
    no_watchdog_validation=False,
):
    # pylint: disable=too-many-locals
    """
    Add specified nodes to the local cluster
    Raise LibraryError on any error.

    env LibraryEnvironment
    nodes list -- list of dicts, each of them represents one node.
        Supported keys are: name (required), addrs (list), devices (list),
        watchdog
    wait -- specifies if command should try to wait for cluster to start up.
        Has no effect if start is False. If set to False command will not wait
        for cluster to start. If None command will wait for some default
        timeout. If int wait set timeout to int value of seconds.
    start bool -- if True start cluster when it is set up
    enable bool -- if True enable cluster when it is set up
    force bool -- if True some validations errors are treated as warnings
    force_unresolvable bool -- if True not resolvable addresses of nodes are
        treated as warnings
    skip_offline_nodes bool -- if True non fatal connection failures to other
        hosts are treated as warnings
    no_watchdog_validation bool -- if True do not validate specified watchdogs
        on remote hosts
    """
    _ensure_live_env(env)  # raises if env is not live

    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()
    is_sbd_enabled = sbd.is_sbd_enabled(env.cmd_runner())
    corosync_conf = env.get_corosync_conf()
    cluster_nodes_names = corosync_conf.get_nodes_names()
    corosync_node_options = {"name", "addrs"}
    sbd_node_options = {"devices", "watchdog"}

    keys_to_normalize = {"addrs"}
    if is_sbd_enabled:
        keys_to_normalize |= sbd_node_options
    new_nodes = [_normalize_dict(node, keys_to_normalize) for node in nodes]

    # get targets for existing nodes
    target_report_list, cluster_nodes_target_list = (
        target_factory.get_target_list_with_reports(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        ))
    report_processor.report_list(target_report_list)
    # Get a target for qnetd if needed. When the host is not known, a report
    # is produced and the target stays None.
    qnetd_target = None
    qdevice_model, qdevice_model_options, _, _ = (
        corosync_conf.get_quorum_device_settings())
    if qdevice_model == "net":
        try:
            qnetd_target = target_factory.get_target(
                qdevice_model_options["host"])
        except HostNotFound:
            report_processor.report(
                reports.host_not_found([qdevice_model_options["host"]]))

    # Get targets for new nodes and report unknown (== not-authorized) nodes.
    # If a node doesn't contain the 'name' key, validation of inputs reports it.
    # That means we don't report missing names but cannot rely on them being
    # present either.
    target_report_list, new_nodes_target_list = (
        target_factory.get_target_list_with_reports(
            [node["name"] for node in new_nodes if "name" in node],
            allow_skip=False,
        ))
    report_processor.report_list(target_report_list)

    # Set default values for not-specified node options.
    # Use an address defined in known-hosts for each node with no addresses
    # specified. This allows users not to specify node addresses at all which
    # simplifies the whole node add command / form significantly.
    new_nodes_target_dict = {
        target.label: target
        for target in new_nodes_target_list
    }
    addrs_defaulter = _get_addrs_defaulter(report_processor,
                                           new_nodes_target_dict)
    new_nodes_defaulters = {"addrs": addrs_defaulter}
    if is_sbd_enabled:
        watchdog_defaulter = _get_watchdog_defaulter(report_processor,
                                                     new_nodes_target_dict)
        new_nodes_defaulters["devices"] = lambda _: []
        new_nodes_defaulters["watchdog"] = watchdog_defaulter
    new_nodes = [
        _set_defaults_in_dict(node, new_nodes_defaulters) for node in new_nodes
    ]
    new_nodes_dict = {
        node["name"]: node
        for node in new_nodes if "name" in node
    }

    # Validate inputs - node options names
    # We do not want to make corosync validators know about SBD options and
    # vice versa. Therefore corosync and SBD validators get only valid corosync
    # and SBD options respectively, and we need to check for any surplus
    # options here.
    report_processor.report_list(
        validate_names_in(
            corosync_node_options | sbd_node_options,
            {option for node in new_nodes for option in node.keys()},
            option_type="node"))

    # Validate inputs - corosync part
    try:
        cib = env.get_cib()
        cib_nodes = get_remote_nodes(cib) + get_guest_nodes(cib)
    except LibraryError:
        # Not being able to load the CIB is reported as a forceable problem;
        # validation then continues without remote and guest nodes.
        cib_nodes = []
        report_processor.report(
            reports.get_problem_creator(
                report_codes.FORCE_LOAD_NODES_FROM_CIB,
                force)(reports.cib_load_error_get_nodes_for_validation))
    # corosync validator rejects non-corosync keys
    new_nodes_corosync = [{
        key: node[key]
        for key in corosync_node_options if key in node
    } for node in new_nodes]
    report_processor.report_list(
        config_validators.add_nodes(new_nodes_corosync,
                                    corosync_conf.get_nodes(),
                                    cib_nodes,
                                    force_unresolvable=force_unresolvable))

    # Validate inputs - SBD part
    if is_sbd_enabled:
        report_processor.report_list(
            sbd.validate_new_nodes_devices({
                node["name"]: node["devices"]
                for node in new_nodes if "name" in node
            }))
    else:
        for node in new_nodes:
            sbd_options = sbd_node_options.intersection(node.keys())
            if sbd_options and "name" in node:
                report_processor.report(
                    reports.sbd_not_used_cannot_set_sbd_options(
                        sbd_options, node["name"]))

    # Validate inputs - flags part
    wait_timeout = _get_validated_wait_timeout(report_processor, wait, start)

    # Get online cluster nodes
    # This is the only call in which we accept skip_offline_nodes option for the
    # cluster nodes. In all the other actions we communicate only with the
    # online nodes. This allows us to simplify code as any communication issue
    # is considered an error, ends the command processing and is not possible
    # to skip it by skip_offline_nodes. We do not have to care about a situation
    # when a communication command cannot connect to some nodes and then the
    # next command can connect but fails due to the previous one did not
    # succeed.
    online_cluster_target_list = []
    if cluster_nodes_target_list:
        com_cmd = GetOnlineTargets(
            report_processor,
            ignore_offline_targets=skip_offline_nodes,
        )
        com_cmd.set_targets(cluster_nodes_target_list)
        online_cluster_target_list = run_com(env.get_node_communicator(),
                                             com_cmd)
        offline_cluster_target_list = [
            target for target in cluster_nodes_target_list
            if target not in online_cluster_target_list
        ]
        if not online_cluster_target_list:
            report_processor.report(
                reports.unable_to_perform_operation_on_any_node())
        elif offline_cluster_target_list and skip_offline_nodes:
            # TODO: report (warn) how to fix offline nodes when they come online
            # report_processor.report(None)
            pass

    # Validate existing cluster nodes status
    atb_has_to_be_enabled = sbd.atb_has_to_be_enabled(env.cmd_runner(),
                                                      corosync_conf,
                                                      len(new_nodes))
    if atb_has_to_be_enabled:
        report_processor.report(
            reports.corosync_quorum_atb_will_be_enabled_due_to_sbd())
        if online_cluster_target_list:
            com_cmd = CheckCorosyncOffline(
                report_processor,
                allow_skip_offline=False,
            )
            com_cmd.set_targets(online_cluster_target_list)
            run_com(env.get_node_communicator(), com_cmd)

    # Validate new nodes. All new nodes have to be online.
    com_cmd = GetHostInfo(report_processor)
    com_cmd.set_targets(new_nodes_target_list)
    report_processor.report_list(
        _host_check_cluster_setup(
            run_com(env.get_node_communicator(), com_cmd),
            force,
            # version of services may not be the same across the existing
            # cluster nodes, so it's not easy to make this check properly
            check_services_versions=False,
        ))

    # Validate SBD on new nodes
    if is_sbd_enabled:
        if no_watchdog_validation:
            report_processor.report(reports.sbd_watchdog_validation_inactive())
        com_cmd = CheckSbd(report_processor)
        for new_node_target in new_nodes_target_list:
            new_node = new_nodes_dict[new_node_target.label]
            # Do not send watchdog if validation is turned off. Listing of
            # available watchdogs in pcsd may restart the machine in some
            # corner cases.
            com_cmd.add_request(
                new_node_target,
                watchdog=""
                if no_watchdog_validation else new_node["watchdog"],
                device_list=new_node["devices"],
            )
        run_com(env.get_node_communicator(), com_cmd)

    if report_processor.has_errors:
        raise LibraryError()

    # Validation done. If errors occurred, an exception has been raised and we
    # don't get below this line.

    # First set up everything else than corosync. Once the new nodes are present
    # in corosync.conf, they're considered part of a cluster and the node add
    # command cannot be run again. So we need to minimize the amount of actions
    # (and therefore possible failures) after adding the nodes to corosync.

    # distribute auth tokens of all cluster nodes (including the new ones) to
    # all new nodes
    com_cmd = UpdateKnownHosts(
        env.report_processor,
        known_hosts_to_add=env.get_known_hosts(cluster_nodes_names +
                                               list(new_nodes_dict)),
        known_hosts_to_remove=[],
    )
    com_cmd.set_targets(new_nodes_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # qdevice setup
    if qdevice_model == "net":
        qdevice_net.set_up_client_certificates(
            env.cmd_runner(),
            env.report_processor,
            env.communicator_factory,
            qnetd_target,
            corosync_conf.get_cluster_name(),
            new_nodes_target_list,
            # we don't want to allow skipping offline nodes which are being
            # added, otherwise qdevice will not work properly
            skip_offline_nodes=False,
            allow_skip_offline=False)

    # sbd setup
    if is_sbd_enabled:
        sbd_cfg = environment_file_to_dict(sbd.get_local_sbd_config())

        com_cmd = SetSbdConfig(env.report_processor)
        for new_node_target in new_nodes_target_list:
            new_node = new_nodes_dict[new_node_target.label]
            com_cmd.add_request(
                new_node_target,
                sbd.create_sbd_config(
                    sbd_cfg,
                    new_node["name"],
                    watchdog=new_node["watchdog"],
                    device_list=new_node["devices"],
                ))
        run_and_raise(env.get_node_communicator(), com_cmd)

        com_cmd = EnableSbdService(env.report_processor)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)
    else:
        com_cmd = DisableSbdService(env.report_processor)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)

    # booth setup
    booth_sync.send_all_config_to_node(
        env.get_node_communicator(),
        env.report_processor,
        new_nodes_target_list,
        rewrite_existing=force,
        skip_wrong_config=force,
    )

    # Distribute corosync and pacemaker authkeys and pcs_settings.conf.
    # pcs_settings.conf was previously synced using pcsdcli send_local_configs.
    # This has been changed temporarily until new system for distribution and
    # synchronization of configs will be introduced.
    files_action = {}
    forceable_io_error_creator = reports.get_problem_creator(
        report_codes.SKIP_FILE_DISTRIBUTION_ERRORS, force)
    # (path, open mode, function building the file distribution action,
    # file role used when reporting a read error)
    local_files_to_send = [
        (
            settings.corosync_authkey_file,
            "rb",
            node_communication_format.corosync_authkey_file,
            env_file_role_codes.COROSYNC_AUTHKEY,
        ),
        (
            settings.pacemaker_authkey_file,
            "rb",
            node_communication_format.pcmk_authkey_file,
            env_file_role_codes.PACEMAKER_AUTHKEY,
        ),
        (
            settings.pcsd_settings_conf_location,
            "r",
            node_communication_format.pcs_settings_conf_file,
            env_file_role_codes.PCS_SETTINGS_CONF,
        ),
    ]
    for file_path, open_mode, action_creator, file_role in local_files_to_send:
        # files which do not exist on the local node are silently left out
        if not os.path.isfile(file_path):
            continue
        try:
            # the context manager closes the file even when reading fails
            with open(file_path, open_mode) as local_file:
                files_action.update(action_creator(local_file.read()))
        except EnvironmentError as e:
            report_processor.report(
                forceable_io_error_creator(
                    reports.file_io_error,
                    file_role,
                    file_path=file_path,
                    operation="read",
                    reason=format_environment_error(e)))

    # stop here if one of the files could not be loaded and it was not forced
    if report_processor.has_errors:
        raise LibraryError()

    if files_action:
        com_cmd = DistributeFilesWithoutForces(env.report_processor,
                                               files_action)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)

    # Distribute and reload pcsd SSL certificate
    report_processor.report(
        reports.pcsd_ssl_cert_and_key_distribution_started(
            [target.label for target in new_nodes_target_list]))

    try:
        with open(settings.pcsd_cert_location, "r") as f:
            ssl_cert = f.read()
    except EnvironmentError as e:
        report_processor.report(
            reports.file_io_error(
                env_file_role_codes.PCSD_SSL_CERT,
                file_path=settings.pcsd_cert_location,
                reason=format_environment_error(e),
                operation="read",
            ))
    try:
        with open(settings.pcsd_key_location, "r") as f:
            ssl_key = f.read()
    except EnvironmentError as e:
        report_processor.report(
            reports.file_io_error(
                env_file_role_codes.PCSD_SSL_KEY,
                file_path=settings.pcsd_key_location,
                reason=format_environment_error(e),
                operation="read",
            ))
    # ssl_cert / ssl_key are used below only if both reads succeeded
    if report_processor.has_errors:
        raise LibraryError()

    com_cmd = SendPcsdSslCertAndKey(env.report_processor, ssl_cert, ssl_key)
    com_cmd.set_targets(new_nodes_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # When corosync >= 2 is in use, the procedure for adding a node is:
    # 1. add the new node to corosync.conf on all existing nodes
    # 2. reload corosync.conf before the new node is started
    # 3. start the new node
    # If done otherwise, membership gets broken and qdevice hangs. Cluster
    # will recover after a minute or so but still it's a wrong way.

    corosync_conf.add_nodes(new_nodes_corosync)
    if atb_has_to_be_enabled:
        corosync_conf.set_quorum_options(dict(auto_tie_breaker="1"))

    com_cmd = DistributeCorosyncConf(
        env.report_processor,
        corosync_conf.config.export(),
        allow_skip_offline=False,
    )
    com_cmd.set_targets(online_cluster_target_list + new_nodes_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    com_cmd = ReloadCorosyncConf(env.report_processor)
    com_cmd.set_targets(online_cluster_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # Optionally enable and start cluster services.
    if enable:
        com_cmd = EnableCluster(env.report_processor)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)
    if start:
        _start_cluster(
            env.communicator_factory,
            env.report_processor,
            new_nodes_target_list,
            wait_timeout=wait_timeout,
        )
Exemplo n.º 27
0
def full_cluster_status_plaintext(
    env: LibraryEnvironment,
    hide_inactive_resources: bool = False,
    verbose: bool = False,
) -> str:
    """
    Build a plaintext report of the whole cluster status

    env -- LibraryEnvironment
    hide_inactive_resources -- if True, do not display non-running resources
    verbose -- if True, display more info (tickets, and pcsd status of nodes
        when running on a live cluster)

    Raise LibraryError if only one of CIB and corosync.conf is live or if any
    error has been reported while gathering the data.
    """
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-locals

    # validation - CIB and corosync.conf must be either both live or both
    # provided from files
    cib_is_live = env.is_cib_live
    corosync_is_live = env.is_corosync_conf_live
    if corosync_is_live and not cib_is_live:
        raise LibraryError(
            reports.live_environment_not_consistent(
                [file_type_codes.CIB],
                [file_type_codes.COROSYNC_CONF],
            ))
    if cib_is_live and not corosync_is_live:
        raise LibraryError(
            reports.live_environment_not_consistent(
                [file_type_codes.COROSYNC_CONF],
                [file_type_codes.CIB],
            ))

    # initialization
    runner = env.cmd_runner()
    report_processor = SimpleReportProcessor(env.report_processor)
    live = cib_is_live and corosync_is_live
    is_sbd_running = False

    # load status, cib, corosync.conf
    status_text, warning_list = get_cluster_status_text(
        runner, hide_inactive_resources, verbose)
    corosync_conf = env.get_corosync_conf()
    cib = env.get_cib()
    if verbose:
        ticket_status_text, ticket_status_stderr, ticket_status_retval = (
            get_ticket_status_text(runner))
    # get extra info if live
    if live:
        try:
            is_sbd_running = is_service_running(runner, get_sbd_service_name())
        except LibraryError:
            # best effort only, SBD is considered not running in this case
            pass
        local_services_status = _get_local_services_status(runner)
        if verbose:
            node_name_list, node_names_report_list = get_existing_nodes_names(
                corosync_conf)
            report_processor.report_list(node_names_report_list)
            node_reachability = _get_node_reachability(
                env.get_node_target_factory(),
                env.get_node_communicator(),
                report_processor,
                node_name_list,
            )

    # check stonith configuration
    warning_list = [
        *warning_list,
        *_stonith_warnings(cib, is_sbd_running),
    ]

    # put it all together
    if report_processor.has_errors:
        raise LibraryError()

    output_lines = [f"Cluster name: {corosync_conf.get_cluster_name()}"]
    if warning_list:
        output_lines += ["", "WARNINGS:", *warning_list, ""]
    output_lines.append(status_text)
    if verbose:
        output_lines += ["", "Tickets:"]
        if ticket_status_retval == 0:
            output_lines += indent(ticket_status_text.splitlines())
        else:
            ticket_warning_lines = [
                "WARNING: Unable to get information about tickets"
            ]
            if ticket_status_stderr:
                ticket_warning_lines += indent(
                    ticket_status_stderr.splitlines())
            output_lines += indent(ticket_warning_lines)
    if live:
        if verbose:
            output_lines += ["", "PCSD Status:"]
            output_lines += indent(
                _format_node_reachability(node_name_list, node_reachability))
        output_lines += ["", "Daemon Status:"]
        output_lines += indent(
            _format_local_services_status(local_services_status))
    return "\n".join(output_lines)
Exemplo n.º 28
0
def remove_nodes(env, node_list, force_quorum_loss=False, skip_offline=False):
    """
    Remove the specified nodes from the local cluster.
    Raise LibraryError when validation fails or a required step does not
    succeed.

    env LibraryEnvironment
    node_list iterable -- names of nodes to remove
    force_quorum_loss bool -- treat quorum loss as a warning if True
    skip_offline bool -- treat unreachable nodes as warnings if True
    """
    _ensure_live_env(env)  # raises if env is not live

    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()
    corosync_conf = env.get_corosync_conf()
    cluster_nodes_names = corosync_conf.get_nodes_names()

    # validations

    validation_reports = config_validators.remove_nodes(
        node_list,
        corosync_conf.get_nodes(),
        corosync_conf.get_quorum_device_settings(),
    )
    report_processor.report_list(validation_reports)
    if report_processor.has_errors:
        # If there is an error, there is usually not much sense in doing other
        # validations:
        # - if there would be no node left in the cluster, it's pointless
        #   to check for quorum loss or if at least one remaining node is online
        # - if only one node is being removed and it doesn't exist, it's again
        #   pointless to check for other issues
        raise LibraryError()

    target_report_list, cluster_nodes_target_list = (
        target_factory.get_target_list_with_reports(
            cluster_nodes_names,
            skip_non_existing=skip_offline,
        ))
    # cluster nodes for which no target has been obtained
    known_nodes = {target.label for target in cluster_nodes_target_list}
    unknown_nodes = {
        name for name in cluster_nodes_names if name not in known_nodes
    }
    report_processor.report_list(target_report_list)

    online_check_cmd = GetOnlineTargets(
        report_processor,
        ignore_offline_targets=skip_offline,
    )
    online_check_cmd.set_targets(cluster_nodes_target_list)
    online_target_list = run_com(
        env.get_node_communicator(), online_check_cmd)
    offline_target_list = [
        target for target in cluster_nodes_target_list
        if target not in online_target_list
    ]
    staying_online_target_list = [
        target for target in online_target_list
        if target.label not in node_list
    ]
    targets_to_remove = [
        target for target in cluster_nodes_target_list
        if target.label in node_list
    ]
    if not staying_online_target_list:
        report_processor.report(
            reports.unable_to_connect_to_any_remaining_node())
        # If no remaining node is online, there is no point in checking quorum
        # loss or anything as we would just get errors.
        raise LibraryError()

    if skip_offline:
        # remaining nodes which will not receive the updated configuration
        staying_offline_nodes = [
            target.label
            for target in offline_target_list if target.label not in node_list
        ]
        staying_offline_nodes.extend(
            name for name in unknown_nodes if name not in node_list)
        if staying_offline_nodes:
            report_processor.report(
                reports.unable_to_connect_to_all_remaining_node(
                    staying_offline_nodes))

    atb_has_to_be_enabled = sbd.atb_has_to_be_enabled(env.cmd_runner(),
                                                      corosync_conf,
                                                      -len(node_list))
    if atb_has_to_be_enabled:
        report_processor.report(
            reports.corosync_quorum_atb_will_be_enabled_due_to_sbd())
        corosync_offline_cmd = CheckCorosyncOffline(
            report_processor,
            allow_skip_offline=False,
        )
        corosync_offline_cmd.set_targets(staying_online_target_list)
        run_com(env.get_node_communicator(), corosync_offline_cmd)
    else:
        # Check if removing the nodes would cause quorum loss. We ask the nodes
        # to be removed for their view of quorum. If they are all stopped or
        # not in a quorate partition, their removal cannot cause quorum loss.
        # That's why we ask them and not the remaining nodes.
        # example: 5-node cluster, 3 online nodes, removing one online node,
        # results in 4-node cluster with 2 online nodes => quorum lost
        # Check quorum loss only if ATB does not need to be enabled. If it is
        # required, cluster has to be turned off and therefore it loses quorum.
        forceable_report_creator = reports.get_problem_creator(
            report_codes.FORCE_QUORUM_LOSS, force_quorum_loss)
        quorum_cmd = cluster.GetQuorumStatus(report_processor)
        quorum_cmd.set_targets(targets_to_remove)
        failures, quorum_status = run_com(
            env.get_node_communicator(), quorum_cmd)
        if quorum_status:
            if quorum_status.stopping_nodes_cause_quorum_loss(node_list):
                report_processor.report(
                    forceable_report_creator(
                        reports.corosync_quorum_will_be_lost))
        elif failures or not targets_to_remove:
            report_processor.report(
                forceable_report_creator(
                    reports.corosync_quorum_loss_unable_to_check, ))

    if report_processor.has_errors:
        raise LibraryError()

    # validations done

    # nodes to be removed which have no target cannot be contacted; report them
    unknown_to_remove = [name for name in unknown_nodes if name in node_list]
    if unknown_to_remove:
        report_processor.report(
            reports.nodes_to_remove_unreachable(unknown_to_remove))
    if targets_to_remove:
        destroy_cmd = cluster.DestroyWarnOnFailure(report_processor)
        destroy_cmd.set_targets(targets_to_remove)
        run_and_raise(env.get_node_communicator(), destroy_cmd)

    corosync_conf.remove_nodes(node_list)
    if atb_has_to_be_enabled:
        corosync_conf.set_quorum_options({"auto_tie_breaker": "1"})

    distribute_cmd = DistributeCorosyncConf(
        env.report_processor,
        corosync_conf.config.export(),
        allow_skip_offline=False,
    )
    distribute_cmd.set_targets(staying_online_target_list)
    run_and_raise(env.get_node_communicator(), distribute_cmd)

    reload_cmd = ReloadCorosyncConf(env.report_processor)
    reload_cmd.set_targets(staying_online_target_list)
    run_and_raise(env.get_node_communicator(), reload_cmd)

    # try to remove nodes from pcmk using crm_node -R <node> --force and if not
    # successful remove it directly from CIB file on all nodes in parallel
    cib_cleanup_cmd = RemoveNodesFromCib(env.report_processor, node_list)
    cib_cleanup_cmd.set_targets(staying_online_target_list)
    run_and_raise(env.get_node_communicator(), cib_cleanup_cmd)
Exemplo n.º 29
0
def set_recovery_site(env: LibraryEnvironment, node_name: str) -> None:
    """
    Set up disaster recovery with the local cluster being the primary site

    The node names of the recovery cluster are read from the corosync.conf
    fetched from node_name. A DR config is then distributed to every node of
    the recovery cluster first and to every node of the local cluster second.

    env -- library environment, must not work with ghost files
    node_name -- a known host from the recovery site

    Raises LibraryError when the environment is not live, when any validation
    error has been reported, or when a node communication command fails.
    """
    if env.ghost_file_codes:
        raise LibraryError(
            reports.live_environment_required(env.ghost_file_codes)
        )

    reporter = SimpleReportProcessor(env.report_processor)
    dr_env = env.get_dr_env()
    if dr_env.config.raw_file.exists():
        reporter.report(reports.dr_config_already_exist())
    target_factory = env.get_node_target_factory()

    # names of nodes of the local cluster
    local_node_names, name_reports = get_existing_nodes_names(
        env.get_corosync_conf(),
        error_on_missing_name=True,
    )
    reporter.report_list(name_reports)

    if node_name in local_node_names:
        reporter.report(reports.node_in_local_cluster(node_name))

    # targets of all local nodes
    target_reports, local_targets = (
        target_factory.get_target_list_with_reports(
            local_node_names,
            allow_skip=False,
            report_none_host_found=False,
        )
    )
    reporter.report_list(target_reports)

    # target of the single specified node of the recovery site
    target_reports, entry_targets = (
        target_factory.get_target_list_with_reports(
            [node_name],
            allow_skip=False,
            report_none_host_found=False,
        )
    )
    reporter.report_list(target_reports)

    # all checks above are collected first and evaluated at once
    if reporter.has_errors:
        raise LibraryError()

    # read corosync.conf of the recovery cluster to learn its node names
    get_conf_cmd = GetCorosyncConf(env.report_processor)
    get_conf_cmd.set_targets(entry_targets)
    remote_conf_text = run_and_raise(env.get_node_communicator(), get_conf_cmd)
    remote_conf_facade = CorosyncConfigFacade.from_string(remote_conf_text)
    remote_node_names, name_reports = get_existing_nodes_names(
        remote_conf_facade,
        error_on_missing_name=True,
    )
    if reporter.report_list(name_reports):
        raise LibraryError()

    # ensure we have tokens for all nodes of remote cluster
    target_reports, remote_targets = (
        target_factory.get_target_list_with_reports(
            remote_node_names,
            allow_skip=False,
            report_none_host_found=False,
        )
    )
    if reporter.report_list(target_reports):
        raise LibraryError()

    exporter = get_file_toolbox(file_type_codes.PCS_DR_CONFIG).exporter

    # Each step is (role of the cluster receiving the config, role of the
    # other site, node names of the other site, nodes receiving the config).
    # The order matters: the remote cluster gets its config before the local
    # cluster does.
    distribution_plan = (
        # dr config for the remote cluster, sent to all nodes of that cluster
        (DrRole.RECOVERY, DrRole.PRIMARY, local_node_names, remote_targets),
        # new dr config with the local cluster as the primary site
        (DrRole.PRIMARY, DrRole.RECOVERY, remote_node_names, local_targets),
    )
    for own_role, other_role, other_site_nodes, receivers in distribution_plan:
        dr_cfg = dr_env.create_facade(own_role)
        dr_cfg.add_site(other_role, other_site_nodes)
        send_cmd = DistributeFilesWithoutForces(
            env.report_processor,
            node_communication_format.pcs_dr_config_file(
                exporter.export(dr_cfg.config)
            ),
        )
        send_cmd.set_targets(receivers)
        run_and_raise(env.get_node_communicator(), send_cmd)
Exemplo n.º 30
0
def node_add_remote(
    env, node_name, node_addr, operations, meta_attributes, instance_attributes,
    skip_offline_nodes=False,
    allow_incomplete_distribution=False,
    allow_pacemaker_remote_service_fail=False,
    allow_invalid_operation=False,
    allow_invalid_instance_attributes=False,
    use_default_operations=True,
    wait=False,
):
    # pylint: disable=too-many-arguments, too-many-branches, too-many-locals
    """
    Add a remote node by creating an ocf:pacemaker:remote resource for it

    With a live CIB the new node is also prepared for running as a pacemaker
    remote node; with a CIB supplied from a file that step is skipped and
    only reported. All validation reports are collected and LibraryError is
    raised if any of them is an error.

    LibraryEnvironment env -- provides all for communication with externals
    string node_name -- the name of the new node
    mixed node_addr -- the address of the new node; when None, an address
        from known-hosts is used, falling back to node_name
    list of dict operations -- attributes for each entered operation
    dict meta_attributes -- attributes for primitive/meta_attributes
    dict instance_attributes -- attributes for primitive/instance_attributes
    bool skip_offline_nodes -- if True, ignore when some nodes are offline
    bool allow_incomplete_distribution -- if True, allow this command to
        finish successfully even if file distribution did not succeed
    bool allow_pacemaker_remote_service_fail -- if True, allow this command to
        finish successfully even if starting/enabling pacemaker_remote did not
        succeed
    bool allow_invalid_operation -- if True, allow to use operations that
        are not listed in a resource agent metadata
    bool allow_invalid_instance_attributes -- if True, allow to use instance
        attributes that are not listed in a resource agent metadata and allow to
        omit required instance_attributes
    bool use_default_operations -- if True, add operations specified in
        a resource agent metadata to the resource
    mixed wait -- a flag for controlling waiting for pacemaker idle mechanism
    """
    env.ensure_wait_satisfiable(wait)

    reporter = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()
    cib = env.get_cib()
    id_provider = IdProvider(cib)
    if not env.is_cib_live:
        # corosync.conf is not consulted when the CIB comes from a file
        corosync_conf = None
        reporter.report(
            reports.corosync_node_conflict_check_skipped("not_live_cib")
        )
    else:
        corosync_conf = env.get_corosync_conf()
    known_names, known_addrs, nodes_reports = get_existing_nodes_names_addrs(
        corosync_conf, cib
    )
    if env.is_cib_live:
        # For a non-live CIB we have already announced that corosync checks
        # are skipped, so reports concerning corosync nodes are passed on
        # only in the live case.
        reporter.report_list(nodes_reports)

    resource_agent = remote_node.get_agent(
        env.report_processor, env.cmd_runner()
    )

    existing_target_list = []
    if env.is_cib_live:
        existing_target_list, new_target_list = _get_targets_for_add(
            target_factory,
            reporter,
            known_names,
            [node_name],
            skip_offline_nodes,
        )
        new_target = None
        if new_target_list:
            new_target = new_target_list[0]
        # no address given: take the one stored in known-hosts if available
        if node_addr is None:
            if new_target:
                node_addr = new_target.first_addr
            else:
                node_addr = node_name
            reporter.report(
                reports.using_known_host_address_for_host(node_name, node_addr)
            )
    elif node_addr is None:
        # no address given: take the one stored in known-hosts if available
        known_hosts = env.get_known_hosts([node_name])
        if known_hosts:
            node_addr = known_hosts[0].dest.addr
        else:
            node_addr = node_name
        reporter.report(
            reports.using_known_host_address_for_host(node_name, node_addr)
        )

    # check the inputs against already existing nodes
    report_list = remote_node.validate_create(
        known_names,
        known_addrs,
        resource_agent,
        node_name,
        node_addr,
        instance_attributes,
    )
    # The resource creation validates as well as modifies the CIB.
    # TODO extract the validation to a separate function
    try:
        remote_resource_element = remote_node.create(
            env.report_processor,
            resource_agent,
            get_resources(cib),
            id_provider,
            node_addr,
            node_name,
            operations,
            meta_attributes,
            instance_attributes,
            allow_invalid_operation,
            allow_invalid_instance_attributes,
            use_default_operations,
        )
    except LibraryError as e:
        # Both the node validation above and the resource creation may
        # complain about the same conflicting id. Until the validation is
        # separated from the creation, the reports must be made unique here,
        # after the fact.
        conflict_codes = (
            report_codes.ID_ALREADY_EXISTS,
            report_codes.RESOURCE_INSTANCE_ATTR_VALUE_NOT_UNIQUE,
        )
        reported_ids = []
        merged_reports = []
        for item in report_list + list(e.args):
            if item.code in conflict_codes:
                # keep only the first conflict report of each id; conflict
                # reports carrying no id are left out
                if "id" not in item.info or item.info["id"] in reported_ids:
                    continue
                reported_ids.append(item.info["id"])
            merged_reports.append(item)
        report_list = merged_reports

    reporter.report_list(report_list)
    if reporter.has_errors:
        raise LibraryError()

    # validation passed, do the actual work
    if not env.is_cib_live:
        reporter.report_list(
            _reports_skip_new_node(node_name, "not_live_cib")
        )
    else:
        _prepare_pacemaker_remote_environment(
            env,
            reporter,
            existing_target_list,
            new_target,
            node_name,
            skip_offline_nodes,
            allow_incomplete_distribution,
            allow_pacemaker_remote_service_fail,
        )

    env.push_cib(wait=wait)
    if wait:
        resource_id = remote_resource_element.attrib["id"]
        _ensure_resource_running(env, resource_id)