def config_ticket_remove(env, ticket_name, instance_name=None):
    """
    Remove an existing ticket from a booth configuration.

    LibraryEnvironment env
    string ticket_name -- the name of the ticket to be removed
    string instance_name -- booth instance name
    """
    reporter = SimpleReportProcessor(env.report_processor)
    booth_instance = env.get_booth_env(instance_name)
    try:
        conf_facade = booth_instance.config.read_to_facade()
        validation_reports = config_validators.remove_ticket(
            conf_facade, ticket_name)
        reporter.report_list(validation_reports)
        if reporter.has_errors:
            raise LibraryError()
        conf_facade.remove_ticket(ticket_name)
        booth_instance.config.write_facade(conf_facade, can_overwrite=True)
    except RawFileError as err:
        reporter.report(raw_file_error_report(err))
    except ParserErrorException as err:
        reporter.report_list(
            booth_instance.config.parser_exception_to_report_list(err))
    # any error collected above (validation or file handling) is fatal
    if reporter.has_errors:
        raise LibraryError()
def config_ticket_add(
    env, ticket_name, options, instance_name=None,
    allow_unknown_options=False
):
    """
    Add a new ticket to a booth configuration.

    LibraryEnvironment env
    string ticket_name -- the name of the ticket to be created
    dict options -- options for the ticket
    string instance_name -- booth instance name
    bool allow_unknown_options -- allow using options unknown to pcs
    """
    reporter = SimpleReportProcessor(env.report_processor)
    booth_instance = env.get_booth_env(instance_name)
    try:
        conf_facade = booth_instance.config.read_to_facade()
        reporter.report_list(config_validators.add_ticket(
            conf_facade,
            ticket_name,
            options,
            allow_unknown_options=allow_unknown_options,
        ))
        if reporter.has_errors:
            raise LibraryError()
        conf_facade.add_ticket(ticket_name, options)
        booth_instance.config.write_facade(conf_facade, can_overwrite=True)
    except RawFileError as err:
        reporter.report(raw_file_error_report(err))
    except ParserErrorException as err:
        reporter.report_list(
            booth_instance.config.parser_exception_to_report_list(err))
    # any error collected above (validation or file handling) is fatal
    if reporter.has_errors:
        raise LibraryError()
def synchronize_ssl_certificate(env, skip_offline=False):
    """
    Send the local pcsd SSL cert and key to all full nodes in the local
    cluster.

    Consider the pcs Web UI is accessed via an IP running as a resource in
    the cluster. When the IP is moved, the user's browser connects to the new
    node and we want it to get the same certificate to make the transition a
    seamless experience (otherwise the browser displays a warning that the
    certificate has changed). Using pcsd Web UI on remote and guest nodes is
    not supported (pcs/pcsd depends on the corosync.conf file being present
    on the local node) so we send the cert only to corosync (== full stack)
    nodes.
    """
    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()
    cluster_nodes_names, report_list = get_existing_nodes_names(
        env.get_corosync_conf())
    if not cluster_nodes_names:
        report_list.append(reports.corosync_config_no_nodes_defined())
    report_processor.report_list(report_list)
    # Read the local cert and key. If either read fails, the error is only
    # reported here; the has_errors check below raises LibraryError before
    # ssl_cert / ssl_key could be referenced unbound.
    try:
        with open(settings.pcsd_cert_location, "r") as file:
            ssl_cert = file.read()
    except EnvironmentError as e:
        report_processor.report(
            reports.file_io_error(
                env_file_role_codes.PCSD_SSL_CERT,
                file_path=settings.pcsd_cert_location,
                reason=format_environment_error(e),
                operation="read",
            ))
    try:
        with open(settings.pcsd_key_location, "r") as file:
            ssl_key = file.read()
    except EnvironmentError as e:
        report_processor.report(
            reports.file_io_error(
                env_file_role_codes.PCSD_SSL_KEY,
                file_path=settings.pcsd_key_location,
                reason=format_environment_error(e),
                operation="read",
            ))
    # Resolve communication targets; unknown nodes are skipped only when
    # skip_offline was requested by the caller.
    target_report_list, target_list = (
        target_factory.get_target_list_with_reports(
            cluster_nodes_names, skip_non_existing=skip_offline))
    report_processor.report_list(target_report_list)
    if report_processor.has_errors:
        raise LibraryError()
    env.report_processor.process(
        reports.pcsd_ssl_cert_and_key_distribution_started(
            [target.label for target in target_list]))
    # Distribute cert + key to all resolved targets in one command.
    com_cmd = SendPcsdSslCertAndKey(env.report_processor, ssl_cert, ssl_key)
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)
def send_all_config_to_node(
    communicator, reporter, target_list,
    rewrite_existing=False, skip_wrong_config=False
):
    """
    Send every booth config from the default booth config directory, along
    with their authfiles, to the specified targets.

    communicator -- NodeCommunicator
    reporter -- report processor
    target_list list -- list of targets to which configs should be delivered
    rewrite_existing -- if True rewrite existing file
    skip_wrong_config -- if True skip local configs that are unreadable
    """
    wrapped_reporter = SimpleReportProcessor(reporter)
    configs = booth_conf.read_configs(reporter, skip_wrong_config)
    if not configs:
        # nothing to distribute
        return
    wrapped_reporter.report(reports.booth_config_distribution_started())
    files_to_send = []
    for name, data in sorted(configs.items()):
        try:
            parsed = config_parser.parse(data)
            authfile_path = config_structure.get_authfile(parsed)
            files_to_send.append(
                {"name": name, "data": data, "is_authfile": False}
            )
            if authfile_path:
                authfile_content = booth_conf.read_authfile(
                    reporter, authfile_path)
                if not authfile_content:
                    continue
                files_to_send.append({
                    "name": os.path.basename(authfile_path),
                    "data": base64.b64encode(
                        authfile_content).decode("utf-8"),
                    "is_authfile": True,
                })
        except LibraryError:
            wrapped_reporter.report(reports.booth_skipping_config(
                name, "unable to parse config"
            ))
    com_cmd = BoothSaveFiles(
        wrapped_reporter, files_to_send, rewrite_existing=rewrite_existing
    )
    com_cmd.set_targets(target_list)
    run(communicator, com_cmd)
    if wrapped_reporter.has_errors:
        raise LibraryError()
def send_all_config_to_node(
    communicator, reporter, target_list,
    rewrite_existing=False, skip_wrong_config=False
):
    """
    Distribute all booth configs found in the default booth config directory
    and their authfiles to the specified targets.

    communicator -- NodeCommunicator
    reporter -- report processor
    target_list list -- list of targets to which configs should be delivered
    rewrite_existing -- if True rewrite existing file
    skip_wrong_config -- if True skip local configs that are unreadable
    """
    local_reporter = SimpleReportProcessor(reporter)
    all_configs = booth_conf.read_configs(reporter, skip_wrong_config)
    if not all_configs:
        # no booth configs on this host, nothing to do
        return
    local_reporter.report(reports.booth_config_distribution_started())
    payload = []
    for conf_name, conf_data in sorted(all_configs.items()):
        try:
            auth_path = config_structure.get_authfile(
                config_parser.parse(conf_data))
            payload.append({
                "name": conf_name,
                "data": conf_data,
                "is_authfile": False,
            })
            if auth_path:
                auth_content = booth_conf.read_authfile(reporter, auth_path)
                if not auth_content:
                    continue
                payload.append({
                    "name": os.path.basename(auth_path),
                    "data": base64.b64encode(auth_content).decode("utf-8"),
                    "is_authfile": True,
                })
        except LibraryError:
            local_reporter.report(reports.booth_skipping_config(
                conf_name, "unable to parse config"))
    com_cmd = BoothSaveFiles(
        local_reporter, payload, rewrite_existing=rewrite_existing)
    com_cmd.set_targets(target_list)
    run(communicator, com_cmd)
    if local_reporter.has_errors:
        raise LibraryError()
def config_sync(env, instance_name=None, skip_offline_nodes=False):
    """
    Send specified local booth configuration to all nodes in the local
    cluster.

    LibraryEnvironment env
    string instance_name -- booth instance name
    skip_offline_nodes -- if True offline nodes will be skipped
    """
    report_processor = SimpleReportProcessor(env.report_processor)
    booth_env = env.get_booth_env(instance_name)
    # a live CIB is required to reach the cluster nodes
    if not env.is_cib_live:
        raise LibraryError(
            reports.live_environment_required([file_type_codes.CIB], ))
    cluster_nodes_names, report_list = get_existing_nodes_names(
        env.get_corosync_conf())
    if not cluster_nodes_names:
        report_list.append(reports.corosync_config_no_nodes_defined())
    report_processor.report_list(report_list)
    try:
        booth_conf_data = booth_env.config.read_raw()
        booth_conf = booth_env.config.raw_to_facade(booth_conf_data)
        if isinstance(booth_env.config.raw_file, GhostFile):
            # config was passed in from the caller (ghost file), so the key
            # must be read from the accompanying ghost key as well
            authfile_data = booth_env.key.read_raw()
            authfile_path = booth_conf.get_authfile()
            authfile_name = (
                os.path.basename(authfile_path) if authfile_path else None)
        else:
            authfile_name, authfile_data, authfile_report_list = (
                config_files.get_authfile_name_and_data(booth_conf))
            report_processor.report_list(authfile_report_list)
    except RawFileError as e:
        report_processor.report(raw_file_error_report(e))
    except ParserErrorException as e:
        report_processor.report_list(
            booth_env.config.parser_exception_to_report_list(e))
    # on any read/parse error we raise here, before booth_conf_data /
    # authfile_name / authfile_data (bound only in the try) are used below
    if report_processor.has_errors:
        raise LibraryError()
    com_cmd = BoothSendConfig(
        env.report_processor,
        booth_env.instance_name,
        booth_conf_data,
        authfile=authfile_name,
        authfile_data=authfile_data,
        skip_offline_targets=skip_offline_nodes)
    com_cmd.set_targets(env.get_node_target_factory().get_target_list(
        cluster_nodes_names,
        skip_non_existing=skip_offline_nodes,
    ))
    run_and_raise(env.get_node_communicator(), com_cmd)
def config_setup(
    env, site_list, arbitrator_list, instance_name=None,
    overwrite_existing=False
):
    """
    Create a booth configuration along with its authkey file.

    LibraryEnvironment env
    list site_list -- site addresses of multisite
    list arbitrator_list -- arbitrator addresses of multisite
    string instance_name -- booth instance name
    bool overwrite_existing -- allow overwriting existing files
    """
    instance_name = instance_name or constants.DEFAULT_INSTANCE_NAME
    reporter = SimpleReportProcessor(env.report_processor)
    reporter.report_list(
        config_validators.check_instance_name(instance_name))
    reporter.report_list(
        config_validators.create(site_list, arbitrator_list))
    if reporter.has_errors:
        raise LibraryError()

    booth_instance = env.get_booth_env(instance_name)
    conf_facade = booth_instance.create_facade(site_list, arbitrator_list)
    conf_facade.set_authfile(booth_instance.key_path)

    # existing-file errors are forceable via FORCE_FILE_OVERWRITE
    problem_creator = reports.get_problem_creator(
        force_code=report_codes.FORCE_FILE_OVERWRITE,
        is_forced=overwrite_existing)
    try:
        key_content = tools.generate_binary_key(
            random_bytes_count=settings.booth_authkey_bytes)
        booth_instance.key.write_raw(
            key_content, can_overwrite=overwrite_existing)
        booth_instance.config.write_facade(
            conf_facade, can_overwrite=overwrite_existing)
    except FileAlreadyExists as err:
        reporter.report(problem_creator(
            reports.file_already_exists,
            err.metadata.file_type_code,
            err.metadata.path,
        ))
    except RawFileError as err:
        reporter.report(raw_file_error_report(err))
    if reporter.has_errors:
        raise LibraryError()
def pull_config(env, node_name, instance_name=None):
    """
    Get config from specified node and save it on local system. It will
    rewrite existing files.

    LibraryEnvironment env
    string node_name -- name of the node from which the config should be
        fetched
    string instance_name -- booth instance name
    """
    report_processor = SimpleReportProcessor(env.report_processor)
    booth_env = env.get_booth_env(instance_name)
    instance_name = booth_env.instance_name
    _ensure_live_env(env, booth_env)
    env.report_processor.process(
        booth_reports.booth_fetching_config_from_node_started(
            node_name, instance_name))
    com_cmd = BoothGetConfig(env.report_processor, instance_name)
    com_cmd.set_targets(
        [env.get_node_target_factory().get_target_from_hostname(node_name)])
    # pylint: disable=unsubscriptable-object
    # In general, pylint is right. And it cannot know in this case code is OK.
    # It is covered by tests.
    output = run_and_raise(env.get_node_communicator(), com_cmd)[0][1]
    try:
        # TODO adapt to new file transfer framework once it is written
        if (output["authfile"]["name"] is not None
                and output["authfile"]["data"]):
            authfile_name = output["authfile"]["name"]
            # The remote-supplied authfile name is validated with the same
            # rules as instance names before being used to build a local
            # file path — presumably to reject names that would escape the
            # booth config directory; TODO confirm against the validator.
            report_list = config_validators.check_instance_name(authfile_name)
            if report_list:
                raise LibraryError(*report_list)
            booth_key = FileInstance.for_booth_key(authfile_name)
            booth_key.write_raw(
                base64.b64decode(output["authfile"]["data"].encode("utf-8")),
                can_overwrite=True)
        # the config itself is always written, authfile only when present
        booth_env.config.write_raw(
            output["config"]["data"].encode("utf-8"), can_overwrite=True)
        env.report_processor.process(
            booth_reports.booth_config_accepted_by_node(
                name_list=[instance_name]))
    except RawFileError as e:
        report_processor.report(raw_file_error_report(e))
    except KeyError:
        # the remote node's response is missing expected keys
        raise LibraryError(reports.invalid_response_format(node_name))
    if report_processor.has_errors:
        raise LibraryError()
def config_text(env, instance_name=None, node_name=None):
    """
    Return booth configuration in raw format.

    string instance_name -- booth instance name
    string node_name -- get the config from specified node or local host if
        None
    """
    reporter = SimpleReportProcessor(env.report_processor)
    booth_instance = env.get_booth_env(instance_name)
    instance_name = booth_instance.instance_name
    # It does not make any sense for the cli to read a ghost file and send it
    # to lib so that the lib could return it unchanged to cli. Just use 'cat'.
    # When node_name is specified, using ghost files doesn't make any sense
    # either.
    _ensure_live_env(env, booth_instance)
    if node_name is None:
        try:
            return booth_instance.config.read_raw()
        except RawFileError as err:
            reporter.report(raw_file_error_report(err))
        if reporter.has_errors:
            raise LibraryError()
    com_cmd = BoothGetConfig(env.report_processor, instance_name)
    target = env.get_node_target_factory().get_target_from_hostname(node_name)
    com_cmd.set_targets([target])
    # pylint: disable=unsubscriptable-object
    # In general, pylint is right. And it cannot know in this case code is OK.
    # It is covered by tests.
    remote_data = run_and_raise(env.get_node_communicator(), com_cmd)[0][1]
    try:
        # TODO switch to new file transfer commands (not implemented yet)
        # which send and receive configs as bytes instead of strings
        return remote_data["config"]["data"].encode("utf-8")
    except KeyError:
        raise LibraryError(reports.invalid_response_format(node_name))
def remove_nodes(env, node_list, force_quorum_loss=False, skip_offline=False):
    """
    Remove nodes from a cluster.

    env LibraryEnvironment
    node_list iterable -- names of nodes to remove
    force_quorum_loss bool -- treat quorum loss as a warning if True
    skip_offline bool -- treat unreachable nodes as warnings if True
    """
    _ensure_live_env(env)  # raises if env is not live
    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()
    corosync_conf = env.get_corosync_conf()
    cluster_nodes_names = corosync_conf.get_nodes_names()

    # validations
    report_processor.report_list(
        config_validators.remove_nodes(
            node_list,
            corosync_conf.get_nodes(),
            corosync_conf.get_quorum_device_settings(),
        ))
    if report_processor.has_errors:
        # If there is an error, there is usually not much sense in doing other
        # validations:
        # - if there would be no node left in the cluster, it's pointless
        # to check for quorum loss or if at least one remaining node is online
        # - if only one node is being removed and it doesn't exist, it's again
        # pointless to check for other issues
        raise LibraryError()

    target_report_list, cluster_nodes_target_list = (
        target_factory.get_target_list_with_reports(
            cluster_nodes_names,
            skip_non_existing=skip_offline,
        ))
    # nodes without a known-hosts entry end up in unknown_nodes
    known_nodes = set([target.label for target in cluster_nodes_target_list])
    unknown_nodes = set(
        [name for name in cluster_nodes_names if name not in known_nodes])
    report_processor.report_list(target_report_list)

    com_cmd = GetOnlineTargets(
        report_processor,
        ignore_offline_targets=skip_offline,
    )
    com_cmd.set_targets(cluster_nodes_target_list)
    online_target_list = run_com(env.get_node_communicator(), com_cmd)
    offline_target_list = [
        target for target in cluster_nodes_target_list
        if target not in online_target_list
    ]
    staying_online_target_list = [
        target for target in online_target_list
        if target.label not in node_list
    ]
    targets_to_remove = [
        target for target in cluster_nodes_target_list
        if target.label in node_list
    ]
    if not staying_online_target_list:
        report_processor.report(
            reports.unable_to_connect_to_any_remaining_node())
        # If no remaining node is online, there is no point in checking quorum
        # loss or anything as we would just get errors.
        raise LibraryError()

    if skip_offline:
        # warn about remaining nodes we cannot reach (offline or unknown)
        staying_offline_nodes = (
            [
                target.label for target in offline_target_list
                if target.label not in node_list
            ]
            + [name for name in unknown_nodes if name not in node_list]
        )
        if staying_offline_nodes:
            report_processor.report(
                reports.unable_to_connect_to_all_remaining_node(
                    staying_offline_nodes))

    atb_has_to_be_enabled = sbd.atb_has_to_be_enabled(
        env.cmd_runner(), corosync_conf, -len(node_list))
    if atb_has_to_be_enabled:
        report_processor.report(
            reports.corosync_quorum_atb_will_be_enabled_due_to_sbd())
        # enabling ATB requires the cluster to be stopped, so verify corosync
        # is offline on the nodes that are staying
        com_cmd = CheckCorosyncOffline(
            report_processor,
            allow_skip_offline=False,
        )
        com_cmd.set_targets(staying_online_target_list)
        run_com(env.get_node_communicator(), com_cmd)
    else:
        # Check if removing the nodes would cause quorum loss. We ask the nodes
        # to be removed for their view of quorum. If they are all stopped or
        # not in a quorate partition, their removal cannot cause quorum loss.
        # That's why we ask them and not the remaining nodes.
        # example: 5-node cluster, 3 online nodes, removing one online node,
        # results in 4-node cluster with 2 online nodes => quorum lost
        # Check quorum loss only if ATB does not need to be enabled. If it is
        # required, cluster has to be turned off and therefore it loses quorum.
        forceable_report_creator = reports.get_problem_creator(
            report_codes.FORCE_QUORUM_LOSS, force_quorum_loss)
        com_cmd = cluster.GetQuorumStatus(report_processor)
        com_cmd.set_targets(targets_to_remove)
        failures, quorum_status = run_com(env.get_node_communicator(), com_cmd)
        if quorum_status:
            if quorum_status.stopping_nodes_cause_quorum_loss(node_list):
                report_processor.report(
                    forceable_report_creator(
                        reports.corosync_quorum_will_be_lost))
        elif failures or not targets_to_remove:
            # we could not determine quorum status; forceable error
            report_processor.report(
                forceable_report_creator(
                    reports.corosync_quorum_loss_unable_to_check,
                ))

    if report_processor.has_errors:
        raise LibraryError()

    # validations done
    unknown_to_remove = [name for name in unknown_nodes if name in node_list]
    if unknown_to_remove:
        report_processor.report(
            reports.nodes_to_remove_unreachable(unknown_to_remove))
    if targets_to_remove:
        # destroy cluster configuration on the nodes being removed; failures
        # are reported as warnings (DestroyWarnOnFailure)
        com_cmd = cluster.DestroyWarnOnFailure(report_processor)
        com_cmd.set_targets(targets_to_remove)
        run_and_raise(env.get_node_communicator(), com_cmd)

    corosync_conf.remove_nodes(node_list)
    if atb_has_to_be_enabled:
        corosync_conf.set_quorum_options(dict(auto_tie_breaker="1"))
    com_cmd = DistributeCorosyncConf(
        env.report_processor,
        corosync_conf.config.export(),
        allow_skip_offline=False,
    )
    com_cmd.set_targets(staying_online_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    com_cmd = ReloadCorosyncConf(env.report_processor)
    com_cmd.set_targets(staying_online_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # try to remove nodes from pcmk using crm_node -R <node> --force and if not
    # successful remove it directly from CIB file on all nodes in parallel
    com_cmd = RemoveNodesFromCib(env.report_processor, node_list)
    com_cmd.set_targets(staying_online_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)
def send_all_config_to_node(
    communicator, reporter, target_list,
    rewrite_existing=False, skip_wrong_config=False
):
    """
    Send all booth configs from default booth config directory and their
    authfiles to specified node.

    communicator -- NodeCommunicator
    reporter -- report processor
    target_list list -- list of targets to which configs should be delivered
    rewrite_existing -- if True rewrite existing file
    skip_wrong_config -- if True skip local configs that are unreadable
    """
    # TODO adapt to new file transfer framework once it is written
    # TODO the function is not modular enough - it raises LibraryError
    _reporter = SimpleReportProcessor(reporter)
    file_list = []
    for conf_file_name in sorted(config_files.get_all_configs_file_names()):
        config_file = FileInstance.for_booth_config(conf_file_name)
        try:
            booth_conf_data = config_file.raw_file.read()
            authfile_name, authfile_data, authfile_report_list = (
                config_files.get_authfile_name_and_data(
                    config_file.raw_to_facade(booth_conf_data)))
            _reporter.report_list(authfile_report_list)
            file_list.append({
                "name": conf_file_name,
                "data": booth_conf_data.decode("utf-8"),
                "is_authfile": False
            })
            if authfile_name and authfile_data:
                # authfile is binary, so it is transferred base64-encoded
                file_list.append({
                    "name": authfile_name,
                    "data": base64.b64encode(authfile_data).decode("utf-8"),
                    "is_authfile": True
                })
        except RawFileError as e:
            # unreadable config: error, downgraded to a warning when
            # skip_wrong_config is set
            _reporter.report(
                raw_file_error_report(
                    e,
                    force_code=report_codes.SKIP_UNREADABLE_CONFIG,
                    is_forced_or_warning=skip_wrong_config,
                ))
        except ParserErrorException as e:
            # unparsable config: same forceable handling as unreadable
            _reporter.report_list(
                config_file.parser_exception_to_report_list(
                    e,
                    force_code=report_codes.SKIP_UNREADABLE_CONFIG,
                    is_forced_or_warning=skip_wrong_config,
                ))
    if _reporter.has_errors:
        raise LibraryError()
    if not file_list:
        # no booth configs exist, nothing to be synced
        return
    _reporter.report(booth_reports.booth_config_distribution_started())
    com_cmd = BoothSaveFiles(
        _reporter, file_list, rewrite_existing=rewrite_existing)
    com_cmd.set_targets(target_list)
    run(communicator, com_cmd)
    if _reporter.has_errors:
        raise LibraryError()
def node_add_remote(
    env, node_name, node_addr, operations, meta_attributes,
    instance_attributes,
    skip_offline_nodes=False,
    allow_incomplete_distribution=False,
    allow_pacemaker_remote_service_fail=False,
    allow_invalid_operation=False,
    allow_invalid_instance_attributes=False,
    use_default_operations=True,
    wait=False,
):
    # pylint: disable=too-many-arguments, too-many-branches, too-many-locals
    """
    create an ocf:pacemaker:remote resource and use it as a remote node

    LibraryEnvironment env -- provides all for communication with externals
    string node_name -- the name of the new node
    mixed node_addr -- the address of the new node or None for default
    list of dict operations -- attributes for each entered operation
    dict meta_attributes -- attributes for primitive/meta_attributes
    dict instance_attributes -- attributes for primitive/instance_attributes
    bool skip_offline_nodes -- if True, ignore when some nodes are offline
    bool allow_incomplete_distribution -- if True, allow this command to
        finish successfully even if file distribution did not succeed
    bool allow_pacemaker_remote_service_fail -- if True, allow this command
        to finish successfully even if starting/enabling pacemaker_remote did
        not succeed
    bool allow_invalid_operation -- if True, allow to use operations that
        are not listed in a resource agent metadata
    bool allow_invalid_instance_attributes -- if True, allow to use instance
        attributes that are not listed in a resource agent metadata and allow
        to omit required instance_attributes
    bool use_default_operations -- if True, add operations specified in
        a resource agent metadata to the resource
    mixed wait -- a flag for controlling waiting for pacemaker idle mechanism
    """
    env.ensure_wait_satisfiable(wait)
    report_processor = SimpleReportProcessor(env.report_processor)
    cib = env.get_cib()
    id_provider = IdProvider(cib)
    if env.is_cib_live:
        corosync_conf = env.get_corosync_conf()
    else:
        corosync_conf = None
        report_processor.report(
            reports.corosync_node_conflict_check_skipped("not_live_cib"))
    existing_nodes_names, existing_nodes_addrs, report_list = (
        get_existing_nodes_names_addrs(corosync_conf, cib))
    if env.is_cib_live:
        # We just reported corosync checks are going to be skipped so we
        # shouldn't complain about errors related to corosync nodes
        report_processor.report_list(report_list)

    resource_agent = remote_node.get_agent(
        env.report_processor, env.cmd_runner())

    existing_target_list = []
    if env.is_cib_live:
        target_factory = env.get_node_target_factory()
        existing_target_list, new_target_list = _get_targets_for_add(
            target_factory, report_processor, existing_nodes_names,
            [node_name], skip_offline_nodes)
        new_target = new_target_list[0] if new_target_list else None
        # default node_addr to an address from known-hosts
        if node_addr is None:
            node_addr = new_target.first_addr if new_target else node_name
            report_processor.report(
                reports.using_known_host_address_for_host(
                    node_name, node_addr))
    else:
        # default node_addr to an address from known-hosts
        if node_addr is None:
            known_hosts = env.get_known_hosts([node_name])
            node_addr = known_hosts[0].dest.addr if known_hosts else node_name
            report_processor.report(
                reports.using_known_host_address_for_host(
                    node_name, node_addr))

    # validate inputs
    report_list = remote_node.validate_create(
        existing_nodes_names, existing_nodes_addrs, resource_agent, node_name,
        node_addr, instance_attributes)
    # validation + cib setup
    # TODO extract the validation to a separate function
    try:
        remote_resource_element = remote_node.create(
            env.report_processor,
            resource_agent,
            get_resources(cib),
            id_provider,
            node_addr,
            node_name,
            operations,
            meta_attributes,
            instance_attributes,
            allow_invalid_operation,
            allow_invalid_instance_attributes,
            use_default_operations,
        )
    except LibraryError as e:
        # Check unique id conflict with check against nodes. Until validation
        # and resource create are separated, we need to deduplicate the
        # reports after the fact: keep each ID_ALREADY_EXISTS /
        # ..._VALUE_NOT_UNIQUE report only once per conflicting id.
        already_exists = []
        unified_report_list = []
        for report in report_list + list(e.args):
            if report.code not in (
                report_codes.ID_ALREADY_EXISTS,
                report_codes.RESOURCE_INSTANCE_ATTR_VALUE_NOT_UNIQUE,
            ):
                unified_report_list.append(report)
            elif (
                "id" in report.info
                and report.info["id"] not in already_exists
            ):
                unified_report_list.append(report)
                already_exists.append(report.info["id"])
        report_list = unified_report_list

    report_processor.report_list(report_list)
    # NOTE(review): when remote_node.create raised, the merged report_list is
    # expected to contain at least one error, so this raise also prevents the
    # use of the unbound remote_resource_element below — TODO confirm.
    if report_processor.has_errors:
        raise LibraryError()

    # everything validated, let's set it up
    if env.is_cib_live:
        _prepare_pacemaker_remote_environment(
            env,
            report_processor,
            existing_target_list,
            new_target,
            node_name,
            skip_offline_nodes,
            allow_incomplete_distribution,
            allow_pacemaker_remote_service_fail,
        )
    else:
        report_processor.report_list(
            _reports_skip_new_node(node_name, "not_live_cib"))

    env.push_cib(wait=wait)
    if wait:
        _ensure_resource_running(env, remote_resource_element.attrib["id"])
def create_in_cluster(
    env, ip, instance_name=None, allow_absent_resource_agent=False
):
    """
    Create group with ip resource and booth resource

    LibraryEnvironment env -- provides all for communication with externals
    string ip -- floating ip address for the operation of the booth
    string instance_name -- booth instance name
    bool allow_absent_resource_agent -- allowing creating booth resource even
        if its agent is not installed
    """
    report_processor = SimpleReportProcessor(env.report_processor)
    booth_env = env.get_booth_env(instance_name)
    # Booth config path goes to CIB. Working with a mocked booth configs would
    # not work correctly as the path would point to a mock file (the path to a
    # mock file is unknown to us in the lib anyway)
    # It makes sense to work with a mocked CIB, though. Users can do other
    # changes to the CIB and push them to the cluster at once.
    _ensure_live_booth_env(booth_env)
    resources_section = get_resources(env.get_cib())
    id_provider = IdProvider(resources_section)
    instance_name = booth_env.instance_name

    # validate
    if resource.find_for_config(resources_section, booth_env.config_path):
        report_processor.report(
            booth_reports.booth_already_in_cib(instance_name))
    # verify the config exists and is readable
    try:
        booth_env.config.raw_file.read()
    except RawFileError as e:
        report_processor.report(raw_file_error_report(e))
    if report_processor.has_errors:
        raise LibraryError()
    # validation done

    # partials bind the shared context so the two resources below are
    # created with identical settings
    create_id = partial(
        resource.create_resource_id, resources_section, instance_name)
    get_agent = partial(
        find_valid_resource_agent_by_name,
        env.report_processor,
        env.cmd_runner(),
        allowed_absent=allow_absent_resource_agent)
    create_primitive = partial(
        primitive.create,
        env.report_processor,
        resources_section,
        id_provider)
    into_booth_group = partial(
        group.place_resource,
        group.provide_group(resources_section, create_id("group")),
    )

    # the group holds the floating IP and the booth-site resource together
    into_booth_group(
        create_primitive(
            create_id("ip"),
            get_agent("ocf:heartbeat:IPaddr2"),
            instance_attributes={"ip": ip},
        ))
    into_booth_group(
        create_primitive(
            create_id("service"),
            get_agent("ocf:pacemaker:booth-site"),
            instance_attributes={"config": booth_env.config_path},
        ))
    env.push_cib()
def node_add_guest(
    env, node_name, resource_id, options,
    skip_offline_nodes=False,
    allow_incomplete_distribution=False,
    allow_pacemaker_remote_service_fail=False,
    wait=False,
):
    # pylint: disable=too-many-locals
    """
    Make a guest node from the specified resource

    LibraryEnvironment env -- provides all for communication with externals
    string node_name -- name of the guest node
    string resource_id -- specifies resource that should become a guest node
    dict options -- guest node options (remote-port, remote-addr,
        remote-connect-timeout); NOTE: may be mutated here — a defaulted
        "remote-addr" is written back into it
    bool skip_offline_nodes -- if True, ignore when some nodes are offline
    bool allow_incomplete_distribution -- if True, allow this command to
        finish successfully even if file distribution did not succeed
    bool allow_pacemaker_remote_service_fail -- if True, allow this command
        to finish successfully even if starting/enabling pacemaker_remote did
        not succeed
    mixed wait -- a flag for controlling waiting for pacemaker idle mechanism
    """
    env.ensure_wait_satisfiable(wait)
    report_processor = SimpleReportProcessor(env.report_processor)
    cib = env.get_cib()
    id_provider = IdProvider(cib)
    if env.is_cib_live:
        corosync_conf = env.get_corosync_conf()
    else:
        corosync_conf = None
        report_processor.report(
            reports.corosync_node_conflict_check_skipped("not_live_cib"))
    existing_nodes_names, existing_nodes_addrs, report_list = (
        get_existing_nodes_names_addrs(corosync_conf, cib))
    if env.is_cib_live:
        # We just reported corosync checks are going to be skipped so we
        # shouldn't complain about errors related to corosync nodes
        report_processor.report_list(report_list)

    existing_target_list = []
    if env.is_cib_live:
        target_factory = env.get_node_target_factory()
        existing_target_list, new_target_list = _get_targets_for_add(
            target_factory, report_processor, existing_nodes_names,
            [node_name], skip_offline_nodes)
        new_target = new_target_list[0] if new_target_list else None
        # default remote-addr to an address from known-hosts
        if "remote-addr" not in options or options["remote-addr"] is None:
            new_addr = new_target.first_addr if new_target else node_name
            options["remote-addr"] = new_addr
            report_processor.report(
                reports.using_known_host_address_for_host(
                    node_name, new_addr))
    else:
        # default remote-addr to an address from known-hosts
        if "remote-addr" not in options or options["remote-addr"] is None:
            known_hosts = env.get_known_hosts([node_name])
            new_addr = known_hosts[0].dest.addr if known_hosts else node_name
            options["remote-addr"] = new_addr
            report_processor.report(
                reports.using_known_host_address_for_host(
                    node_name, new_addr))

    # validate inputs
    report_list = guest_node.validate_set_as_guest(
        cib, existing_nodes_names, existing_nodes_addrs, node_name, options)
    searcher = ElementSearcher(primitive.TAG, resource_id, get_resources(cib))
    if searcher.element_found():
        resource_element = searcher.get_element()
        report_list.extend(guest_node.validate_is_not_guest(resource_element))
    else:
        report_list.extend(searcher.get_errors())
    report_processor.report_list(report_list)
    # also guards the use of resource_element below: when the searcher did
    # not find the element, its errors are in report_list and we raise here
    if report_processor.has_errors:
        raise LibraryError()

    # everything validated, let's set it up
    guest_node.set_as_guest(
        resource_element,
        id_provider,
        node_name,
        options.get("remote-addr", None),
        options.get("remote-port", None),
        options.get("remote-connect-timeout", None),
    )
    if env.is_cib_live:
        # new_target is bound only in the live branch above, matching this one
        _prepare_pacemaker_remote_environment(
            env,
            report_processor,
            existing_target_list,
            new_target,
            node_name,
            skip_offline_nodes,
            allow_incomplete_distribution,
            allow_pacemaker_remote_service_fail,
        )
    else:
        report_processor.report_list(
            _reports_skip_new_node(node_name, "not_live_cib"))
    env.push_cib(wait=wait)
    if wait:
        _ensure_resource_running(env, resource_id)
def setup(
    env, cluster_name, nodes,
    transport_type=None, transport_options=None, link_list=None,
    compression_options=None, crypto_options=None, totem_options=None,
    quorum_options=None, wait=False, start=False, enable=False,
    force=False, force_unresolvable=False,
):
    """
    Set up cluster on specified nodes.

    Validation of the inputs is done here. Possible existing clusters are
    destroyed (when using force). Authkey files for corosync and pacemaker,
    known hosts and newly generated corosync.conf are distributed to all
    nodes.
    Raise LibraryError on any error.

    env LibraryEnvironment
    cluster_name string -- name of a cluster to set up
    nodes list -- list of dicts which represents node.
        Supported keys are: name (required), addrs
    transport_type string -- transport type of a cluster
    transport_options dict -- transport specific options
    link_list list of dict -- list of links, depends on transport_type
    compression_options dict -- only available for transport_type == 'knet'.
        In corosync.conf they are prefixed 'knet_compression_'
    crypto_options dict -- only available for transport_type == 'knet'. In
        corosync.conf they are prefixed 'crypto_'
    totem_options dict -- options of section 'totem' in corosync.conf
    quorum_options dict -- options of section 'quorum' in corosync.conf
    wait -- specifies if command should try to wait for cluster to start up.
        Has no effect if start is False. If set to False command will not wait
        for cluster to start. If None command will wait for some default
        timeout. If int wait set timeout to int value of seconds.
    start bool -- if True start cluster when it is set up
    enable bool -- if True enable cluster when it is set up
    force bool -- if True some validations errors are treated as warnings
    force_unresolvable bool -- if True not resolvable addresses of nodes are
        treated as warnings
    """
    _ensure_live_env(env) # raises if env is not live

    # normalize optional collection arguments to empty collections
    transport_options = transport_options or {}
    link_list = link_list or []
    compression_options = compression_options or {}
    crypto_options = crypto_options or {}
    totem_options = totem_options or {}
    quorum_options = quorum_options or {}
    nodes = [_normalize_dict(node, {"addrs"}) for node in nodes]

    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()

    # Get targets for all nodes and report unknown (== not-authorized) nodes.
    # If a node doesn't contain the 'name' key, validation of inputs reports
    # it. That means we don't report missing names but cannot rely on them
    # being present either.
    target_report_list, target_list = (
        target_factory.get_target_list_with_reports(
            [node["name"] for node in nodes if "name" in node],
            allow_skip=False,
        ))
    report_processor.report_list(target_report_list)

    # Use an address defined in known-hosts for each node with no addresses
    # specified. This allows users not to specify node addresses at all which
    # simplifies the whole cluster setup command / form significantly.
    addrs_defaulter = _get_addrs_defaulter(
        report_processor,
        {target.label: target for target in target_list})
    nodes = [
        _set_defaults_in_dict(node, {"addrs": addrs_defaulter})
        for node in nodes
    ]

    # Validate inputs.
    report_processor.report_list(config_validators.create(
        cluster_name, nodes, transport_type,
        force_unresolvable=force_unresolvable))
    if transport_type in corosync_constants.TRANSPORTS_KNET:
        # knet supports multiple links; the link count is bounded by the
        # highest number of addresses any single node has
        max_link_number = max(
            [len(node["addrs"]) for node in nodes], default=0)
        report_processor.report_list(
            config_validators.create_transport_knet(
                transport_options, compression_options, crypto_options)
            +
            config_validators.create_link_list_knet(
                link_list, max_link_number)
        )
    elif transport_type in corosync_constants.TRANSPORTS_UDP:
        report_processor.report_list(
            config_validators.create_transport_udp(
                transport_options, compression_options, crypto_options)
            +
            config_validators.create_link_list_udp(link_list))
    report_processor.report_list(
        config_validators.create_totem(totem_options)
        +
        # We are creating the config and we know there is no qdevice in it.
        config_validators.create_quorum_options(quorum_options, False))

    # Validate flags
    wait_timeout = _get_validated_wait_timeout(report_processor, wait, start)

    # Validate the nodes
    com_cmd = GetHostInfo(report_processor)
    com_cmd.set_targets(target_list)
    report_processor.report_list(
        _host_check_cluster_setup(
            run_com(env.get_node_communicator(), com_cmd), force))

    if report_processor.has_errors:
        raise LibraryError()
    # Validation done. If errors occurred, an exception has been raised and we
    # don't get below this line.

    # Destroy cluster on all nodes.
    com_cmd = cluster.Destroy(env.report_processor)
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # Distribute auth tokens.
    com_cmd = UpdateKnownHosts(
        env.report_processor,
        known_hosts_to_add=env.get_known_hosts(
            [target.label for target in target_list]),
        known_hosts_to_remove=[],
    )
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # Distribute configuration files except corosync.conf. Sending
    # corosync.conf serves as a "commit" as its presence on a node marks the
    # node as a part of a cluster.
    corosync_authkey = generate_binary_key(random_bytes_count=128)
    pcmk_authkey = generate_binary_key(random_bytes_count=128)
    actions = {}
    actions.update(
        node_communication_format.corosync_authkey_file(corosync_authkey))
    actions.update(node_communication_format.pcmk_authkey_file(pcmk_authkey))
    com_cmd = DistributeFilesWithoutForces(env.report_processor, actions)
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # TODO This should be in the previous call but so far we don't have a call
    # which allows to save and delete files at the same time.
    com_cmd = RemoveFilesWithoutForces(
        env.report_processor,
        {"pcsd settings": {"type": "pcsd_settings"}},
    )
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # Distribute and reload pcsd SSL certificate
    report_processor.report(
        reports.pcsd_ssl_cert_and_key_distribution_started(
            [target.label for target in target_list]))
    ssl_key_raw = ssl.generate_key()
    ssl_key = ssl.dump_key(ssl_key_raw)
    # the certificate is issued for the first node's label
    ssl_cert = ssl.dump_cert(
        ssl.generate_cert(ssl_key_raw, target_list[0].label))
    com_cmd = SendPcsdSslCertAndKey(env.report_processor, ssl_cert, ssl_key)
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # Create and distribute corosync.conf. Once a node saves corosync.conf it
    # is considered to be in a cluster.
    corosync_conf = config_facade.ConfigFacade.create(
        cluster_name, nodes, transport_type)
    corosync_conf.set_totem_options(totem_options)
    corosync_conf.set_quorum_options(quorum_options)
    corosync_conf.create_link_list(link_list)
    if transport_type in corosync_constants.TRANSPORTS_KNET:
        corosync_conf.set_transport_knet_options(
            transport_options, compression_options, crypto_options)
    elif transport_type in corosync_constants.TRANSPORTS_UDP:
        corosync_conf.set_transport_udp_options(transport_options)
    com_cmd = DistributeFilesWithoutForces(
        env.report_processor,
        node_communication_format.corosync_conf_file(
            corosync_conf.config.export()),
    )
    com_cmd.set_targets(target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    env.report_processor.process(reports.cluster_setup_success())

    # Optionally enable and start cluster services.
    if enable:
        com_cmd = EnableCluster(env.report_processor)
        com_cmd.set_targets(target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)
    if start:
        _start_cluster(
            env.communicator_factory,
            env.report_processor,
            target_list,
            wait_timeout=wait_timeout,
        )
def node_add_guest(
    env, node_name, resource_id, options,
    skip_offline_nodes=False,
    allow_incomplete_distribution=False,
    allow_pacemaker_remote_service_fail=False,
    wait=False,
):
    # pylint: disable=too-many-locals
    """
    Make a guest node from the specified resource

    LibraryEnvironment env -- provides all for communication with externals
    string node_name -- name of the guest node
    string resource_id -- specifies resource that should become a guest node
    dict options -- guest node options (remote-port, remote-addr,
        remote-connect-timeout)
    bool skip_offline_nodes -- if True, ignore when some nodes are offline
    bool allow_incomplete_distribution -- if True, allow this command to
        finish successfully even if file distribution did not succeed
    bool allow_pacemaker_remote_service_fail -- if True, allow this command to
        finish successfully even if starting/enabling pacemaker_remote did not
        succeed
    mixed wait -- a flag for controlling waiting for pacemaker idle mechanism

    Raises LibraryError when validation fails or the environment reports
    errors.
    """
    # NOTE(review): this definition appears to duplicate an earlier
    # node_add_guest in this file (only the placement of the
    # target_factory lookup differs); this later definition is the one in
    # effect at import time — confirm the duplication is intentional.
    env.ensure_wait_satisfiable(wait)
    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()
    cib = env.get_cib()
    id_provider = IdProvider(cib)
    if env.is_cib_live:
        corosync_conf = env.get_corosync_conf()
    else:
        # no live cluster to ask — conflict checks against corosync nodes are
        # skipped and the user is warned about it
        corosync_conf = None
        report_processor.report(
            reports.corosync_node_conflict_check_skipped("not_live_cib")
        )
    existing_nodes_names, existing_nodes_addrs, report_list = (
        get_existing_nodes_names_addrs(corosync_conf, cib)
    )
    if env.is_cib_live:
        # We just reported corosync checks are going to be skipped so we
        # shouldn't complain about errors related to corosync nodes
        report_processor.report_list(report_list)

    existing_target_list = []
    if env.is_cib_live:
        existing_target_list, new_target_list = _get_targets_for_add(
            target_factory, report_processor, existing_nodes_names,
            [node_name], skip_offline_nodes
        )
        new_target = new_target_list[0] if new_target_list else None
        # default remote-addr to an address from known-hosts
        if "remote-addr" not in options or options["remote-addr"] is None:
            new_addr = new_target.first_addr if new_target else node_name
            options["remote-addr"] = new_addr
            report_processor.report(
                reports.using_known_host_address_for_host(node_name, new_addr)
            )
    else:
        # default remote-addr to an address from known-hosts
        if "remote-addr" not in options or options["remote-addr"] is None:
            known_hosts = env.get_known_hosts([node_name])
            new_addr = known_hosts[0].dest.addr if known_hosts else node_name
            options["remote-addr"] = new_addr
            report_processor.report(
                reports.using_known_host_address_for_host(node_name, new_addr)
            )

    # validate inputs
    report_list = guest_node.validate_set_as_guest(
        cib, existing_nodes_names, existing_nodes_addrs, node_name, options
    )
    searcher = ElementSearcher(primitive.TAG, resource_id, get_resources(cib))
    if searcher.element_found():
        resource_element = searcher.get_element()
        report_list.extend(guest_node.validate_is_not_guest(resource_element))
    else:
        report_list.extend(searcher.get_errors())
    report_processor.report_list(report_list)

    if report_processor.has_errors:
        raise LibraryError()

    # everything validated, let's set it up
    # (resource_element is guaranteed to be bound here: if the searcher did
    # not find it, its errors were reported and we raised above)
    guest_node.set_as_guest(
        resource_element,
        id_provider,
        node_name,
        options.get("remote-addr", None),
        options.get("remote-port", None),
        options.get("remote-connect-timeout", None),
    )

    if env.is_cib_live:
        # distribute pacemaker authkey and start/enable pacemaker_remote on
        # the new node
        _prepare_pacemaker_remote_environment(
            env,
            report_processor,
            existing_target_list,
            new_target,
            node_name,
            skip_offline_nodes,
            allow_incomplete_distribution,
            allow_pacemaker_remote_service_fail,
        )
    else:
        report_processor.report_list(
            _reports_skip_new_node(node_name, "not_live_cib")
        )

    env.push_cib(wait=wait)
    if wait:
        _ensure_resource_running(env, resource_id)
def config_destroy(env, instance_name=None, ignore_config_load_problems=False):
    # pylint: disable=too-many-branches
    """
    Remove booth configuration files.

    LibraryEnvironment env
    string instance_name -- booth instance name
    bool ignore_config_load_problems -- delete as much as possible when unable
        to read booth configs for the given booth instance

    Raises LibraryError when the config is still in use or when a file
    operation fails (unless forced via ignore_config_load_problems).
    """
    report_processor = SimpleReportProcessor(env.report_processor)
    booth_env = env.get_booth_env(instance_name)
    # use the resolved instance name (defaulting happens in get_booth_env)
    instance_name = booth_env.instance_name
    _ensure_live_env(env, booth_env)

    # refuse to destroy a config which is still referenced anywhere
    # TODO use constants in reports
    config_is_used = partial(booth_reports.booth_config_is_used, instance_name)
    if resource.find_for_config(
        get_resources(env.get_cib()),
        booth_env.config_path,
    ):
        report_processor.report(config_is_used("in cluster resource"))
    # Only systemd is currently supported. Initd does not supports multiple
    # instances (here specified by name)
    if external.is_systemctl():
        if external.is_service_running(
            env.cmd_runner(), "booth", instance_name
        ):
            report_processor.report(config_is_used("(running in systemd)"))
        if external.is_service_enabled(
            env.cmd_runner(), "booth", instance_name
        ):
            report_processor.report(config_is_used("(enabled in systemd)"))
    if report_processor.has_errors:
        raise LibraryError()

    try:
        # pre-initialize so a read/parse failure leaves authfile_path as None
        # and key-file removal below is skipped
        authfile_path = None
        booth_conf = booth_env.config.read_to_facade()
        authfile_path = booth_conf.get_authfile()
    except RawFileError as e:
        report_processor.report(
            raw_file_error_report(
                e,
                force_code=report_codes.FORCE_BOOTH_DESTROY,
                is_forced_or_warning=ignore_config_load_problems,
            ))
    except ParserErrorException as e:
        report_processor.report_list(
            booth_env.config.parser_exception_to_report_list(
                e,
                force_code=report_codes.FORCE_BOOTH_DESTROY,
                is_forced_or_warning=ignore_config_load_problems,
            ))
    if report_processor.has_errors:
        raise LibraryError()

    if authfile_path:
        authfile_dir, authfile_name = os.path.split(authfile_path)
        # only remove key files living in the standard booth config dir
        if (authfile_dir == settings.booth_config_dir) and authfile_name:
            try:
                key_file = FileInstance.for_booth_key(authfile_name)
                key_file.raw_file.remove(fail_if_file_not_found=False)
            except RawFileError as e:
                report_processor.report(
                    raw_file_error_report(
                        e,
                        force_code=report_codes.FORCE_BOOTH_DESTROY,
                        is_forced_or_warning=ignore_config_load_problems,
                    ))
        else:
            report_processor.report(
                booth_reports.booth_unsupported_file_location(
                    authfile_path,
                    settings.booth_config_dir,
                    file_type_codes.BOOTH_KEY,
                ))
    if report_processor.has_errors:
        raise LibraryError()

    # finally remove the booth config file itself
    try:
        booth_env.config.raw_file.remove()
    except RawFileError as e:
        report_processor.report(raw_file_error_report(e))

    if report_processor.has_errors:
        raise LibraryError()
def add_nodes(
    env, nodes, wait=False, start=False, enable=False, force=False,
    force_unresolvable=False, skip_offline_nodes=False,
    no_watchdog_validation=False,
):
    # pylint: disable=too-many-locals
    """
    Add specified nodes to the local cluster.

    Raise LibraryError on any error.

    env LibraryEnvironment
    nodes list -- list of dicts which represents node.
        Supported keys are: name (required), addrs (list), devices (list),
        watchdog
    wait -- specifies if command should try to wait for cluster to start up.
        Has no effect if start is False. If set to False command will not wait
        for cluster to start. If None command will wait for some default
        timeout. If int wait set timeout to int value of seconds.
    start bool -- if True start cluster when it is set up
    enable bool -- if True enable cluster when it is set up
    force bool -- if True some validations errors are treated as warnings
    force_unresolvable bool -- if True not resolvable addresses of nodes are
        treated as warnings
    skip_offline_nodes bool -- if True non fatal connection failures to other
        hosts are treated as warnings
    no_watchdog_validation bool -- if True do not validate specified watchdogs
        on remote hosts
    """
    _ensure_live_env(env) # raises if env is not live

    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()
    is_sbd_enabled = sbd.is_sbd_enabled(env.cmd_runner())
    corosync_conf = env.get_corosync_conf()
    cluster_nodes_names = corosync_conf.get_nodes_names()
    corosync_node_options = {"name", "addrs"}
    sbd_node_options = {"devices", "watchdog"}

    # normalize list-valued node options; SBD options only matter when SBD
    # is enabled on this cluster
    keys_to_normalize = {"addrs"}
    if is_sbd_enabled:
        keys_to_normalize |= sbd_node_options
    new_nodes = [_normalize_dict(node, keys_to_normalize) for node in nodes]

    # get targets for existing nodes
    target_report_list, cluster_nodes_target_list = (
        target_factory.get_target_list_with_reports(
            cluster_nodes_names,
            skip_non_existing=skip_offline_nodes,
        ))
    report_processor.report_list(target_report_list)
    # get a target for qnetd if needed
    qdevice_model, qdevice_model_options, _, _ = (
        corosync_conf.get_quorum_device_settings())
    if qdevice_model == "net":
        # NOTE(review): qnetd_target stays unbound when HostNotFound is
        # raised; the later qdevice setup relies on the reported error
        # aborting the command before qnetd_target is used — confirm.
        try:
            qnetd_target = target_factory.get_target(
                qdevice_model_options["host"])
        except HostNotFound:
            report_processor.report(
                reports.host_not_found([qdevice_model_options["host"]]))

    # Get targets for new nodes and report unknown (== not-authorized) nodes.
    # If a node doesn't contain the 'name' key, validation of inputs reports
    # it. That means we don't report missing names but cannot rely on them
    # being present either.
    target_report_list, new_nodes_target_list = (
        target_factory.get_target_list_with_reports(
            [node["name"] for node in new_nodes if "name" in node],
            allow_skip=False,
        ))
    report_processor.report_list(target_report_list)

    # Set default values for not-specified node options.
    # Use an address defined in known-hosts for each node with no addresses
    # specified. This allows users not to specify node addresses at all which
    # simplifies the whole node add command / form significantly.
    new_nodes_target_dict = {
        target.label: target for target in new_nodes_target_list
    }
    addrs_defaulter = _get_addrs_defaulter(
        report_processor, new_nodes_target_dict)
    new_nodes_defaulters = {"addrs": addrs_defaulter}
    if is_sbd_enabled:
        watchdog_defaulter = _get_watchdog_defaulter(
            report_processor, new_nodes_target_dict)
        new_nodes_defaulters["devices"] = lambda _: []
        new_nodes_defaulters["watchdog"] = watchdog_defaulter
    new_nodes = [
        _set_defaults_in_dict(node, new_nodes_defaulters)
        for node in new_nodes
    ]
    new_nodes_dict = {
        node["name"]: node for node in new_nodes if "name" in node
    }

    # Validate inputs - node options names
    # We do not want to make corosync validators know about SBD options and
    # vice versa. Therefore corosync and SBD validators get only valid
    # corosync and SBD options respectively, and we need to check for any
    # surplus options here.
    report_processor.report_list(
        validate_names_in(
            corosync_node_options | sbd_node_options,
            set([
                option
                for node_options in [node.keys() for node in new_nodes]
                for option in node_options
            ]),
            option_type="node"))

    # Validate inputs - corosync part
    try:
        cib = env.get_cib()
        cib_nodes = get_remote_nodes(cib) + get_guest_nodes(cib)
    except LibraryError:
        # the CIB could not be loaded; can be forced to proceed with an
        # empty remote/guest node list
        cib_nodes = []
        report_processor.report(
            reports.get_problem_creator(
                report_codes.FORCE_LOAD_NODES_FROM_CIB,
                force)(reports.cib_load_error_get_nodes_for_validation))
    # corosync validator rejects non-corosync keys
    new_nodes_corosync = [{
        key: node[key] for key in corosync_node_options if key in node
    } for node in new_nodes]
    report_processor.report_list(
        config_validators.add_nodes(
            new_nodes_corosync,
            corosync_conf.get_nodes(),
            cib_nodes,
            force_unresolvable=force_unresolvable))

    # Validate inputs - SBD part
    if is_sbd_enabled:
        report_processor.report_list(
            sbd.validate_new_nodes_devices({
                node["name"]: node["devices"]
                for node in new_nodes if "name" in node
            }))
    else:
        # SBD options make no sense when SBD is disabled — report them
        for node in new_nodes:
            sbd_options = sbd_node_options.intersection(node.keys())
            if sbd_options and "name" in node:
                report_processor.report(
                    reports.sbd_not_used_cannot_set_sbd_options(
                        sbd_options, node["name"]))

    # Validate inputs - flags part
    wait_timeout = _get_validated_wait_timeout(report_processor, wait, start)

    # Get online cluster nodes
    # This is the only call in which we accept skip_offline_nodes option for
    # the cluster nodes. In all the other actions we communicate only with the
    # online nodes. This allows us to simplify code as any communication issue
    # is considered an error, ends the command processing and is not possible
    # to skip it by skip_offline_nodes. We do not have to care about a
    # situation when a communication command cannot connect to some nodes and
    # then the next command can connect but fails due to the previous one did
    # not succeed.
    online_cluster_target_list = []
    if cluster_nodes_target_list:
        com_cmd = GetOnlineTargets(
            report_processor, ignore_offline_targets=skip_offline_nodes,
        )
        com_cmd.set_targets(cluster_nodes_target_list)
        online_cluster_target_list = run_com(
            env.get_node_communicator(), com_cmd)
        offline_cluster_target_list = [
            target for target in cluster_nodes_target_list
            if target not in online_cluster_target_list
        ]
        if len(online_cluster_target_list) == 0:
            report_processor.report(
                reports.unable_to_perform_operation_on_any_node())
        elif offline_cluster_target_list and skip_offline_nodes:
            # TODO: report (warn) how to fix offline nodes when they come
            # online
            # report_processor.report(None)
            pass

    # Validate existing cluster nodes status
    atb_has_to_be_enabled = sbd.atb_has_to_be_enabled(
        env.cmd_runner(), corosync_conf, len(new_nodes))
    if atb_has_to_be_enabled:
        report_processor.report(
            reports.corosync_quorum_atb_will_be_enabled_due_to_sbd())
        if online_cluster_target_list:
            com_cmd = CheckCorosyncOffline(
                report_processor, allow_skip_offline=False,
            )
            com_cmd.set_targets(online_cluster_target_list)
            run_com(env.get_node_communicator(), com_cmd)

    # Validate new nodes. All new nodes have to be online.
    com_cmd = GetHostInfo(report_processor)
    com_cmd.set_targets(new_nodes_target_list)
    report_processor.report_list(
        _host_check_cluster_setup(
            run_com(env.get_node_communicator(), com_cmd),
            force,
            # version of services may not be the same across the existing
            # cluster nodes, so it's not easy to make this check properly
            check_services_versions=False,
        ))

    # Validate SBD on new nodes
    if is_sbd_enabled:
        if no_watchdog_validation:
            report_processor.report(
                reports.sbd_watchdog_validation_inactive())
        com_cmd = CheckSbd(report_processor)
        for new_node_target in new_nodes_target_list:
            new_node = new_nodes_dict[new_node_target.label]
            # Do not send watchdog if validation is turned off. Listing of
            # available watchdogs in pcsd may restart the machine in some
            # corner cases.
            com_cmd.add_request(
                new_node_target,
                watchdog=(
                    "" if no_watchdog_validation else new_node["watchdog"]
                ),
                device_list=new_node["devices"],
            )
        run_com(env.get_node_communicator(), com_cmd)

    if report_processor.has_errors:
        raise LibraryError()
    # Validation done. If errors occured, an exception has been raised and we
    # don't get below this line.

    # First set up everything else than corosync. Once the new nodes are
    # present in corosync.conf, they're considered part of a cluster and the
    # node add command cannot be run again. So we need to minimize the amout
    # of actions (and therefore possible failures) after adding the nodes to
    # corosync.

    # distribute auth tokens of all cluster nodes (including the new ones) to
    # all new nodes
    com_cmd = UpdateKnownHosts(
        env.report_processor,
        known_hosts_to_add=env.get_known_hosts(
            cluster_nodes_names + list(new_nodes_dict.keys())),
        known_hosts_to_remove=[],
    )
    com_cmd.set_targets(new_nodes_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # qdevice setup
    if qdevice_model == "net":
        qdevice_net.set_up_client_certificates(
            env.cmd_runner(),
            env.report_processor,
            env.communicator_factory,
            qnetd_target,
            corosync_conf.get_cluster_name(),
            new_nodes_target_list,
            # we don't want to allow skiping offline nodes which are being
            # added, otherwise qdevice will not work properly
            skip_offline_nodes=False,
            allow_skip_offline=False)

    # sbd setup
    if is_sbd_enabled:
        sbd_cfg = environment_file_to_dict(sbd.get_local_sbd_config())
        com_cmd = SetSbdConfig(env.report_processor)
        for new_node_target in new_nodes_target_list:
            new_node = new_nodes_dict[new_node_target.label]
            com_cmd.add_request(
                new_node_target,
                sbd.create_sbd_config(
                    sbd_cfg,
                    new_node["name"],
                    watchdog=new_node["watchdog"],
                    device_list=new_node["devices"],
                ))
        run_and_raise(env.get_node_communicator(), com_cmd)

        com_cmd = EnableSbdService(env.report_processor)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)
    else:
        com_cmd = DisableSbdService(env.report_processor)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)

    # booth setup
    booth_sync.send_all_config_to_node(
        env.get_node_communicator(),
        env.report_processor,
        new_nodes_target_list,
        rewrite_existing=force,
        skip_wrong_config=force,
    )

    # distribute corosync and pacemaker authkeys
    # NOTE(review): the three open(...).read() calls below never close their
    # file handles explicitly (CPython closes them on GC); consider
    # 'with open(...)' as used for the SSL cert/key reads further down.
    files_action = {}
    forceable_io_error_creator = reports.get_problem_creator(
        report_codes.SKIP_FILE_DISTRIBUTION_ERRORS, force)
    if os.path.isfile(settings.corosync_authkey_file):
        try:
            files_action.update(
                node_communication_format.corosync_authkey_file(
                    open(settings.corosync_authkey_file, "rb").read()))
        except EnvironmentError as e:
            report_processor.report(
                forceable_io_error_creator(
                    reports.file_io_error,
                    env_file_role_codes.COROSYNC_AUTHKEY,
                    file_path=settings.corosync_authkey_file,
                    operation="read",
                    reason=format_environment_error(e)))

    if os.path.isfile(settings.pacemaker_authkey_file):
        try:
            files_action.update(
                node_communication_format.pcmk_authkey_file(
                    open(settings.pacemaker_authkey_file, "rb").read()))
        except EnvironmentError as e:
            report_processor.report(
                forceable_io_error_creator(
                    reports.file_io_error,
                    env_file_role_codes.PACEMAKER_AUTHKEY,
                    file_path=settings.pacemaker_authkey_file,
                    operation="read",
                    reason=format_environment_error(e)))

    # pcs_settings.conf was previously synced using pcsdcli send_local_configs.
    # This has been changed temporarily until new system for distribution and
    # syncronization of configs will be introduced.
    if os.path.isfile(settings.pcsd_settings_conf_location):
        try:
            files_action.update(
                node_communication_format.pcs_settings_conf_file(
                    open(settings.pcsd_settings_conf_location, "r").read()))
        except EnvironmentError as e:
            report_processor.report(
                forceable_io_error_creator(
                    reports.file_io_error,
                    env_file_role_codes.PCS_SETTINGS_CONF,
                    file_path=settings.pcsd_settings_conf_location,
                    operation="read",
                    reason=format_environment_error(e)))

    # stop here if one of the files could not be loaded and it was not forced
    if report_processor.has_errors:
        raise LibraryError()

    if files_action:
        com_cmd = DistributeFilesWithoutForces(
            env.report_processor, files_action)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)

    # Distribute and reload pcsd SSL certificate
    report_processor.report(
        reports.pcsd_ssl_cert_and_key_distribution_started(
            [target.label for target in new_nodes_target_list]))

    try:
        with open(settings.pcsd_cert_location, "r") as f:
            ssl_cert = f.read()
    except EnvironmentError as e:
        report_processor.report(
            reports.file_io_error(
                env_file_role_codes.PCSD_SSL_CERT,
                file_path=settings.pcsd_cert_location,
                reason=format_environment_error(e),
                operation="read",
            ))
    try:
        with open(settings.pcsd_key_location, "r") as f:
            ssl_key = f.read()
    except EnvironmentError as e:
        report_processor.report(
            reports.file_io_error(
                env_file_role_codes.PCSD_SSL_KEY,
                file_path=settings.pcsd_key_location,
                reason=format_environment_error(e),
                operation="read",
            ))
    if report_processor.has_errors:
        raise LibraryError()
    com_cmd = SendPcsdSslCertAndKey(env.report_processor, ssl_cert, ssl_key)
    com_cmd.set_targets(new_nodes_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # When corosync >= 2 is in use, the procedure for adding a node is:
    # 1. add the new node to corosync.conf on all existing nodes
    # 2. reload corosync.conf before the new node is started
    # 3. start the new node
    # If done otherwise, membership gets broken and qdevice hangs. Cluster
    # will recover after a minute or so but still it's a wrong way.

    corosync_conf.add_nodes(new_nodes_corosync)
    if atb_has_to_be_enabled:
        corosync_conf.set_quorum_options(dict(auto_tie_breaker="1"))

    com_cmd = DistributeCorosyncConf(
        env.report_processor,
        corosync_conf.config.export(),
        allow_skip_offline=False,
    )
    com_cmd.set_targets(online_cluster_target_list + new_nodes_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    com_cmd = ReloadCorosyncConf(env.report_processor)
    com_cmd.set_targets(online_cluster_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # Optionally enable and start cluster services.
    if enable:
        com_cmd = EnableCluster(env.report_processor)
        com_cmd.set_targets(new_nodes_target_list)
        run_and_raise(env.get_node_communicator(), com_cmd)
    if start:
        _start_cluster(
            env.communicator_factory,
            env.report_processor,
            new_nodes_target_list,
            wait_timeout=wait_timeout,
        )
def set_recovery_site(env: LibraryEnvironment, node_name: str) -> None:
    """
    Set up disaster recovery with the local cluster being the primary site.

    env
    node_name -- a known host from the recovery site

    Raises LibraryError on validation failures, on an already-existing DR
    config, or when communication with any node fails.
    """
    if env.ghost_file_codes:
        # DR setup only makes sense against a live cluster
        raise LibraryError(
            reports.live_environment_required(env.ghost_file_codes))
    report_processor = SimpleReportProcessor(env.report_processor)
    dr_env = env.get_dr_env()
    if dr_env.config.raw_file.exists():
        report_processor.report(reports.dr_config_already_exist())
    target_factory = env.get_node_target_factory()

    local_nodes, report_list = get_existing_nodes_names(
        env.get_corosync_conf(), error_on_missing_name=True)
    report_processor.report_list(report_list)

    if node_name in local_nodes:
        report_processor.report(reports.node_in_local_cluster(node_name))

    report_list, local_targets = target_factory.get_target_list_with_reports(
        local_nodes, allow_skip=False, report_none_host_found=False)
    report_processor.report_list(report_list)

    report_list, remote_targets = (
        target_factory.get_target_list_with_reports(
            [node_name], allow_skip=False, report_none_host_found=False))
    report_processor.report_list(report_list)

    if report_processor.has_errors:
        raise LibraryError()

    # ask the recovery-site node for its cluster's corosync.conf to learn the
    # full remote node list
    com_cmd = GetCorosyncConf(env.report_processor)
    com_cmd.set_targets(remote_targets)
    remote_cluster_nodes, report_list = get_existing_nodes_names(
        CorosyncConfigFacade.from_string(
            run_and_raise(env.get_node_communicator(), com_cmd)),
        error_on_missing_name=True)
    # NOTE(review): this relies on report_list() returning a truthy value
    # when errors were reported — confirm against SimpleReportProcessor
    if report_processor.report_list(report_list):
        raise LibraryError()

    # ensure we have tokens for all nodes of remote cluster
    report_list, remote_targets = target_factory.get_target_list_with_reports(
        remote_cluster_nodes, allow_skip=False, report_none_host_found=False)
    if report_processor.report_list(report_list):
        raise LibraryError()
    dr_config_exporter = (
        get_file_toolbox(file_type_codes.PCS_DR_CONFIG).exporter)
    # create dr config for remote cluster
    remote_dr_cfg = dr_env.create_facade(DrRole.RECOVERY)
    remote_dr_cfg.add_site(DrRole.PRIMARY, local_nodes)
    # send config to all node of remote cluster
    distribute_file_cmd = DistributeFilesWithoutForces(
        env.report_processor,
        node_communication_format.pcs_dr_config_file(
            dr_config_exporter.export(remote_dr_cfg.config)))
    distribute_file_cmd.set_targets(remote_targets)
    run_and_raise(env.get_node_communicator(), distribute_file_cmd)
    # create new dr config, with local cluster as primary site
    local_dr_cfg = dr_env.create_facade(DrRole.PRIMARY)
    local_dr_cfg.add_site(DrRole.RECOVERY, remote_cluster_nodes)
    distribute_file_cmd = DistributeFilesWithoutForces(
        env.report_processor,
        node_communication_format.pcs_dr_config_file(
            dr_config_exporter.export(local_dr_cfg.config)))
    distribute_file_cmd.set_targets(local_targets)
    run_and_raise(env.get_node_communicator(), distribute_file_cmd)
def node_add_remote(
    env, node_name, node_addr, operations, meta_attributes,
    instance_attributes,
    skip_offline_nodes=False,
    allow_incomplete_distribution=False,
    allow_pacemaker_remote_service_fail=False,
    allow_invalid_operation=False,
    allow_invalid_instance_attributes=False,
    use_default_operations=True,
    wait=False,
):
    # pylint: disable=too-many-arguments, too-many-branches, too-many-locals
    """
    create an ocf:pacemaker:remote resource and use it as a remote node

    LibraryEnvironment env -- provides all for communication with externals
    string node_name -- the name of the new node
    mixed node_addr -- the address of the new node or None for default
    list of dict operations -- attributes for each entered operation
    dict meta_attributes -- attributes for primitive/meta_attributes
    dict instance_attributes -- attributes for primitive/instance_attributes
    bool skip_offline_nodes -- if True, ignore when some nodes are offline
    bool allow_incomplete_distribution -- if True, allow this command to
        finish successfully even if file distribution did not succeed
    bool allow_pacemaker_remote_service_fail -- if True, allow this command to
        finish successfully even if starting/enabling pacemaker_remote did not
        succeed
    bool allow_invalid_operation -- if True, allow to use operations that
        are not listed in a resource agent metadata
    bool allow_invalid_instance_attributes -- if True, allow to use instance
        attributes that are not listed in a resource agent metadata and allow
        to omit required instance_attributes
    bool use_default_operations -- if True, add operations specified in
        a resource agent metadata to the resource
    mixed wait -- a flag for controlling waiting for pacemaker idle mechanism
    """
    env.ensure_wait_satisfiable(wait)

    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()
    cib = env.get_cib()
    id_provider = IdProvider(cib)
    # Corosync checks only make sense against a live cluster; with a
    # file-backed CIB we skip them and tell the user so.
    if env.is_cib_live:
        corosync_conf = env.get_corosync_conf()
    else:
        corosync_conf = None
        report_processor.report(
            reports.corosync_node_conflict_check_skipped("not_live_cib")
        )
    existing_nodes_names, existing_nodes_addrs, report_list = (
        get_existing_nodes_names_addrs(corosync_conf, cib)
    )
    if env.is_cib_live:
        # We just reported corosync checks are going to be skipped so we
        # shouldn't complain about errors related to corosync nodes
        report_processor.report_list(report_list)

    resource_agent = remote_node.get_agent(
        env.report_processor,
        env.cmd_runner()
    )

    existing_target_list = []
    if env.is_cib_live:
        existing_target_list, new_target_list = _get_targets_for_add(
            target_factory, report_processor, existing_nodes_names,
            [node_name], skip_offline_nodes
        )
        new_target = new_target_list[0] if new_target_list else None
        # default node_addr to an address from known-hosts
        if node_addr is None:
            node_addr = new_target.first_addr if new_target else node_name
            report_processor.report(
                reports.using_known_host_address_for_host(node_name, node_addr)
            )
    else:
        # default node_addr to an address from known-hosts
        # (no live cluster, so look the host up directly instead of building
        # communication targets)
        if node_addr is None:
            known_hosts = env.get_known_hosts([node_name])
            node_addr = known_hosts[0].dest.addr if known_hosts else node_name
            report_processor.report(
                reports.using_known_host_address_for_host(node_name, node_addr)
            )

    # validate inputs
    report_list = remote_node.validate_create(
        existing_nodes_names,
        existing_nodes_addrs,
        resource_agent,
        node_name,
        node_addr,
        instance_attributes
    )
    # validation + cib setup
    # TODO extract the validation to a separate function
    try:
        remote_resource_element = remote_node.create(
            env.report_processor,
            resource_agent,
            get_resources(cib),
            id_provider,
            node_addr,
            node_name,
            operations,
            meta_attributes,
            instance_attributes,
            allow_invalid_operation,
            allow_invalid_instance_attributes,
            use_default_operations,
        )
    except LibraryError as e:
        #Check unique id conflict with check against nodes. Until validation
        #resource create is not separated, we need to make unique post
        #validation.
        # Merge reports from validate_create and from remote_node.create,
        # keeping only the first ID-conflict report per offending id so the
        # same conflict is not reported twice.
        already_exists = []
        unified_report_list = []
        for report in report_list + list(e.args):
            if report.code not in (
                report_codes.ID_ALREADY_EXISTS,
                report_codes.RESOURCE_INSTANCE_ATTR_VALUE_NOT_UNIQUE,
            ):
                unified_report_list.append(report)
            elif (
                "id" in report.info
                and
                report.info["id"] not in already_exists
            ):
                unified_report_list.append(report)
                already_exists.append(report.info["id"])
        report_list = unified_report_list

    report_processor.report_list(report_list)
    if report_processor.has_errors:
        raise LibraryError()

    # everything validated, let's set it up
    if env.is_cib_live:
        _prepare_pacemaker_remote_environment(
            env,
            report_processor,
            existing_target_list,
            new_target,
            node_name,
            skip_offline_nodes,
            allow_incomplete_distribution,
            allow_pacemaker_remote_service_fail,
        )
    else:
        report_processor.report_list(
            _reports_skip_new_node(node_name, "not_live_cib")
        )

    env.push_cib(wait=wait)
    if wait:
        _ensure_resource_running(env, remote_resource_element.attrib["id"])