def test_qdevice_tie_breaker_none(self):
    """Removing a node passes validation when qdevice has no tie_breaker."""
    report_list = config_validators.remove_nodes(
        ["node4"],
        self.fixture_nodes,
        ("net", {}, None, None),
    )
    assert_report_item_list_equal(report_list, [])
def test_all_nodes(self):
    """Removing every cluster node at once must be rejected.

    NOTE(review): this method is redefined later in the class with an
    equivalent body, so this earlier definition is shadowed and never runs;
    one of the two copies should eventually be deleted.
    """
    report_list = config_validators.remove_nodes(
        ["node3", "node1", "node2", "node4"],
        self.fixture_nodes,
        (None, None, None, None),
    )
    assert_report_item_list_equal(
        report_list,
        [fixture.error(report_codes.CANNOT_REMOVE_ALL_CLUSTER_NODES)],
    )
def test_qdevice_tie_breaker_generic(self):
    """A generic tie_breaker value (not a node id) does not block removal."""
    qdevice_settings = ("net", {"tie_breaker": "highest"}, None, None)
    assert_report_item_list_equal(
        config_validators.remove_nodes(
            ["node4"], self.fixture_nodes, qdevice_settings
        ),
        [],
    )
def test_nonexisting_nodes(self):
    """Each unknown node name yields its own NODE_NOT_FOUND error.

    NOTE(review): this method is redefined later in the class with an
    equivalent body, so this earlier definition is shadowed and never runs;
    one of the two copies should eventually be deleted.
    """
    expected = [
        fixture.error(
            report_codes.NODE_NOT_FOUND, node=missing, searched_types=[]
        )
        for missing in ["nodeX", "nodeY"]
    ]
    assert_report_item_list_equal(
        config_validators.remove_nodes(
            ["node3", "nodeX", "nodeY", "node4"],
            self.fixture_nodes,
            (None, None, None, None),
        ),
        expected,
    )
def test_qdevice_tie_breaker_kept(self):
    """No report when the tie_breaker node is not among the removed ones."""
    report_list = config_validators.remove_nodes(
        ["node4"],
        self.fixture_nodes,
        ("net", {"tie_breaker": "3"}, None, None),
    )
    assert_report_item_list_equal(report_list, [])
def test_qdevice_tie_breaker_removed(self):
    """Removing the node configured as qdevice tie_breaker is an error.

    NOTE(review): this method is redefined later in the class with an
    equivalent body, so this earlier definition is shadowed and never runs;
    one of the two copies should eventually be deleted.
    """
    qdevice_settings = ("net", {"tie_breaker": "4"}, None, None)
    assert_report_item_list_equal(
        config_validators.remove_nodes(
            ["node4"], self.fixture_nodes, qdevice_settings
        ),
        [
            fixture.error(
                report_codes.NODE_USED_AS_TIE_BREAKER,
                node="node4",
                node_id=4,
            ),
        ],
    )
def test_all_nodes(self):
    """Removing every cluster node at once must be rejected."""
    removed = ["node3", "node1", "node2", "node4"]
    assert_report_item_list_equal(
        config_validators.remove_nodes(
            removed, self.fixture_nodes, (None, None, None, None)
        ),
        [fixture.error(report_codes.CANNOT_REMOVE_ALL_CLUSTER_NODES)],
    )
def test_qdevice_tie_breaker_removed(self):
    """Removing the node configured as qdevice tie_breaker is an error."""
    report_list = config_validators.remove_nodes(
        ["node4"],
        self.fixture_nodes,
        ("net", {"tie_breaker": "4"}, None, None),
    )
    expected = [
        fixture.error(
            report_codes.NODE_USED_AS_TIE_BREAKER, node="node4", node_id=4
        ),
    ]
    assert_report_item_list_equal(report_list, expected)
def test_more_errors(self):
    """Independent validation failures are all reported in one pass.

    NOTE(review): this method is redefined later in the class with an
    equivalent body, so this earlier definition is shadowed and never runs;
    one of the two copies should eventually be deleted.
    """
    removed = ["node3", "node1", "node2", "node4", "nodeX"]
    qdevice_settings = ("net", {"tie_breaker": "4"}, None, None)
    assert_report_item_list_equal(
        config_validators.remove_nodes(
            removed, self.fixture_nodes, qdevice_settings
        ),
        [
            fixture.error(
                report_codes.NODE_NOT_FOUND, node="nodeX", searched_types=[]
            ),
            fixture.error(report_codes.CANNOT_REMOVE_ALL_CLUSTER_NODES),
            fixture.error(
                report_codes.NODE_USED_AS_TIE_BREAKER,
                node="node4",
                node_id=4,
            ),
        ],
    )
def test_nonexisting_nodes(self):
    """Each unknown node name yields its own NODE_NOT_FOUND error."""
    report_list = config_validators.remove_nodes(
        ["node3", "nodeX", "nodeY", "node4"],
        self.fixture_nodes,
        (None, None, None, None),
    )
    assert_report_item_list_equal(
        report_list,
        [
            fixture.error(
                report_codes.NODE_NOT_FOUND,
                node=missing,
                searched_types=[],
            )
            for missing in ["nodeX", "nodeY"]
        ],
    )
def test_more_errors(self):
    """Independent validation failures are all reported in one pass."""
    report_list = config_validators.remove_nodes(
        ["node3", "node1", "node2", "node4", "nodeX"],
        self.fixture_nodes,
        ("net", {"tie_breaker": "4"}, None, None),
    )
    expected = [
        fixture.error(
            report_codes.NODE_NOT_FOUND, node="nodeX", searched_types=[]
        ),
        fixture.error(report_codes.CANNOT_REMOVE_ALL_CLUSTER_NODES),
        fixture.error(
            report_codes.NODE_USED_AS_TIE_BREAKER, node="node4", node_id=4
        ),
    ]
    assert_report_item_list_equal(report_list, expected)
def remove_nodes(env, node_list, force_quorum_loss=False, skip_offline=False):
    """
    Remove nodes from a cluster.

    env LibraryEnvironment
    node_list iterable -- names of nodes to remove
    force_quorum_loss bool -- treat quorum loss as a warning if True
    skip_offline bool -- treat unreachable nodes as warnings if True
    """
    _ensure_live_env(env)  # raises if env is not live
    report_processor = SimpleReportProcessor(env.report_processor)
    target_factory = env.get_node_target_factory()
    corosync_conf = env.get_corosync_conf()
    cluster_nodes_names = corosync_conf.get_nodes_names()

    # validations

    # Static checks on corosync.conf: node existence, not removing all
    # nodes, qdevice tie_breaker constraints.
    report_processor.report_list(
        config_validators.remove_nodes(
            node_list,
            corosync_conf.get_nodes(),
            corosync_conf.get_quorum_device_settings(),
        )
    )
    if report_processor.has_errors:
        # If there is an error, there is usually not much sense in doing other
        # validations:
        # - if there would be no node left in the cluster, it's pointless
        #   to check for quorum loss or if at least one remaining node is
        #   online
        # - if only one node is being removed and it doesn't exist, it's again
        #   pointless to check for other issues
        raise LibraryError()

    # Resolve node names to communication targets; names with no known
    # address end up in unknown_nodes.
    target_report_list, cluster_nodes_target_list = (
        target_factory.get_target_list_with_reports(
            cluster_nodes_names,
            skip_non_existing=skip_offline,
        )
    )
    known_nodes = set([target.label for target in cluster_nodes_target_list])
    unknown_nodes = set(
        [name for name in cluster_nodes_names if name not in known_nodes]
    )
    report_processor.report_list(target_report_list)

    # Partition the cluster's targets by reachability and by whether they
    # are being removed.
    com_cmd = GetOnlineTargets(
        report_processor,
        ignore_offline_targets=skip_offline,
    )
    com_cmd.set_targets(cluster_nodes_target_list)
    online_target_list = run_com(env.get_node_communicator(), com_cmd)
    offline_target_list = [
        target for target in cluster_nodes_target_list
        if target not in online_target_list
    ]
    staying_online_target_list = [
        target for target in online_target_list
        if target.label not in node_list
    ]
    targets_to_remove = [
        target for target in cluster_nodes_target_list
        if target.label in node_list
    ]
    if not staying_online_target_list:
        report_processor.report(
            reports.unable_to_connect_to_any_remaining_node()
        )
        # If no remaining node is online, there is no point in checking quorum
        # loss or anything as we would just get errors.
        raise LibraryError()

    if skip_offline:
        # Warn about remaining nodes we cannot reach: offline targets plus
        # names we could not resolve to an address at all.
        staying_offline_nodes = (
            [
                target.label for target in offline_target_list
                if target.label not in node_list
            ]
            + [name for name in unknown_nodes if name not in node_list]
        )
        if staying_offline_nodes:
            report_processor.report(
                reports.unable_to_connect_to_all_remaining_node(
                    staying_offline_nodes
                )
            )

    # -len(node_list): evaluate SBD auto_tie_breaker needs for the shrunken
    # cluster size.
    atb_has_to_be_enabled = sbd.atb_has_to_be_enabled(
        env.cmd_runner(), corosync_conf, -len(node_list)
    )
    if atb_has_to_be_enabled:
        report_processor.report(
            reports.corosync_quorum_atb_will_be_enabled_due_to_sbd()
        )
        # Changing ATB requires the cluster to be stopped on the remaining
        # nodes; verify that before proceeding.
        com_cmd = CheckCorosyncOffline(
            report_processor,
            allow_skip_offline=False,
        )
        com_cmd.set_targets(staying_online_target_list)
        run_com(env.get_node_communicator(), com_cmd)
    else:
        # Check if removing the nodes would cause quorum loss. We ask the
        # nodes to be removed for their view of quorum. If they are all
        # stopped or not in a quorate partition, their removal cannot cause
        # quorum loss. That's why we ask them and not the remaining nodes.
        # example: 5-node cluster, 3 online nodes, removing one online node,
        # results in 4-node cluster with 2 online nodes => quorum lost
        # Check quorum loss only if ATB does not need to be enabled. If it is
        # required, cluster has to be turned off and therefore it loses
        # quorum.
        forceable_report_creator = reports.get_problem_creator(
            report_codes.FORCE_QUORUM_LOSS, force_quorum_loss
        )
        com_cmd = cluster.GetQuorumStatus(report_processor)
        com_cmd.set_targets(targets_to_remove)
        failures, quorum_status = run_com(
            env.get_node_communicator(), com_cmd
        )
        if quorum_status:
            if quorum_status.stopping_nodes_cause_quorum_loss(node_list):
                report_processor.report(
                    forceable_report_creator(
                        reports.corosync_quorum_will_be_lost
                    )
                )
        elif failures or not targets_to_remove:
            # No usable quorum status (all queries failed, or no reachable
            # target to ask) -- report that the check could not be done.
            report_processor.report(
                forceable_report_creator(
                    reports.corosync_quorum_loss_unable_to_check,
                )
            )

    if report_processor.has_errors:
        raise LibraryError()

    # validations done

    unknown_to_remove = [name for name in unknown_nodes if name in node_list]
    if unknown_to_remove:
        report_processor.report(
            reports.nodes_to_remove_unreachable(unknown_to_remove)
        )
    if targets_to_remove:
        # Destroy cluster configuration on the nodes being removed; failures
        # are reported as warnings, not fatal.
        com_cmd = cluster.DestroyWarnOnFailure(report_processor)
        com_cmd.set_targets(targets_to_remove)
        run_and_raise(env.get_node_communicator(), com_cmd)

    # Update corosync.conf locally, then push it to the remaining online
    # nodes and reload it.
    corosync_conf.remove_nodes(node_list)
    if atb_has_to_be_enabled:
        corosync_conf.set_quorum_options(dict(auto_tie_breaker="1"))
    com_cmd = DistributeCorosyncConf(
        env.report_processor,
        corosync_conf.config.export(),
        allow_skip_offline=False,
    )
    com_cmd.set_targets(staying_online_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    com_cmd = ReloadCorosyncConf(env.report_processor)
    com_cmd.set_targets(staying_online_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)

    # try to remove nodes from pcmk using crm_node -R <node> --force and if
    # not successful remove it directly from CIB file on all nodes in
    # parallel
    com_cmd = RemoveNodesFromCib(env.report_processor, node_list)
    com_cmd.set_targets(staying_online_target_list)
    run_and_raise(env.get_node_communicator(), com_cmd)