def remove_rule(self, resource: str, event: HEALTH_STATUSES, action: HEALTH_MON_ACTIONS):
    """
    For the rule resource/event, remove "action" from confstore.
    If the action list becomes empty, delete the rule.

    Args:
        resource(str): resource name
        event(str): event type
        action(str): action to be removed
    """
    self._validate_action(action)
    key = self._prepare_key(resource, event)
    val = []
    Log.info(f"Removing rule for key: {key}, value: {action}")
    kv = self._get_val(key)
    if kv:
        _, val = self._get_k_v(kv)
        if action not in val:
            Log.warn(f"KV not found for key: {key}, value: {action}")
        else:
            val.remove(action)
            if len(val) == 0:
                self._confstore.delete(key)
                Log.debug(f"key value removed for {key}, {action}. Value list empty; deleting key {key}")
            else:
                val = json.dumps(val)
                self._confstore.update(key, val)
                Log.debug(f"KV removed for {key}, {action}")
    else:
        Log.warn(f"key {key} not found")
def set_cluster_cardinality(self, index):
    """
    Set the number of nodes (pods) and their machine ids in the confstore used by HA.
    """
    data_pods = ConftStoreSearch.get_data_pods(index)
    server_pods = ConftStoreSearch.get_server_pods(index)
    control_pods = ConftStoreSearch.get_control_nodes(index)

    # Combine the data_pods, server_pods and control_pods lists and find unique machine ids
    watch_pods = data_pods + server_pods + control_pods
    watch_pods = list(set(watch_pods))
    num_pods = len(watch_pods)
    Log.info(f"Cluster cardinality: number of nodes {num_pods}, machine ids for nodes {watch_pods}")
    if num_pods == 0:
        Log.warn(f"Possible cluster cardinality issue; number of pods to be watched {num_pods}")

    # Update the same to consul; if the KV is already present, it will be modified.
    cluster_cardinality_key = CLUSTER_CARDINALITY_KEY
    cluster_cardinality_value = {
        CLUSTER_CARDINALITY_NUM_NODES: num_pods,
        CLUSTER_CARDINALITY_LIST_NODES: watch_pods
    }
    self._confstore.update(cluster_cardinality_key, json.dumps(cluster_cardinality_value))
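# Illustrative sketch (not part of the original source): the JSON value that
# set_cluster_cardinality() writes against CLUSTER_CARDINALITY_KEY. The literal key
# strings used here are hypothetical stand-ins for the real
# CLUSTER_CARDINALITY_NUM_NODES / CLUSTER_CARDINALITY_LIST_NODES constants.
import json

_example_watch_pods = ["machine-id-1", "machine-id-2", "machine-id-3"]  # unique machine ids
_example_cardinality_value = json.dumps({
    "num_nodes": len(_example_watch_pods),   # assumed value of CLUSTER_CARDINALITY_NUM_NODES
    "node_list": _example_watch_pods,        # assumed value of CLUSTER_CARDINALITY_LIST_NODES
})
# _example_cardinality_value == '{"num_nodes": 3, "node_list": ["machine-id-1", ...]}'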
def add_rule(self, resource: str, event: HEALTH_STATUSES, action: HEALTH_MON_ACTIONS):
    """
    Add a rule to confstore for resource/event.
    If the rule already exists, append "action" to the same rule.

    Args:
        resource(str): resource name
        event(str): event type
        action(str): action to be added
    """
    self._validate_action(action)
    key = self._prepare_key(resource, event)
    val = []
    Log.info(f"Adding rule for key: {key}, value: {action}")
    kv = self._get_val(key)
    if kv:
        _, val = self._get_k_v(kv)
        if action not in val:
            val.append(action)
            val = json.dumps(val)
            self._confstore.update(key, val)
        else:
            Log.warn(f"key value already exists for {key}, {action}")
            return
    else:
        val.append(action)
        val = json.dumps(val)
        self._confstore.set(key, val)
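# Illustrative sketch (not part of the original source): the rule value managed by
# add_rule()/remove_rule() is a JSON-encoded list of actions stored against the key
# built by _prepare_key(resource, event). The key and action names below are
# hypothetical examples.
import json

_example_rule_key = "node/failed"                      # assumed shape of _prepare_key() output
_example_actions = json.loads('["publish_event"]')     # existing value fetched from confstore
if "poweroff" not in _example_actions:
    _example_actions.append("poweroff")                # add_rule() appends the new action
_example_rule_value = json.dumps(_example_actions)
# _example_rule_value == '["publish_event", "poweroff"]'; remove_rule() does the inverse
# and deletes the key once the list becomes empty.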
def _parse_response(self, msg) -> bool:
    """
    Check whether the enclosure shutdown was successful.
    A response with one of the following severities is expected in the JSON response:
        "severity": "informational" : successful shutdown
        "severity": "warning"       : failure in shutdown

    Args:
        msg : response

    Return:
        True  : enclosure shutdown was successful
        False : enclosure shutdown failed
    """
    message = json.loads(msg).get(ACTUATOR_ATTRIBUTES.MESSAGE)
    severity = message.get(ACTUATOR_ATTRIBUTES.ACTUATOR_RESPONSE_TYPE).get(
        ACTUATOR_ATTRIBUTES.SEVERITY)
    if severity == EVENT_SEVERITIES.INFORMATIONAL.value:
        return True
    elif severity == EVENT_SEVERITIES.WARNING.value:
        return False
    else:
        Log.warn(f"Actuator response received with unexpected status {msg}")
        return False
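# Illustrative sketch (not part of the original source): how the severity is pulled out
# of an actuator response like the one _parse_response() handles. The literal key names
# ("message", "actuator_response_type", "severity") are hypothetical stand-ins for the
# ACTUATOR_ATTRIBUTES constants.
import json

_example_response = json.dumps({
    "message": {
        "actuator_response_type": {"severity": "informational"}
    }
})
_example_severity = json.loads(_example_response)["message"]["actuator_response_type"]["severity"]
assert _example_severity == "informational"  # "informational" => shutdown succeeded, "warning" => failed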
async def get_bundle_status(command):
    """
    Initializes the process for displaying the status for Support Bundle.

    :param command: Csm_cli Command Object
    :type: command
    :return: None
    """
    try:
        bundle_id = command.options.get(const.SB_BUNDLE_ID)
        conf = GeneralConfig(database.DATABASE)
        db = DataBaseProvider(conf)
        repo = SupportBundleRepository(db)
        all_nodes_status = await repo.retrieve_all(bundle_id)
        response = {
            "status": [each_status.to_primitive() for each_status in all_nodes_status]
        }
        return Response(output=response, rc=OPERATION_SUCESSFUL)
    except DataAccessExternalError as e:
        Log.warn(f"Failed to connect to elasticsearch: {e}")
        return Response(
            output=("Support Bundle status is not available currently"
                    " as required services are not running."
                    " Please wait and check the /tmp/support_bundle"
                    " folder for newly generated support bundle."),
            rc=str(errno.ECONNREFUSED))
    except Exception as e:
        Log.error(f"Failed to get bundle status: {e}")
        return Response(
            output=("Support Bundle status is not available currently"
                    " as required services are not running."
                    " Failed to get status of bundle."),
            rc=str(errno.ENOENT))
def _get_cvg_count(index, node_id):
    cvg_count = Conf.get(
        index,
        GconfKeys.CVG_COUNT.value.format(_DELIM=_DELIM, node_id=node_id))
    if not cvg_count:
        Log.warn(f"CVGs are not available for this node {node_id}")
    return cvg_count
def start(self):
    """Start listening for messages."""
    if self._consumer is not None:
        self._consumer.start()
    else:
        Log.warn(f"Consumer not found for message type {self._message_type}.")
def stop(self, flush=False):
    """Stop listening for messages."""
    if self._consumer is not None:
        self._consumer.stop(flush=flush)
    else:
        Log.warn(f"Consumer not found for message type {self._message_type}.")
def _configure_rsyslog():
    """Restart the rsyslog service so that the support bundle rsyslog config takes effect."""
    try:
        Log.info("Restarting rsyslog service")
        service_obj = Service("rsyslog.service")
        service_obj.restart()
    except Exception as e:
        Log.warn(f"Error in rsyslog service restart: {e}")
def __init__(self, wait_time=10):
    """
    Init method.
    Creates the monitor objects and sets the SIGTERM callback.
    """
    try:
        # Set SIGTERM handler
        signal.signal(signal.SIGTERM, self.set_sigterm)

        # Read the I/O pod selector label from ha.conf. It will be received from the
        # provisioner confstore; the provisioner needs to be informed to add it there.
        ConfigManager.init("k8s_resource_monitor")

        _conf_stor_search = ConftStoreSearch()

        self.monitors = []

        # Event output in pretty format
        kwargs = {K8SClientConst.PRETTY: True}

        # Set a timeout value, 'timeout_seconds', for the stream. This is the timeout
        # for the connection to the server. If it is not set we cannot stop immediately,
        # because the synchronous watch.stream() call does not return until it catches
        # an event it is waiting for.
        kwargs[K8SClientConst.TIMEOUT_SECONDS] = K8SClientConst.VAL_WATCH_TIMEOUT_DEFAULT

        # Get a MessageBus producer object shared by all monitor threads
        producer = self._get_producer()

        # Change to multiprocessing
        # Creating NODE monitor object
        node_monitor = ObjectMonitor(producer, K8SClientConst.NODE, **kwargs)
        self.monitors.append(node_monitor)

        _, nodes_list = _conf_stor_search.get_cluster_cardinality()
        if not nodes_list:
            Log.warn(f"No nodes in the cluster to watch for nodes_list: {nodes_list}")
        else:
            Log.info(f"Starting watch for: nodes_list: {nodes_list}")
            watcher_node_ids = ', '.join(node_id for node_id in nodes_list)
            kwargs[K8SClientConst.LABEL_SELECTOR] = \
                f'cortx.io/machine-id in ({watcher_node_ids})'

        # Creating POD monitor object
        pod_monitor = ObjectMonitor(producer, K8SClientConst.POD, **kwargs)
        self.monitors.append(pod_monitor)
    except Exception as err:
        Log.error(f'Monitor failed to start watchers: {err}')
def _is_cluster_standby_on() -> None:
    '''Check if cluster is in standby mode. If not, make standby mode ON'''
    Log.info('Check cluster is in standby mode')
    value = SimpleCommand().run_cmd(CHECK_PCS_STANDBY_MODE)
    standby_value = value[0].split(' ')[3].strip('\n').split('=')
    if standby_value[1].lower() != 'on':
        Log.warn('cluster is not in standby mode.')
        Log.info('switching the cluster in standby mode for performing post upgrade routines')
        _switch_cluster_mode(PCS_CLUSTER_STANDBY)
    Log.info('#### All post-upgrade prerequisites are in place ####')
def reset(self):
    """
    Performs reset.

    Raises exception on error.
    """
    # Check service status
    service_obj = Service('elasticsearch.service')
    service_state = service_obj.get_state()
    if service_state._state == 'active':
        Log.warn("Elasticsearch service in active state. \n"
                 "Stopping Elasticsearch service now...")
        service_obj.stop()

    # Clear log files.
    Elasticsearch.truncate_log_files(self.log_path)
    Log.info("Reset done.")
    return 0
def enable_stonith(self):
    """
    Enable stonith for HW.

    Returns:
    """
    # Enable the stonith here
    env_type = Conf.get(const.HA_GLOBAL_INDEX, f"CLUSTER_MANAGER{const._DELIM}env")
    if env_type.lower() == const.INSTALLATION_TYPE.HW.value.lower():
        Log.info("Enabling the stonith.")
        self._execute.run_cmd(const.PCS_STONITH_ENABLE)
        Log.info("Stonith enabled successfully.")
    else:
        Log.warn(f"Stonith is not enabled, detected {env_type} env")
def mgmt_vip(cib_xml, push=False, **kwargs):
    """Create mgmt Virtual IP resource."""
    if "mgmt_info" not in kwargs.keys() or len(kwargs["mgmt_info"]) == 0:
        Log.warn("Management VIP is not detected in current configuration.")
    else:
        mgmt_info = kwargs["mgmt_info"]
        output, err, rc = process.run_cmd(
            f"pcs -f {cib_xml} resource create mgmt-vip ocf:heartbeat:IPaddr2 "
            f"ip={mgmt_info['mgmt_vip']} cidr_netmask={mgmt_info['mgmt_netmask']} nic={mgmt_info['mgmt_iface']} iflabel=mgmt_vip "
            f"op start timeout=60s interval=0s "
            f"op monitor timeout=30s interval=30s "
            f"op stop timeout=60s interval=0s --group management_group",
            check_error=False)
        if rc != 0:
            raise CreateResourceError(f"Mgmt vip creation failed, mgmt info: {mgmt_info}, Err: {err}")
    if push:
        cib_push(cib_xml)
def init(self):
    """
    Initialize the object using the configuration params passed.
    Establish connection with the Kafka broker.
    """
    self._channel = None
    retry_count = 0
    try:
        while self._channel is None and int(self._retry_counter) > retry_count:
            self.connect()
            if self._channel is None:
                Log.warn(f"message bus producer connection failed. Retry attempt: {retry_count+1}"
                         f" in {2**retry_count} seconds")
                time.sleep(2**retry_count)
                retry_count += 1
            else:
                Log.debug(f"message bus producer connection is initialized."
                          f" Attempts: {retry_count+1}")
    except Exception as ex:
        Log.error(f"message bus producer initialization failed. {ex}")
        raise ConnectionEstError(f"Unable to connect to message bus broker. {ex}")
def mgmt_vip(cib_xml, push=False, **kwargs):
    """Create mgmt Virtual IP resource."""
    mgmt_vip_start = str(get_res_timeout(RESOURCE.MGMT_VIP.value, TIMEOUT_ACTION.START.value))
    mgmt_vip_stop = str(get_res_timeout(RESOURCE.MGMT_VIP.value, TIMEOUT_ACTION.STOP.value))
    vip_health_start = str(get_res_timeout(RESOURCE.VIP_HEALTH_MONITOR.value, TIMEOUT_ACTION.START.value))
    vip_health_stop = str(get_res_timeout(RESOURCE.VIP_HEALTH_MONITOR.value, TIMEOUT_ACTION.STOP.value))
    if "mgmt_info" not in kwargs.keys() or len(kwargs["mgmt_info"]) == 0:
        Log.warn("Management VIP is not detected in current configuration.")
    else:
        mgmt_info = kwargs["mgmt_info"]
        output, err, rc0 = process.run_cmd(
            f"pcs -f {cib_xml} resource create {RESOURCE.VIP_HEALTH_MONITOR.value} ocf:seagate:vip_health_monitor "
            f"vip={mgmt_info['mgmt_vip']} nic={mgmt_info['mgmt_iface']} "
            f"op start timeout={vip_health_start}s interval=0s "
            f"op monitor timeout=29s interval=30s "
            f"op stop timeout={vip_health_stop}s interval=0s --group management_group",
            check_error=False)
        output, err, rc1 = process.run_cmd(
            f"pcs -f {cib_xml} resource create {RESOURCE.MGMT_VIP.value} ocf:heartbeat:IPaddr2 "
            f"ip={mgmt_info['mgmt_vip']} cidr_netmask={mgmt_info['mgmt_netmask']} nic={mgmt_info['mgmt_iface']} iflabel=mgmt_vip "
            f"op start timeout={mgmt_vip_start}s interval=0s "
            f"op monitor timeout=29s interval=30s "
            f"op stop timeout={mgmt_vip_stop}s interval=0s --group management_group",
            check_error=False)
        if rc0 != 0 or rc1 != 0:
            raise CreateResourceError(f"Mgmt vip creation failed, mgmt info: {mgmt_info}, Err: {err}")
    if push:
        cib_push(cib_xml)
def process(self):
    """
    Process cleanup command.
    """
    Log.info("Processing cleanup command")
    try:
        nodes = self._confstore.get(const.CLUSTER_CONFSTORE_NODES_KEY)
        node_count: int = 0 if nodes is None else len(nodes)
        node_name = self.get_node_name()
        # Standby
        # TODO: handle multiple case for standby EOS-20855
        standby_output: str = self._cluster_manager.node_controller.standby(node_name)
        if json.loads(standby_output).get("status") == STATUSES.FAILED.value:
            Log.warn(f"Standby for {node_name} failed with output: {standby_output}. "
                     "Cluster will be destroyed forcefully")
        if CleanupCmd.LOCAL_CHECK and node_count > 1:
            # TODO: Update cluster kill for --local option also
            # Remove SSH
            self._remove_node(node_name)
        else:
            # Destroy
            self._destroy_cluster(node_name)
        if self._confstore.key_exists(f"{const.CLUSTER_CONFSTORE_NODES_KEY}/{node_name}"):
            self._confstore.delete(f"{const.CLUSTER_CONFSTORE_NODES_KEY}/{node_name}")
        # Delete the config file
        self.remove_config_files()
    except Exception as e:
        Log.error(f"Cluster cleanup command failed. Error: {e}")
        raise HaCleanupException("Cluster cleanup failed")
    Log.info("cleanup command is successful")
def destroy_cluster(self, retry_index: int = 0, force=True):
    if retry_index < const.CLUSTER_RETRY_COUNT and not self._is_pcs_cluster_running():
        Log.warn('Cluster is not running, safe to destroy the cluster')
        if force:
            Log.warn('Executing cluster kill before destroy')
            self._execute.run_cmd(const.PCS_CLUSTER_KILL)
        output = self._execute.run_cmd(const.PCS_CLUSTER_DESTROY)
        Log.error(f"Cluster is destroyed. Output: {output}")
        return
    elif retry_index == 0:
        cluster_stop_response = self.stop()
        if cluster_stop_response:
            Log.warn('Successfully stopped the cluster, destroying the cluster')
            if not self._is_pcs_cluster_running():
                output = self._execute.run_cmd(const.PCS_CLUSTER_DESTROY)
                Log.error(f"Cluster destroyed. Output: {output}")
                return
    Log.info('cluster is still running, wait for cluster to stop')
    time.sleep(const.BASE_WAIT_TIME)
    retry_index += 1
    self.destroy_cluster(retry_index)
def start(self, nodeid: str) -> dict:
    """
    Start node with nodeid.

    Args:
        nodeid (str): Node ID from cluster nodes.

    Returns:
        ([dict]): Return dictionary. {"status": "", "msg":""}
            status: Succeeded, Failed, InProgress
    """
    _node_status = self.nodes_status([nodeid])[nodeid]
    if _node_status == NODE_STATUSES.ONLINE.value:
        return {
            "status": const.STATUSES.SUCCEEDED.value,
            "msg": f"Node {nodeid}, is already in Online status"
        }
    elif _node_status == NODE_STATUSES.STANDBY.value or _node_status == NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value:
        # Make node unstandby
        if self.heal_resource(nodeid):
            _output, _err, _rc = self._execute.run_cmd(
                const.PCS_NODE_UNSTANDBY.replace("<node>", nodeid),
                check_error=False)
            return {
                "status": const.STATUSES.IN_PROGRESS.value,
                "msg": f"Node {nodeid} : Node was in standby mode, "
                       f"Unstandby operation started successfully"
            }
        else:
            Log.error(f"Node {nodeid} is in standby mode : Resource failcount found on the node, "
                      f"cleanup not worked after 2 retries")
            return {
                "status": const.STATUSES.FAILED.value,
                "msg": f"Node {nodeid} is in standby mode: Resource "
                       f"failcount found on the node, cleanup not worked after 2 retries"
            }
    elif _node_status == NODE_STATUSES.CLUSTER_OFFLINE.value:
        _output, _err, _rc = self._execute.run_cmd(
            const.PCS_NODE_START.replace("<node>", nodeid),
            check_error=False)
        if _rc != 0:
            raise ClusterManagerError(f"Failed to start node {nodeid}")

        Log.info(f'Node: {nodeid} started successfully. Now, waiting for '
                 f'cluster to stabilize and then get the node status')
        time.sleep(const.BASE_WAIT_TIME * 2)

        # Get the status of the node again
        _node_status = self.nodes_status([nodeid])[nodeid]

        # If the node is in standby mode, unstandby here
        if _node_status == NODE_STATUSES.STANDBY.value:
            Log.warn(f'Node: {nodeid} is still in standby mode')
            _output, _err, _rc = self._execute.run_cmd(
                const.PCS_NODE_UNSTANDBY.replace("<node>", nodeid),
                check_error=False)
            if _rc != 0:
                raise ClusterManagerError(f"Failed to unstandby the node: {nodeid}")
            return {
                "status": const.STATUSES.IN_PROGRESS.value,
                "msg": f"Node {nodeid}: Node was in offline and then switched to standby mode, "
                       f"Cluster started on node successfully"
            }
        return {
            "status": const.STATUSES.IN_PROGRESS.value,
            "msg": f"Node {nodeid} : Node was in cluster_offline mode, "
                   f"Cluster started on node successfully"
        }
    elif _node_status == NODE_STATUSES.POWEROFF.value:
        # Starting a node is not in scope for VM
        Log.error("Operation not available for node type VM")
        raise ClusterManagerError(f"Node {nodeid} : Node was in poweroff mode, "
                                  "Node start : Operation not available for VM")
    else:
        Log.error(f"{nodeid} status is {_node_status}, node may not be started.")
        raise ClusterManagerError(f"Failed to start {nodeid} as found unhandled status {_node_status}")
async def _generate_bundle(command):
    """
    Initializes the process for generating a Support Bundle at the shared path.

    command: Command Object
    :type: command
    return: None
    """
    # Get arguments from command
    bundle_id = command.options.get(const.SB_BUNDLE_ID)
    comment = command.options.get(const.SB_COMMENT)
    duration = command.options.get(const.SB_DURATION)
    size_limit = command.options.get(const.SB_SIZE)
    config_url = command.options.get('config_url')
    binlogs = command.options.get('binlogs')
    coredumps = command.options.get('coredumps')
    stacktrace = command.options.get('stacktrace')
    components = command.options.get('components')
    config_path = config_url.split('//')[1] if '//' in config_url else ''
    path = command.options.get('target_path')
    bundle_path = os.path.join(path, bundle_id)
    try:
        os.makedirs(bundle_path)
    except FileExistsError:
        raise BundleError(errno.EINVAL, "Bundle ID already exists, "
                          "Please use a unique Bundle ID")

    cluster_conf = MappedConf(config_url)
    # Get Node ID
    node_id = Conf.machine_id
    if node_id is None:
        raise BundleError(errno.EINVAL, "Invalid node_id: %s", node_id)

    # Update SB status in Filestore.
    # Load conf for Support Bundle
    Conf.load(const.SB_INDEX, 'json://' + const.FILESTORE_PATH, skip_reload=True)
    data = {
        'status': 'In-Progress',
        'start_time': datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    }
    Conf.set(const.SB_INDEX, f'{node_id}>{bundle_id}', data)
    Conf.save(const.SB_INDEX)

    node_name = cluster_conf.get(f'node>{node_id}>name')
    Log.info(f'Starting SB generation on {node_id}:{node_name}')
    # Get required SB size per component
    components_list, service_per_comp = SupportBundle._get_component_and_services(
        cluster_conf, node_id, components)
    if not components_list:
        Log.warn(f"No component specified for {node_name} in CORTX config")
        Log.warn(f"Skipping SB generation on node:{node_name}.")
        return
    num_components = len(components_list)
    size_limit_per_comp = SupportBundle.get_component_size_limit(size_limit, num_components)
    bundle_obj = Bundle(bundle_id=bundle_id, bundle_path=bundle_path,
                        comment=comment, node_name=node_name,
                        components=components_list, services=service_per_comp)

    # Start SB generation on the node.
    # Adding CORTX manifest data inside the support bundle.
    try:
        # Copying config file into support bundle.
        common_locations = set()
        if config_path and os.path.exists(config_path):
            Log.info(f'For manifest data collection, taking config from '
                     f'{config_path} location.')
            # Remove secrets from the input config.
            conf_name = config_path.split('/')[-1]
            sb_config = config_path.replace(conf_name, 'sb_cluster.conf')
            with open(sb_config, 'w+') as sb_file:
                with open(config_path, 'r') as f:
                    content = f.read()
                    if 'secret:' in content:
                        content = re.sub(r'secret:.+', r'secret: ****', content)
                    sb_file.write(content)
            conf_target = os.path.join(bundle_path, 'common' + config_path)
            os.makedirs(conf_target.replace(f'/{conf_name}', ''), exist_ok=True)
            shutil.move(sb_config, conf_target)
            common_locations.add(config_path.split('/')[1])

        # Copying the "/etc/cortx/solution" directory into the support bundle,
        # except for the "secret" folder.
        sln_target = os.path.join(bundle_path, 'common' + const.CORTX_SOLUTION_DIR)
        if os.path.exists(sln_target):
            shutil.rmtree(sln_target)
        if os.path.exists(const.CORTX_SOLUTION_DIR):
            _ = shutil.copytree(const.CORTX_SOLUTION_DIR, sln_target,
                                ignore=shutil.ignore_patterns('secret'))
            common_locations.add(const.CORTX_SOLUTION_DIR.split('/')[1])

        # Copying RELEASE.INFO file into the support bundle.
        if os.path.exists(const.CORTX_RELEASE_INFO):
            rel_target = os.path.join(bundle_path, 'common' + const.CORTX_RELEASE_INFO)
            os.makedirs(rel_target.replace('/RELEASE.INFO', ''), exist_ok=True)
            shutil.copyfile(const.CORTX_RELEASE_INFO, rel_target)
            common_locations.add(const.CORTX_RELEASE_INFO.split('/')[1])
        else:
            Log.warn(f'{const.CORTX_RELEASE_INFO} file not found.')

        # Adding node resources health into the support bundle.
        health_target = os.path.join(bundle_path, 'common' + '/health')
        os.makedirs(health_target, exist_ok=True)
        with open(health_target + '/node_health.json', 'w') as fp:
            info = {}
            info["resource_usage"] = {}
            info["resource_usage"]["cpu_usage"] = SupportBundle.get_cpu_overall_usage()
            info["resource_usage"]["uptime"] = SupportBundle.get_system_uptime()
            info["resource_usage"]["disk_usage"] = SupportBundle.get_disk_overall_usage()
            info["resource_usage"]["memory_usage"] = SupportBundle.get_mem_overall_usage()
            json.dump(info, fp, indent=4)
        common_locations.add('health')

        try:
            common_path = os.path.join(bundle_path, 'common')
            common_tar = os.path.join(common_path, 'common.tar.gz')
            with tarfile.open(common_tar, "w:gz") as tar:
                if os.path.exists(common_path):
                    tar.add(common_path, arcname='common')
            # Deleting the untarred directories from the common folder.
            for location in common_locations:
                untar_location = os.path.join(common_path, location)
                if os.path.exists(untar_location):
                    shutil.rmtree(untar_location)
        except (OSError, tarfile.TarError) as err:
            Log.error("Facing issues while adding manifest data into common "
                      "directory: {0}".format(err))
    except BundleError as be:
        Log.error(f"Failed to add CORTX manifest data inside Support Bundle. {be}")

    try:
        await ComponentsBundle.init(bundle_obj, node_id, config_url,
                                    duration=duration,
                                    size_limit=size_limit_per_comp,
                                    binlogs=binlogs, coredumps=coredumps,
                                    stacktrace=stacktrace)
    except BundleError as be:
        Log.error(f"Bundle generation failed. {be}")
    except Exception as e:
        Log.error(f"Internal error, bundle generation failed {e}")

    if command.sub_command_name == 'generate':
        display_string_len = len(bundle_obj.bundle_id) + 4
        response_msg = (
            f"Please use the below bundle id for checking the status of support bundle."
            f"\n{'-' * display_string_len}"
            f"\n| {bundle_obj.bundle_id} |"
            f"\n{'-' * display_string_len}"
            f"\nPlease Find the file on -> {bundle_obj.bundle_path} .\n")
        return Response(output=response_msg, rc=OPERATION_SUCESSFUL)
    return bundle_obj
def check_resource_layout(self) -> bool:
    """
    Check that all necessary resources are created.
    "Bad" resources are logged and skipped so that the remaining ones can still be checked.
    """
    check_res = True
    for res, desc in self.layout.resources.items():
        resource_list = []
        try:
            name_list = []
            # Counters are syntactic sugar to specify names for several
            # identical resources
            if desc[RESOURCE_ATTRIBUTES.PROVIDER][RESOURCE_ATTRIBUTES.COUNTERS]:
                for counter in desc[RESOURCE_ATTRIBUTES.PROVIDER][RESOURCE_ATTRIBUTES.COUNTERS]:
                    name_list.append(f"{res}-{counter}")
            else:
                name_list = [res]
            for res_name in name_list:
                # Check that the resource actually exists
                if desc[RESOURCE_ATTRIBUTES.HA][RESOURCE_ATTRIBUTES.MODE] == RESOURCE_ATTRIBUTES.ACTIVE_ACTIVE:
                    if desc[RESOURCE_ATTRIBUTES.GROUP] != "":
                        resource = self.status.get_resource_from_cloned_group_by_name(res_name)
                        resource_list.append(resource)
                    else:
                        resource = self.status.get_clone_resource_by_name(res_name)
                        resource_list.extend(resource.copies)
                else:
                    resource = self.status.get_unique_resource_by_name(res_name)
                    resource_list.append(resource)
            if not resource_list:
                Log.info(f"Resource {res_name} not found in status")
                check_res = False
                continue
        except Exception:
            check_res = False
            continue

        for a_resource in resource_list:
            # Check provider and service
            expected = "{}:{}".format(
                desc[RESOURCE_ATTRIBUTES.PROVIDER][RESOURCE_ATTRIBUTES.NAME],
                desc[RESOURCE_ATTRIBUTES.PROVIDER][RESOURCE_ATTRIBUTES.SERVICE])
            actual = a_resource.resource_agent
            if expected != actual:
                Log.info(f"{res}: invalid resource agent is used {actual} instead of {expected}")
                check_res = False
            try:
                if desc[RESOURCE_ATTRIBUTES.GROUP] != a_resource.group:
                    Log.info(f'{res}: wrong group {a_resource.group} vs expected {desc[RESOURCE_ATTRIBUTES.GROUP]}')
                    check_res = False
            except KeyError:
                Log.warn(f"{res} : Group is not defined.")
    # TODO: Location to be checked once component files become part of provisioning
    return check_res
async def init(command: List):
    """
    Initializes the process of Support Bundle generation for every component.

    :param command: Csm_cli Command Object
    :type: command
    :return:
    """
    # Fetch Command Arguments.
    Log.init("support_bundle",
             syslog_server="localhost",
             syslog_port=514,
             log_path=Conf.get("cortx_conf", "support>support_bundle_path"),
             level="INFO")

    bundle_id = command.options.get(const.SB_BUNDLE_ID, "")
    node_name = command.options.get(const.SB_NODE_NAME, "")
    comment = command.options.get(const.SB_COMMENT, "")
    components = command.options.get(const.SB_COMPONENTS, [])

    Log.debug((f"{const.SB_BUNDLE_ID}: {bundle_id}, {const.SB_NODE_NAME}: {node_name}, "
               f" {const.SB_COMMENT}: {comment}, {const.SB_COMPONENTS}: {components},"
               f" {const.SOS_COMP}"))
    # Read commands.yaml and check whether it exists.
    cmd_setup_file = os.path.join(Conf.get("cortx_conf", "install_path"),
                                  "cortx/utils/conf/support_bundle.yaml")
    support_bundle_config = Yaml(cmd_setup_file).load()
    if not support_bundle_config:
        ComponentsBundle._publish_log(f"No such file {cmd_setup_file}", ERROR,
                                      bundle_id, node_name, comment)
        return None

    # Path location for creating the Support Bundle.
    path = os.path.join(Conf.get("cortx_conf", "support>support_bundle_path"))
    if os.path.isdir(path):
        try:
            shutil.rmtree(path)
        except PermissionError:
            Log.warn(f"Incorrect permissions for path:{path}")
    bundle_path = os.path.join(path, bundle_id)
    os.makedirs(bundle_path)

    # Start execution of each component command.
    threads = []
    command_files_info = support_bundle_config.get("COMPONENTS")
    # OS logs are generated separately, so even when "all" is selected the
    # OS component is skipped here.
    if components:
        if "all" not in components:
            components_list = list(
                set(command_files_info.keys()).intersection(set(components)))
        else:
            components_list = list(command_files_info.keys())
            components_list.remove(const.SOS_COMP)
    Log.debug(f"Generating for {const.SB_COMPONENTS} {' '.join(components_list)}")
    for each_component in components_list:
        components_commands = []
        components_files = command_files_info[each_component]
        for file_path in components_files:
            file_data = Yaml(file_path).load()
            if file_data:
                components_commands = file_data.get(const.SUPPORT_BUNDLE.lower(), [])
            if components_commands:
                thread_obj = threading.Thread(
                    ComponentsBundle._exc_components_cmd(
                        components_commands, bundle_id, f"{bundle_path}{os.sep}",
                        each_component, node_name, comment))
                thread_obj.start()
                Log.debug(f"Started thread -> {thread_obj.ident} Component -> {each_component}")
                threads.append(thread_obj)
    directory_path = Conf.get("cortx_conf", "support>support_bundle_path")
    tar_file_name = os.path.join(directory_path, f"{bundle_id}_{node_name}.tar.gz")

    ComponentsBundle._create_summary_file(bundle_id, node_name, comment, bundle_path)

    symlink_path = const.SYMLINK_PATH
    if os.path.exists(symlink_path):
        try:
            shutil.rmtree(symlink_path)
        except PermissionError:
            Log.warn(const.PERMISSION_ERROR_MSG.format(path=symlink_path))
    os.makedirs(symlink_path, exist_ok=True)

    # Wait until all the threads have completed execution.
    for each_thread in threads:
        Log.debug(f"Waiting for thread - {each_thread.ident} to complete process")
        each_thread.join(timeout=1800)
    try:
        Log.debug(f"Generating tar.gz file on path {tar_file_name} from {bundle_path}")
        Tar(tar_file_name).dump([bundle_path])
    except Exception as e:
        ComponentsBundle._publish_log(f"Could not generate tar file {e}", ERROR,
                                      bundle_id, node_name, comment)
        return None
    try:
        Log.debug("Create soft-link for generated tar.")
        os.symlink(tar_file_name,
                   os.path.join(symlink_path, f"{const.SUPPORT_BUNDLE}.{bundle_id}"))
        ComponentsBundle._publish_log(f"Tar file linked at location - {symlink_path}",
                                      INFO, bundle_id, node_name, comment)
    except Exception as e:
        ComponentsBundle._publish_log(f"Linking failed {e}", ERROR, bundle_id,
                                      node_name, comment)
    finally:
        if os.path.isdir(bundle_path):
            shutil.rmtree(bundle_path)
    msg = "Support bundle generation completed."
    ComponentsBundle._publish_log(msg, INFO, bundle_id, node_name, comment)
def stop(self) -> dict:
    """
    Stop cluster and all services. It is a blocking call.

    Returns:
        ([dict]): Return dictionary. {"status": "", "msg":""}
            status: Succeeded, Failed, InProgress
    """
    status: str = ""
    if not self._is_pcs_cluster_running():
        raise ClusterManagerError("Cluster not running on current node. "
                                  "To stop cluster, it should be running on the current node.")
    node_group: list = self._get_node_group()
    local_node: str = ConfigManager.get_local_node()
    Log.info(f"Node group for cluster start {node_group}, local node {local_node}")
    self_group: list = list(filter(lambda group: (local_node in group), node_group))[0]
    node_group.remove(self_group)
    offline_nodes = self._get_filtered_nodes([NODE_STATUSES.POWEROFF.value])
    # Stop cluster for the other groups
    for node_subgroup in node_group:
        for nodeid in node_subgroup:
            # An offline node cannot be started without stonith.
            if nodeid not in offline_nodes:
                if self.heal_resource(nodeid):
                    time.sleep(const.BASE_WAIT_TIME)
                res = json.loads(self._controllers[const.NODE_CONTROLLER].stop(nodeid))
                Log.info(f"Stopping node {nodeid}, output {res}")
                if NODE_STATUSES.POWEROFF.value in res.get("msg"):
                    offline_nodes.append(nodeid)
                    Log.warn(f"Node {nodeid}, is in offline or lost from network.")
                elif res.get("status") == const.STATUSES.FAILED.value:
                    raise ClusterManagerError(f"Cluster Stop failed. Unable to stop {nodeid}")
                else:
                    Log.info(f"Node {nodeid} stop is in progress.")
            else:
                Log.info(f"Node {nodeid}, is in offline or lost from network.")
        # Wait till resources get stopped.
        Log.info(f"Waiting, for {node_subgroup} to stop is in progress.")
    # Stop self group of cluster
    try:
        Log.info(f"Please wait, trying to stop self node group: {self_group}")
        timeout = const.NODE_STOP_TIMEOUT * len(self_group)
        self._execute.run_cmd(const.PCS_STOP_CLUSTER.replace("<seconds>", str(timeout)))
        Log.info("Cluster stop completed.")
    except Exception as e:
        raise ClusterManagerError(f"Cluster stop failed. Error: {e}")
    status = "Cluster stop is in progress."
    if len(offline_nodes) != 0:
        status += f" Warning, Found {offline_nodes}, may be poweroff or not in network"
    return {"status": const.STATUSES.IN_PROGRESS.value, "msg": status}
def start(self, node_id: str, **op_kwargs) -> dict:
    """
    Start node with the given node_id.

    Args:
        node_id (str): Node ID from cluster nodes.

    Returns:
        ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
            status: Succeeded, Failed, InProgress
    """
    try:
        # Get the node_name (pvtfqdn) from node_id
        node_name = ConfigManager.get_node_name(node_id=node_id)
        self._is_node_in_cluster(node_id=node_name)
        node_status = self.nodes_status([node_name])[node_name]
        Log.debug(f"Node {node_name} cluster status is {node_status}")
        node_health = self._system_health.get_node_status(node_id=node_id).get("status")
        Log.debug(f"Node {node_name} health is {node_health}")

        if node_status == NODE_STATUSES.ONLINE.value and node_health == HEALTH_STATUSES.ONLINE.value:
            Log.debug(f"Node {node_name} is already online")
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "output": NODE_STATUSES.ONLINE.value,
                "error": ""
            }
        elif node_status == NODE_STATUSES.STANDBY.value or node_status == NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value:
            # Unstandby the node
            if self.heal_resource(node_name):
                _output, _err, _rc = self._execute.run_cmd(
                    const.PCS_NODE_UNSTANDBY.replace("<node>", node_name),
                    check_error=False)
                if _rc != 0:
                    Log.error(f"Failed to start node {node_name}, Error: {_err}")
                    return {
                        "status": const.STATUSES.FAILED.value,
                        "output": "",
                        "error": f"Failed to start node {node_id}, Error: {_err}"
                    }
                Log.debug(f"Node {node_name} was in standby mode, unstandby operation started successfully")
            else:
                Log.error(f"Node {node_name} is in standby mode : Resource failcount found on the node, cleanup did not work")
                return {
                    "status": const.STATUSES.FAILED.value,
                    "output": "",
                    "error": f"Node {node_id} is in standby mode, resource failcount found on the node, cleanup did not work"
                }
        elif node_status == NODE_STATUSES.CLUSTER_OFFLINE.value:
            _output, _err, _rc = self._execute.run_cmd(
                const.PCS_NODE_START.replace("<node>", node_name),
                check_error=False)
            if _rc != 0:
                Log.error(f"Failed to start node {node_name}, Error: {_err}")
                return {
                    "status": const.STATUSES.FAILED.value,
                    "output": "",
                    "error": f"Failed to start node {node_id}, Error: {_err}"
                }
            Log.debug(f"Node {node_name} started successfully. Waiting for cluster to stabilize and then get the node status")
            time.sleep(const.BASE_WAIT_TIME * 2)
            # Get the status of the node again
            node_status = self.nodes_status([node_name])[node_name]
            # If the node is in standby mode, unstandby here
            if node_status == NODE_STATUSES.STANDBY.value:
                Log.warn(f'Node {node_name} is still in standby mode')
                _output, _err, _rc = self._execute.run_cmd(
                    const.PCS_NODE_UNSTANDBY.replace("<node>", node_name),
                    check_error=False)
                if _rc != 0:
                    Log.error(f"Failed to start node {node_name}, Error: {_err}")
                    return {
                        "status": const.STATUSES.FAILED.value,
                        "output": "",
                        "error": f"Failed to start node {node_id}, Error: {_err}"
                    }
        else:
            Log.error(f"{node_name} status is {node_status}, node cannot be started.")
            return {
                "status": const.STATUSES.FAILED.value,
                "output": "",
                "error": f"Node {node_id} status is {node_status}, node cannot be started."
            }

        # TODO: Update the storage enclosure status in system health.
        # Update the node status in system health
        self._update_health(const.COMPONENTS.NODE.value, node_id, HEALTH_EVENTS.FAULT_RESOLVED.value)
        return {
            "status": const.STATUSES.SUCCEEDED.value,
            "output": NODE_STATUSES.ONLINE.value,
            "error": ""
        }
    except Exception as e:
        Log.error(f"Failed to start node {node_id}")
        raise ClusterManagerError(f"Failed to start node {node_id}, Error {e}")
def stop(self, sync=False, timeout=30) -> dict:
    """
    Stop cluster and all services.

    Args:
        sync (bool, optional): if sync is True, stop checks the status for timeout seconds.
        timeout (int, optional): timeout (in seconds) can be specified for sync=True, otherwise ignored.

    Returns:
        ([dict]): Return dictionary. {"status": "", "output":"", "error":""}
            status: Succeeded, Failed, InProgress
    """
    status: str = ""
    if not self._is_pcs_cluster_running():
        raise ClusterManagerError("Cluster not running on current node. "
                                  "To stop cluster, it should be running on the current node.")
    node_group: list = self._get_node_group()
    local_node: str = ConfigManager.get_local_node()
    Log.info(f"Node group for cluster start {node_group}, local node {local_node}")
    self_group: list = list(filter(lambda group: (local_node in group), node_group))[0]
    node_group.remove(self_group)
    offline_nodes = self._get_filtered_nodes([NODE_STATUSES.POWEROFF.value])
    # Stop cluster for the other groups
    for node_subgroup in node_group:
        for node_name in node_subgroup:
            # An offline node cannot be started without stonith.
            if node_name not in offline_nodes:
                if self.heal_resource(node_name):
                    time.sleep(const.BASE_WAIT_TIME)
                node_id = ConfigManager.get_node_id(node_name)
                res = json.loads(self._controllers[const.NODE_CONTROLLER].stop(node_id))
                Log.info(f"Stopping node {node_id}, output {res}")
                if NODE_STATUSES.POWEROFF.value in res.get("output"):
                    offline_nodes.append(node_id)
                    Log.warn(f"Node {node_id}, is in offline or lost from network.")
                elif res.get("status") == const.STATUSES.FAILED.value:
                    raise ClusterManagerError(f"Cluster Stop failed. Unable to stop {node_id}")
                else:
                    Log.info(f"Node {node_id} stop is in progress.")
            else:
                Log.info(f"Node {node_name}, is in offline or lost from network.")
        # Wait till resources get stopped.
        Log.info(f"Waiting, for {node_subgroup} to stop is in progress.")
    # Stop self group of cluster
    try:
        Log.info(f"Please wait, trying to stop self node group: {self_group}")
        timeout = const.NODE_STOP_TIMEOUT * len(self_group)
        self._execute.run_cmd(const.PCS_STOP_CLUSTER.replace("<seconds>", str(timeout)))
        Log.info("Cluster stop completed.")
    except Exception as e:
        raise ClusterManagerError(f"Cluster stop failed. Error: {e}")
    status = "Cluster stop is in progress."
    if len(offline_nodes) != 0:
        status += f" Warning, Found {offline_nodes}, may be poweroff or not in network"
    if sync:
        timeout = timeout - const.BASE_WAIT_TIME * const.PCS_NODE_GROUP_SIZE * len(node_group)
        in_expected_state = self._verify_expected_cluster_status(const.CLUSTER_STATUS.OFFLINE, timeout)
        if in_expected_state:
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "output": "Cluster is offline.",
                "error": ""
            }
        else:
            return {
                "status": const.STATUSES.FAILED.value,
                "output": "Retry suggested.",
                "error": "Operation timed out."
            }
    return {
        "status": const.STATUSES.IN_PROGRESS.value,
        "output": status,
        "error": ""
    }