Example #1
    def setUp(self):
        self.event_manager = EventManager.get_instance()
        self.component = SUBSCRIPTION_LIST.TEST
        self.event = SubscribeEvent(RESOURCE_TYPES.NODE, [
            RESOURCE_STATUS.FAILED, RESOURCE_STATUS.ONLINE,
            RESOURCE_STATUS.DEGRADED, RESOURCE_STATUS.OFFLINE
        ])
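A minimal companion test method for this fixture is sketched below, assuming the class is a unittest.TestCase and that subscribe() returns the message type the subscriber should listen on (as it does in Example #5); the test name and assertion are illustrative only.

    def test_subscribe_returns_message_type(self):
        # Subscribe the test component to the node-status events prepared in
        # setUp() and check that a message type comes back (assumption: the
        # return value mirrors the one used in Example #5).
        message_type = self.event_manager.subscribe(self.component, [self.event])
        self.assertIsNotNone(message_type)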
Example #2
    def _create_listener(self) -> EventListener:
        host = self.cns.get_local_nodename()
        group = f'hare_{host}'
        # group_id identifies the Kafka consumer group.
        #
        # Here we make sure that different hax instances use different groups.
        # That means that acks issued by one hax instance will not affect
        # the messages that another hax instance receives (so every hax reads
        # the whole history of messages even if the instances process the
        # messages at different speeds).
        return EventListener(
            [SubscribeEvent('node', ['offline', 'online', 'failed'])],
            group_id=group)
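The comment above is the design point: consumer groups are derived from the host name, so every hax instance keeps its own read position. A short sketch of what that naming implies, reusing only the EventListener signature shown in this example (the node names are hypothetical):

# Illustrative only: two hax instances on different nodes end up in distinct
# Kafka consumer groups, so each one consumes the full event stream and is
# unaffected by the other's acks.
for host in ('srvnode-1', 'srvnode-2'):  # hypothetical node names
    listener = EventListener(
        [SubscribeEvent('node', ['offline', 'online', 'failed'])],
        group_id=f'hare_{host}')  # 'hare_srvnode-1', 'hare_srvnode-2'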
Example #3
def main():
    component = "hare"
    resource_type = "node"
    state = "offline"
    # import pudb.remote
    # pudb.remote.set_trace(term_size=(130, 40), port=9998)

    # Before submitting a fake event, we need to register the component
    # (just to make sure that the message will be sent)
    EventManager.get_instance().subscribe(
        component, [SubscribeEvent(resource_type, [state])])
    handler = NodeActionHandler()

    event = HealthEvent("event_id", HEALTH_STATUSES.OFFLINE.value, "severity",
                        "1", "1", "e766bd52-c19c-45b6-9c91-663fd8203c2e",
                        "storage-set-1", "localhost", "srvnode-1.mgmt.public",
                        "node", "16215009572", "iem", "Description")
    handler.publish_event(event)
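The same steps could be folded into a small helper; this is a sketch only, reusing just the calls from this example (the function name and signature are assumptions):

def publish_fake_event(component: str, resource_type: str, state: str,
                       event: HealthEvent):
    # Hypothetical helper: register the component first so the message will
    # actually be sent, then publish the fake event through the handler.
    EventManager.get_instance().subscribe(
        component, [SubscribeEvent(resource_type, [state])])
    NodeActionHandler().publish_event(event)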
Example #4
    def process(self):
        """
        Process config command.
        """
        try:
            # Get log path from cluster.conf.
            log_path = Conf.get(
                self._index, f'cortx{_DELIM}common{_DELIM}storage{_DELIM}log')
            machine_id = Conf.machine_id
            ha_log_path = os.path.join(log_path, f'ha/{machine_id}')

            consul_endpoints = Conf.get(
                self._index,
                f'cortx{_DELIM}external{_DELIM}consul{_DELIM}endpoints')
            #========================================================#
            # consul Service endpoints from cluster.conf             #
            #____________________ cluster.conf ______________________#
            # endpoints:                                             #
            # - tcp://consul-server.default.svc.cluster.local:8301   #
            # - http://consul-server.default.svc.cluster.local:8500  #
            #========================================================#
            # Search for a supported consul endpoint URL in the list of
            # configured consul endpoints.
            filtered_consul_endpoints = list(
                filter(
                    lambda x: isinstance(x, str) and urlparse(x).scheme ==
                    const.consul_scheme, consul_endpoints))
            if not filtered_consul_endpoints:
                sys.stderr.write(
                    f'Failed to get consul config. consul_config: {filtered_consul_endpoints}. \n'
                )
                sys.exit(1)
            # Discussed and confirmed: select the first http endpoint.
            consul_endpoint = filtered_consul_endpoints[0]

            kafka_endpoint = Conf.get(
                self._index,
                f'cortx{_DELIM}external{_DELIM}kafka{_DELIM}endpoints')
            if not kafka_endpoint:
                sys.stderr.write(
                    f'Failed to get kafka config. kafka_config: {kafka_endpoint}. \n'
                )
                sys.exit(1)

            health_comm_msg_type = FAULT_TOLERANCE_KEYS.MONITOR_HA_MESSAGE_TYPE.value

            conf_file_dict = {
                'LOG': {
                    'path': ha_log_path,
                    'level': const.HA_LOG_LEVEL
                },
                'consul_config': {
                    'endpoint': consul_endpoint
                },
                'kafka_config': {
                    'endpoints': kafka_endpoint
                },
                'event_topic': 'hare',
                'MONITOR': {
                    'message_type': health_comm_msg_type,
                    'producer_id': 'cluster_monitor'
                },
                'EVENT_MANAGER': {
                    'message_type': 'health_events',
                    'producer_id': 'system_health',
                    'consumer_group': 'health_monitor',
                    'consumer_id': '1'
                },
                'FAULT_TOLERANCE': {
                    'message_type': health_comm_msg_type,
                    'consumer_group': 'event_listener',
                    'consumer_id': '1'
                },
                'CLUSTER_STOP_MON': {
                    'message_type': 'cluster_stop',
                    'consumer_group': 'cluster_mon',
                    'consumer_id': '2'
                },
                'CLUSTER': {
                    'resource_type': ['node', 'disk', 'cvg', 'cluster']
                },
                'SYSTEM_HEALTH': {
                    'num_entity_health_events': 2
                }
            }

            if not os.path.isdir(const.CONFIG_DIR):
                os.mkdir(const.CONFIG_DIR)

            # Open config file and dump yaml data from conf_file_dict
            with open(const.HA_CONFIG_FILE, 'w+') as conf_file:
                yaml.dump(conf_file_dict, conf_file, default_flow_style=False)

            Cmd.copy_file(const.SOURCE_HEALTH_HIERARCHY_FILE,
                          const.HEALTH_HIERARCHY_FILE)
            # First populate ha.conf and then call init, because during init
            # this file is stored in the confstore as key-value pairs.
            ConfigManager.init("ha_setup")

            # Inside cluster.conf, cluster_id is present under
            # node.<actual pod machine id>.cluster_id. The confstore gets the
            # same key once cluster.conf is loaded, so machine_id is needed to
            # read the cluster_id field from the confstore.
            self._cluster_id = Conf.get(
                self._index, f'node{_DELIM}{machine_id}{_DELIM}cluster_id')
            # site_id = Conf.get(self._index, f'node{_DELIM}{machine_id}{_DELIM}site_id')
            self._site_id = NOT_DEFINED
            # rack_id = Conf.get(self._index, f'node{_DELIM}{machine_id}{_DELIM}rack_id')
            self._rack_id = NOT_DEFINED
            self._storageset_id = NOT_DEFINED
            conf_file_dict.update({
                'COMMON_CONFIG': {
                    'cluster_id': self._cluster_id,
                    'rack_id': self._rack_id,
                    'site_id': self._site_id
                }
            })
            # TODO: Verify whether this newly added config is available in the
            # confstore or not.
            with open(const.HA_CONFIG_FILE, 'w+') as conf_file:
                yaml.dump(conf_file_dict, conf_file, default_flow_style=False)
            self._confstore = ConfigManager.get_confstore()

            Log.info(
                f'Populating the ha config file with consul_endpoint: {consul_endpoint}'
            )

            Log.info('Performing event_manager subscription')
            event_manager = EventManager.get_instance()
            event_manager.subscribe(
                const.EVENT_COMPONENT,
                [SubscribeEvent(const.POD_EVENT, ["online", "failed"])])
            Log.info(f'event_manager subscription for {const.EVENT_COMPONENT} '
                     f'is successful for the event {const.POD_EVENT}')
            event_manager.subscribe(
                const.EVENT_COMPONENT,
                [SubscribeEvent(const.DISK_EVENT, ["online", "failed"])])
            Log.info(f'event_manager subscription for {const.EVENT_COMPONENT} '
                     f'is successful for the event {const.DISK_EVENT}')

            Log.info('Creating cluster cardinality')
            self._confStoreAPI = ConftStoreSearch()
            self._confStoreAPI.set_cluster_cardinality(self._index)

            # Init cluster,site,rack health
            self._add_cluster_component_health()
            # Init node health
            self._add_node_health()
            # Init cvg and disk health
            self._add_cvg_and_disk_health()

            Log.info("config command is successful")
            sys.stdout.write("config command is successful.\n")
        except TypeError as type_err:
            sys.stderr.write(
                f'HA config command failed: Type mismatch: {type_err}.\n')
        except yaml.YAMLError as exc:
            sys.stderr.write(
                f'Ha config failed. Invalid yaml configuration: {exc}.\n')
        except OSError as os_err:
            sys.stderr.write(f'HA Config failed. OS_error: {os_err}.\n')
        except Exception as c_err:
            sys.stderr.write(f'HA config command failed: {c_err}.\n')
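Because process() serializes conf_file_dict with yaml.dump, the resulting file is plain YAML that any component can read back; a minimal read-back sketch, assuming only the keys defined in conf_file_dict and the const.HA_CONFIG_FILE path used above:

import yaml

# Load the generated HA config and reach a few of the sections written above.
with open(const.HA_CONFIG_FILE) as conf_file:
    ha_conf = yaml.safe_load(conf_file)

consul_endpoint = ha_conf['consul_config']['endpoint']
kafka_endpoints = ha_conf['kafka_config']['endpoints']
log_path = ha_conf['LOG']['path']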
    print("MESSAGE: ", message)
    global MSG
    MSG = True
    return CONSUMER_STATUS.SUCCESS_STOP


if __name__ == '__main__':
    try:
        print("********Event Publisher********")
        event_manager = EventManager.get_instance()
        k8s_filter = K8SFilter()
        component = "hare"
        resource_type = K8S_ALERT_RESOURCE_TYPE.RESOURCE_TYPE_POD.value
        state = K8S_ALERT_STATUS.STATUS_FAILED.value
        message_type = event_manager.subscribe(
            'hare', [SubscribeEvent(resource_type, [state])])
        print(f"Subscribed {component}, message type is {message_type}")
        k8s_event = K8SAlert("cortx", "node2", "cortx-data123",
                             K8S_ALERT_STATUS.STATUS_FAILED.value,
                             K8S_ALERT_RESOURCE_TYPE.RESOURCE_TYPE_POD.value,
                             "16215909572")

        timestamp = str(int(time.time()))
        event_id = timestamp + str(uuid.uuid4().hex)
        event_type = k8s_event.status
        if k8s_filter.filter_event(json.dumps(k8s_event.__dict__)):
            health_event = HealthEvent(event_id, event_type,
                                       EVENT_SEVERITIES.CRITICAL.value, "1",
                                       "1", "1", "1", "srvnode_1", "srvnode_1",
                                       "pod", "16215909572", "cortx-data-pod",
                                       {"namespace": "cortx"})