def alert_job_status(curr_value, msg, integration_id=None, cluster_name=None):
    """Publish a job-status alert to the alerting channel.

    :param curr_value: job status string; ``"failed"`` (any case) maps to
        WARNING severity, everything else to INFO.
    :param msg: human readable alert message.
    :param integration_id: optional override; falls back to
        ``NS.tendrl_context.integration_id``.
    :param cluster_name: optional override; falls back to
        ``NS.tendrl_context.cluster_name``.
    """
    # Without a local node identity nothing is ever logged; bail out
    # first instead of building the whole payload and then discarding it
    # (the original performed this check only at the end).
    if not NS.node_context.node_id:
        return
    severity = "WARNING" if curr_value.lower() == "failed" else "INFO"
    alert = {
        'source': NS.publisher_id,
        'classification': 'cluster',
        'pid': os.getpid(),
        'time_stamp': tendrl_now().isoformat(),
        'alert_type': 'STATUS',
        'severity': severity,
        'resource': 'job_status',
        'current_value': curr_value,
        'tags': dict(
            message=msg,
            integration_id=integration_id or
            NS.tendrl_context.integration_id,
            cluster_name=cluster_name or NS.tendrl_context.cluster_name,
            sds_name=NS.tendrl_context.sds_name,
            fqdn=NS.node_context.fqdn
        ),
        'node_id': NS.node_context.node_id,
    }
    logger.log(
        "notice",
        "alerting",
        {'message': json.dumps(alert)}
    )
def format_alert(self, alert_json):
    """Convert a grafana cpu-utilization notification into a tendrl alert.

    :param alert_json: dict posted by grafana; must carry 'State' and
        'Name' plus whatever ``parse_alert_metrics`` consumes.
    :returns: populated tendrl alert dict, or ``None`` when conversion
        fails (the failure is reported via ``ExceptionMessage``).
    """
    alert = self.parse_alert_metrics(alert_json)
    try:
        alert["alert_id"] = None
        alert["node_id"] = utils.find_node_id(
            alert['tags']['integration_id'],
            alert['tags']['fqdn'])
        alert["time_stamp"] = tendrl_now().isoformat()
        alert["resource"] = self.representive_name
        alert['alert_type'] = constants.ALERT_TYPE
        alert['significance'] = constants.SIGNIFICANCE_HIGH
        alert['pid'] = utils.find_grafana_pid()
        alert['source'] = constants.ALERT_SOURCE
        # NOTE: removed the original no-op self-assignment
        # ``alert['tags']['fqdn'] = alert['tags']['fqdn']``.
        if alert_json['State'] == constants.GRAFANA_ALERT:
            # The grafana panel name encodes which threshold fired.
            if "critical" in alert_json['Name'].lower():
                alert['severity'] = \
                    constants.TENDRL_SEVERITY_MAP['critical']
            else:
                alert['severity'] = \
                    constants.TENDRL_SEVERITY_MAP['warning']
            alert['tags']['message'] = (
                "Cpu utilization on node %s in %s"
                " at %s %% and running out of cpu" % (
                    alert['tags']['fqdn'],
                    alert['tags']['cluster_short_name'],
                    alert['current_value']))
        elif alert_json['State'] == constants.GRAFANA_CLEAR_ALERT:
            # Identifying clear alert from which panel critical/warning
            if "critical" in alert_json['Name'].lower():
                alert['tags']['clear_alert'] = \
                    constants.TENDRL_SEVERITY_MAP['critical']
            elif "warning" in alert_json['Name'].lower():
                alert['tags']['clear_alert'] = \
                    constants.TENDRL_SEVERITY_MAP['warning']
            alert['severity'] = constants.TENDRL_SEVERITY_MAP['info']
            alert['tags']['message'] = (
                "Cpu utilization on node %s in"
                " %s back to normal" % (
                    alert['tags']['fqdn'],
                    alert['tags']['cluster_short_name']))
        else:
            logger.log(
                "error",
                NS.publisher_id,
                {
                    "message": "Unsupported alert %s "
                               "severity" % alert_json
                })
            raise InvalidAlertSeverity
        return alert
    except (KeyError, CalledProcessError, EtcdKeyNotFound,
            NodeNotFound, InvalidAlertSeverity) as ex:
        Event(
            ExceptionMessage(
                "debug",
                NS.publisher_id,
                {
                    # FIX: original message lacked the space between
                    # "grafana" and "alert".
                    "message": "Error in converting grafana "
                               "alert into tendrl alert %s" % alert_json,
                    "exception": ex
                }))
def get_node_status(self, node_id):
    """Classify a node as up, down or not monitored from its heartbeat.

    A node is UP when its last heartbeat is less than 5 seconds old,
    DOWN when it is older, and NOT_MONITORED when no heartbeat record
    exists at all.
    """
    heartbeat = central_store_util.get_node_last_seen_at(node_id)
    if not heartbeat:
        # No last_seen_at record: the node was never monitored.
        return pm_consts.STATUS_NOT_MONITORED
    # Drop the trailing 6-character offset suffix (assumes an isoformat
    # "+HH:MM" style tail -- TODO confirm), parse the naive timestamp and
    # pin it to UTC so it is comparable with tendrl_now().
    seen_at = datetime.datetime.strptime(
        heartbeat[:-6], "%Y-%m-%dT%H:%M:%S.%f"
    ).replace(tzinfo=utc)
    age_seconds = (tendrl_now() - seen_at).total_seconds()
    return pm_consts.STATUS_UP if age_seconds < 5 else pm_consts.STATUS_DOWN
def emit_event(resource, curr_value, msg, instance, severity,
               alert_notify=False, tags=None, integration_id=None,
               cluster_name=None, sds_name=None, node_id=None):
    """Publish a STATUS alert for *resource* to the alerting channel.

    :param resource: resource name, also used as the alert condition
        status in the payload.
    :param curr_value: current value of the monitored resource.
    :param msg: human readable alert message.
    :param instance: plugin instance identifier stored in the tags.
    :param severity: alert severity; ``"INFO"`` marks the alert
        condition as unset (i.e. a clearing alert).
    :param alert_notify: forwarded in the payload when truthy.
    :param tags: optional dict; when ``entity_type`` is a brick or
        volume entity, brick/volume details are copied into the alert
        tags. (FIX: was a shared mutable default ``{}``.)
    :param integration_id/cluster_name: optional overrides, falling back
        to ``NS.tendrl_context``.
    :param sds_name: accepted for signature compatibility; not used here.
    :param node_id: node the alert is attributed to.
    """
    tags = tags if tags is not None else {}
    alert = {}
    alert['source'] = NS.publisher_id
    alert['node_id'] = node_id
    alert['pid'] = os.getpid()
    alert['time_stamp'] = tendrl_now().isoformat()
    alert['alert_type'] = 'STATUS'
    alert['severity'] = severity
    alert['resource'] = resource
    alert['current_value'] = curr_value
    alert['tags'] = dict(
        plugin_instance=instance,
        message=msg,
        integration_id=integration_id or NS.tendrl_context.integration_id,
        cluster_name=cluster_name or NS.tendrl_context.cluster_name
    )
    if "entity_type" in tags:
        if tags["entity_type"] == BRICK_ENTITY:
            # Brick alerts carry the owning node and volume.
            alert['node_id'] = tags.get(
                "node_id", NS.node_context.node_id
            )
            alert['tags']['fqdn'] = tags.get(
                "fqdn", NS.node_context.fqdn
            )
            alert['tags']['volume_name'] = tags.get(
                'volume_name', None
            )
        elif tags["entity_type"] == VOLUME_ENTITY:
            alert['tags']['volume_name'] = tags.get(
                'volume_name', None
            )
    payload = {'message': json.dumps(alert)}
    payload['alert_condition_state'] = severity
    payload['alert_condition_status'] = resource
    if alert_notify:
        payload['alert_notify'] = alert_notify
    # INFO severity means the condition cleared.
    if severity == "INFO":
        payload['alert_condition_unset'] = True
    else:
        payload['alert_condition_unset'] = False
    logger.log(
        "notice",
        "alerting",
        payload,
        integration_id=integration_id
    )
def emit_event(resource, curr_value, msg, instance, severity,
               alert_notify=False, tags=None, integration_id=None,
               cluster_name=None, sds_name=None, node_id=None):
    """Publish a STATUS alert for *resource* to the alerting channel.

    :param resource: resource name, also used as the alert condition
        status in the payload.
    :param curr_value: current value of the monitored resource.
    :param msg: human readable alert message.
    :param instance: plugin instance identifier stored in the tags.
    :param severity: alert severity; ``"INFO"`` marks the alert
        condition as unset (i.e. a clearing alert).
    :param alert_notify: forwarded in the payload when truthy.
    :param tags: optional dict; when ``entity_type`` is a brick or
        volume entity, brick/volume details are copied into the alert
        tags. (FIX: was a shared mutable default ``{}``.)
    :param integration_id/cluster_name/sds_name: optional overrides,
        falling back to ``NS.tendrl_context``.
    :param node_id: node the alert is attributed to.
    """
    tags = tags if tags is not None else {}
    alert = {}
    alert['source'] = NS.publisher_id
    alert['node_id'] = node_id
    alert['pid'] = os.getpid()
    alert['time_stamp'] = tendrl_now().isoformat()
    alert['alert_type'] = 'STATUS'
    alert['severity'] = severity
    alert['resource'] = resource
    alert['current_value'] = curr_value
    alert['tags'] = dict(
        plugin_instance=instance,
        message=msg,
        integration_id=integration_id or NS.tendrl_context.integration_id,
        cluster_name=cluster_name or NS.tendrl_context.cluster_name,
        sds_name=sds_name or NS.tendrl_context.sds_name,
    )
    if "entity_type" in tags:
        if tags["entity_type"] == BRICK_ENTITY:
            # Brick alerts carry the owning node and volume.
            alert['node_id'] = tags.get("node_id", NS.node_context.node_id)
            alert['tags']['fqdn'] = tags.get("fqdn", NS.node_context.fqdn)
            alert['tags']['volume_name'] = tags.get('volume_name', None)
        elif tags["entity_type"] == VOLUME_ENTITY:
            alert['tags']['volume_name'] = tags.get('volume_name', None)
    payload = {'message': json.dumps(alert)}
    payload['alert_condition_state'] = severity
    payload['alert_condition_status'] = resource
    if alert_notify:
        payload['alert_notify'] = alert_notify
    # INFO severity means the condition cleared.
    if severity == "INFO":
        payload['alert_condition_unset'] = True
    else:
        payload['alert_condition_unset'] = False
    logger.log("notice", "alerting", payload)
def _emit_event(self, resource, curr_value, msg, instance):
    """Emit a status alert for *resource* via the event bus.

    :param resource: name of the monitored resource.
    :param curr_value: current status; ``"stopped"`` (any case) maps to
        CRITICAL severity, everything else to INFO.
    :param msg: human readable alert message.
    :param instance: plugin instance identifier stored in the tags.
    """
    # No local node identity means the alert would never be sent; return
    # before building the payload (the original checked only at the end).
    if not NS.node_context.node_id:
        return
    severity = "CRITICAL" if curr_value.lower() == "stopped" else "INFO"
    alert = {
        'source': NS.publisher_id,
        'pid': os.getpid(),
        'time_stamp': tendrl_now().isoformat(),
        'alert_type': 'status',
        'severity': severity,
        'resource': resource,
        'current_value': curr_value,
        'tags': dict(
            plugin_instance=instance,
            message=msg,
            cluster_id=NS.tendrl_context.integration_id,
            cluster_name=NS.tendrl_context.cluster_name,
            sds_name=NS.tendrl_context.sds_name,
            fqdn=socket.getfqdn()
        ),
        'node_id': NS.node_context.node_id,
    }
    Event(Message("notice", "alerting", {'message': json.dumps(alert)}))
def run(self): logger.log( "info", NS.publisher_id, {"message": "%s running" % self.__class__.__name__} ) gluster_brick_dir = NS.gluster.objects.GlusterBrickDir() gluster_brick_dir.save() cluster = NS.tendrl.objects.Cluster( integration_id=NS.tendrl_context.integration_id ).load() if cluster.cluster_network in [None, ""]: try: node_networks = NS.tendrl.objects.NodeNetwork().load_all() cluster.cluster_network = node_networks[0].subnet cluster.save() except etcd.EtcdKeyNotFound as ex: logger.log( "error", NS.publisher_id, {"message": "Failed to sync cluster network details"} ) _sleep = 0 while not self._complete.is_set(): # To detect out of band deletes # refresh gluster object inventory at config['sync_interval'] SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100 NS.node_context = NS.node_context.load() NS.tendrl_context = NS.tendrl_context.load() if _sleep > 5: _sleep = int(NS.config.data.get("sync_interval", 10)) else: _sleep += 1 try: _cluster = NS.tendrl.objects.Cluster( integration_id=NS.tendrl_context.integration_id ).load() if (_cluster.status == "importing" and _cluster.current_job['status'] == 'failed') or \ _cluster.status == "unmanaging" or \ _cluster.status == "set_volume_profiling": continue _cnc = NS.tendrl.objects.ClusterNodeContext( node_id=NS.node_context.node_id ).load() _cnc.is_managed = "yes" _cnc.save() subprocess.call( [ 'gluster', 'get-state', 'glusterd', 'odir', '/var/run', 'file', 'glusterd-state', 'detail' ] ) raw_data = ini2json.ini_to_dict( '/var/run/glusterd-state' ) subprocess.call(['rm', '-rf', '/var/run/glusterd-state']) subprocess.call( [ 'gluster', 'get-state', 'glusterd', 'odir', '/var/run', 'file', 'glusterd-state-vol-opts', 'volumeoptions' ] ) raw_data_options = ini2json.ini_to_dict( '/var/run/glusterd-state-vol-opts' ) subprocess.call( [ 'rm', '-rf', '/var/run/glusterd-state-vol-opts' ] ) sync_object = NS.gluster.objects.\ SyncObject(data=json.dumps(raw_data)) sync_object.save() if "Peers" in raw_data: index = 1 peers 
= raw_data["Peers"] disconnected_hosts = [] while True: try: peer = NS.tendrl.\ objects.GlusterPeer( peer_uuid=peers['peer%s.uuid' % index], hostname=peers[ 'peer%s.primary_hostname' % index ], state=peers['peer%s.state' % index], connected=peers['peer%s.connected' % index] ) try: stored_peer_status = None # find peer detail using hostname ip = socket.gethostbyname( peers['peer%s.primary_hostname' % index] ) node_id = etcd_utils.read( "/indexes/ip/%s" % ip ).value stored_peer = NS.tendrl.objects.GlusterPeer( peer_uuid=peers['peer%s.uuid' % index], node_id=node_id ).load() stored_peer_status = stored_peer.connected current_status = peers[ 'peer%s.connected' % index ] if stored_peer_status and \ current_status != stored_peer_status: msg = ( "Peer %s in cluster %s " "is %s" ) % ( peers[ 'peer%s.primary_hostname' % index ], _cluster.short_name, current_status ) instance = "peer_%s" % peers[ 'peer%s.primary_hostname' % index ] event_utils.emit_event( "peer_status", current_status, msg, instance, 'WARNING' if current_status != 'Connected' else 'INFO' ) # save current status in actual peer # directory also stored_peer.connected = current_status stored_peer.save() # Disconnected host name to # raise brick alert if current_status.lower() == \ "disconnected": disconnected_hosts.append( peers[ 'peer%s.primary_hostname' % index ] ) except etcd.EtcdKeyNotFound: pass SYNC_TTL += 5 peer.save(ttl=SYNC_TTL) index += 1 except KeyError: break # Raise an alert for bricks when peer disconnected # or node goes down for disconnected_host in disconnected_hosts: brick_status_alert( disconnected_host ) if "Volumes" in raw_data: index = 1 volumes = raw_data['Volumes'] # instantiating blivet class, this will be used for # getting brick_device_details b = blivet.Blivet() # reset blivet during every sync to get latest information # about storage devices in the machine b.reset() devicetree = b.devicetree total_brick_count = 0 while True: try: b_count = sync_volumes( volumes, index, 
raw_data_options.get('Volume Options'), SYNC_TTL + VOLUME_TTL, _cluster.short_name, devicetree ) index += 1 SYNC_TTL += 1 total_brick_count += b_count - 1 except KeyError: global VOLUME_TTL # from second sync volume ttl is # SYNC_TTL + (no.volumes) * 20 + # (no.of.bricks) * 10 + 160 if index > 1: volume_count = index - 1 # When all nodes are down we are updating all # volumes are down, node status TTL is 160, # So make sure volumes are present in etcd # while raising volume down alert VOLUME_TTL = (volume_count * 20) + ( total_brick_count * 10) + 160 break # populate the volume specific options reg_ex = re.compile("^volume[0-9]+.options+") options = {} for key in volumes.keys(): if reg_ex.match(key): options[key] = volumes[key] for key in options.keys(): volname = key.split('.')[0] vol_id = volumes['%s.id' % volname] dict1 = {} for k, v in options.items(): if k.startswith('%s.options' % volname): dict1['.'.join(k.split(".")[2:])] = v options.pop(k, None) volume = NS.tendrl.objects.GlusterVolume( NS.tendrl_context.integration_id, vol_id=vol_id ).load() if volume.options is not None: dest = dict(volume.options) dest.update(dict1) volume.options = dest volume.save() # Sync cluster global details if "provisioner/%s" % NS.tendrl_context.integration_id \ in NS.node_context.tags: all_volumes = NS.tendrl.objects.GlusterVolume( NS.tendrl_context.integration_id ).load_all() or [] volumes = [] for volume in all_volumes: if not str(volume.deleted).lower() == "true" and \ volume.current_job.get('status', '') \ in ['', 'finished', 'failed'] and \ volume.vol_id not in [None, ''] and \ volume.name not in [None, '']: # only for first sync refresh volume TTL # It will increase TTL based on no.of volumes if _cnc.first_sync_done in [None, "no", ""]: etcd_utils.refresh( volume.value, SYNC_TTL + VOLUME_TTL ) volumes.append(volume) cluster_status.sync_cluster_status( volumes, SYNC_TTL + VOLUME_TTL ) utilization.sync_utilization_details(volumes) 
client_connections.sync_volume_connections(volumes) georep_details.aggregate_session_status() try: evt.process_events() except etcd.EtcdKeyNotFound: pass rebalance_status.sync_volume_rebalance_status(volumes) rebalance_status.sync_volume_rebalance_estimated_time( volumes ) snapshots.sync_volume_snapshots( raw_data['Volumes'], int(NS.config.data.get( "sync_interval", 10 )) + len(volumes) * 4 ) # update alert count update_cluster_alert_count() # check and enable volume profiling if "provisioner/%s" % NS.tendrl_context.integration_id in \ NS.node_context.tags: self._enable_disable_volume_profiling() _cluster = NS.tendrl.objects.Cluster( integration_id=NS.tendrl_context.integration_id ).load() if _cluster.exists(): _cluster = _cluster.load() _cluster.last_sync = str(tendrl_now()) # Mark the first sync done flag _cnc = NS.tendrl.objects.ClusterNodeContext( node_id=NS.node_context.node_id ).load() if _cnc.first_sync_done in [None, "no"]: _cnc.first_sync_done = "yes" _cnc.save() if _cluster.current_job.get( 'status', '' ) in ['', 'finished', 'failed'] and \ _cluster.status in [None, ""]: _cluster.save() except Exception as ex: Event( ExceptionMessage( priority="error", publisher=NS.publisher_id, payload={"message": "gluster sds state sync error", "exception": ex } ) ) try: etcd_utils.read( '/clusters/%s/_sync_now' % NS.tendrl_context.integration_id ) continue except etcd.EtcdKeyNotFound: pass time.sleep(_sleep) logger.log( "debug", NS.publisher_id, {"message": "%s complete" % self.__class__.__name__} )
from tendrl.commons.utils.time_utils import now as tendrl_now tendrl_collectd_severity_map = { 'FAILURE': 'CRITICAL', 'WARNING': 'WARNING', 'OK': 'INFO', 'OKAY': 'INFO' } config = load_config('node-monitoring', '/etc/tendrl/node-monitoring/node-monitoring.conf.yaml') central_store = etcd_client(host=config['etcd_connection'], port=config['etcd_port']) timestamp = tendrl_now().isoformat() if is_collectd_imported: sys.path.append('/usr/lib64/collectd') '''Collectd forks an instance of this plugin per threshold breach detected Read collectd detected threshold breach details from standard input of current fork.''' def get_notification(): collectd_alert = {} is_end_of_dictionary = False for line in sys.stdin: if not line.strip(): is_end_of_dictionary = True continue
def update_last_seen_at():
    """Record this node's heartbeat timestamp in etcd."""
    heartbeat_key = '/monitoring/nodes/%s/last_seen_at' % \
        NS.node_context.node_id
    etcd_utils.write(heartbeat_key, tendrl_now().isoformat())
def run(self):
    """Main gluster state sync loop (older revision).

    Every ``sync_interval`` seconds until ``self._complete`` is set:
    dump glusterd state via ``gluster get-state``, persist peers and
    volumes (with TTL so out-of-band deletes expire), raise peer status
    alerts on transitions, and -- on the provisioner node -- aggregate
    cluster-wide status, utilization, geo-rep, rebalance and snapshot
    details.
    """
    # To detect out of band deletes
    # refresh gluster object inventory at config['sync_interval']
    # Default is 260 seconds
    SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 250
    Event(
        Message(
            priority="info",
            publisher=NS.publisher_id,
            payload={"message": "%s running" % self.__class__.__name__}))
    gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
    gluster_brick_dir.save()
    # One-time, best-effort detection of the cluster network from this
    # node's first network interface.
    try:
        etcd_utils.read("clusters/%s/"
                        "cluster_network" %
                        NS.tendrl_context.integration_id)
    except etcd.EtcdKeyNotFound:
        try:
            node_networks = etcd_utils.read("nodes/%s/Networks" %
                                            NS.node_context.node_id)
            # TODO(team) this logic needs to change later
            # multiple networks supported for gluster use case
            node_network = NS.tendrl.objects.NodeNetwork(
                interface=node_networks.leaves.next().key.split(
                    '/')[-1]).load()
            cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id).load()
            cluster.cluster_network = node_network.subnet
            cluster.save()
        except etcd.EtcdKeyNotFound as ex:
            Event(
                Message(priority="error",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "Failed to sync cluster network"
                                       " details"
                        }))
    _sleep = 0
    while not self._complete.is_set():
        # Ramp up to the configured interval after a few fast iterations.
        if _sleep > 5:
            _sleep = int(NS.config.data.get("sync_interval", 10))
        else:
            _sleep += 1
        try:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id).load()
            if _cluster.import_status == "failed":
                continue
            # Mark sync in progress; CAS write (prevExist=False) so a
            # concurrent writer is tolerated.
            try:
                NS._int.wclient.write("clusters/%s/"
                                      "sync_status" %
                                      NS.tendrl_context.integration_id,
                                      "in_progress",
                                      prevExist=False)
            except (etcd.EtcdAlreadyExist, etcd.EtcdCompareFailed) as ex:
                pass
            # Dump glusterd state (details and volume options) to files,
            # parse them, and remove the temp files afterwards.
            subprocess.call([
                'gluster', 'get-state', 'glusterd', 'odir', '/var/run',
                'file', 'glusterd-state', 'detail'
            ])
            raw_data = ini2json.ini_to_dict('/var/run/glusterd-state')
            subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
            subprocess.call([
                'gluster', 'get-state', 'glusterd', 'odir', '/var/run',
                'file', 'glusterd-state-vol-opts', 'volumeoptions'
            ])
            raw_data_options = ini2json.ini_to_dict(
                '/var/run/glusterd-state-vol-opts')
            subprocess.call(
                ['rm', '-rf', '/var/run/glusterd-state-vol-opts'])
            sync_object = NS.gluster.objects.\
                SyncObject(data=json.dumps(raw_data))
            sync_object.save()
            # Peers are flat keys peer<N>.<field>; iterate until a
            # missing index raises KeyError.
            if "Peers" in raw_data:
                index = 1
                peers = raw_data["Peers"]
                while True:
                    try:
                        peer = NS.gluster.\
                            objects.Peer(
                                peer_uuid=peers['peer%s.uuid' % index],
                                hostname=peers[
                                    'peer%s.primary_hostname' % index
                                ],
                                state=peers['peer%s.state' % index],
                                connected=peers['peer%s.connected' % index]
                            )
                        try:
                            stored_peer_status = NS._int.client.read(
                                "clusters/%s/Peers/%s/connected" %
                                (NS.tendrl_context.integration_id,
                                 peers['peer%s.uuid' % index])).value
                            current_status = peers[
                                'peer%s.connected' % index]
                            # Alert only on a state transition.
                            if stored_peer_status != "" and \
                                current_status != stored_peer_status:
                                msg = (
                                    "Status of peer: %s in cluster %s "
                                    "changed from %s to %s") % (
                                        peers[
                                            'peer%s.primary_hostname' %
                                            index],
                                        NS.tendrl_context.integration_id,
                                        stored_peer_status,
                                        current_status)
                                instance = "peer_%s" % peers[
                                    'peer%s.primary_hostname' % index]
                                event_utils.emit_event(
                                    "peer_status",
                                    current_status,
                                    msg,
                                    instance,
                                    'WARNING' if current_status !=
                                    'Connected' else 'INFO')
                        except etcd.EtcdKeyNotFound:
                            pass
                        peer.save(ttl=SYNC_TTL)
                        index += 1
                    except KeyError:
                        break
            if "Volumes" in raw_data:
                index = 1
                volumes = raw_data['Volumes']
                while True:
                    try:
                        sync_volumes(
                            volumes, index,
                            raw_data_options.get('Volume Options'))
                        index += 1
                    except KeyError:
                        break
                # populate the volume specific options
                reg_ex = re.compile("^volume[0-9]+.options+")
                options = {}
                for key in volumes.keys():
                    if reg_ex.match(key):
                        options[key] = volumes[key]
                for key in options.keys():
                    volname = key.split('.')[0]
                    vol_id = volumes['%s.id' % volname]
                    dict1 = {}
                    for k, v in options.items():
                        if k.startswith('%s.options' % volname):
                            dict1['.'.join(k.split(".")[2:])] = v
                            options.pop(k, None)
                    NS.gluster.objects.VolumeOptions(
                        vol_id=vol_id,
                        options=dict1).save()
            # Sync cluster global details -- only the provisioner node
            # aggregates cluster-wide state.
            if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                all_volumes = NS.gluster.objects.Volume().load_all() or []
                volumes = []
                for volume in all_volumes:
                    if not str(volume.deleted).lower() == "true":
                        volumes.append(volume)
                cluster_status.sync_cluster_status(volumes)
                utilization.sync_utilization_details(volumes)
                client_connections.sync_volume_connections(volumes)
                georep_details.aggregate_session_status()
                evt.process_events()
                rebalance_status.sync_volume_rebalance_status(volumes)
                rebalance_status.sync_volume_rebalance_estimated_time(
                    volumes)
                snapshots.sync_volume_snapshots(
                    raw_data['Volumes'],
                    int(NS.config.data.get(
                        "sync_interval", 10)) + len(volumes) * 10)
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id)
            if _cluster.exists():
                _cluster = _cluster.load()
                _cluster.sync_status = "done"
                _cluster.last_sync = str(tendrl_now())
                _cluster.is_managed = "yes"
                _cluster.save()
            # Initialize alert count
            try:
                alerts_count_key = '/clusters/%s/alert_counters' % (
                    NS.tendrl_context.integration_id)
                etcd_utils.read(alerts_count_key)
            except (etcd.EtcdException) as ex:
                if type(ex) == etcd.EtcdKeyNotFound:
                    ClusterAlertCounters(
                        integration_id=NS.tendrl_context.integration_id
                    ).save()
            # check and enable volume profiling
            if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                self._enable_disable_volume_profiling()
        except Exception as ex:
            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": "gluster sds state sync"
                                                " error",
                                     "exception": ex
                                 }))
        # A _sync_now marker requests an immediate re-sync, so skip the
        # sleep for this iteration.
        try:
            etcd_utils.read('/clusters/%s/_sync_now' %
                            NS.tendrl_context.integration_id)
            continue
        except etcd.EtcdKeyNotFound:
            pass
        time.sleep(_sleep)
    Event(
        Message(
            priority="debug",
            publisher=NS.publisher_id,
            payload={"message": "%s complete" % self.__class__.__name__}))
def format_alert(self, alert_json):
    """Convert a grafana brick-utilization notification into a tendrl alert.

    ``alert_json`` is the dict posted by grafana (expects 'State' and
    'Name' plus the fields consumed by ``parse_alert_metrics``). Returns
    the populated tendrl alert dict, or None when conversion fails (the
    failure is reported via ExceptionMessage, not raised to the caller).
    """
    alert = self.parse_alert_metrics(alert_json)
    try:
        alert["alert_id"] = None
        alert["node_id"] = utils.find_node_id(
            alert['tags']['integration_id'],
            alert['tags']['fqdn'])
        alert["time_stamp"] = tendrl_now().isoformat()
        alert["resource"] = self.representive_name
        alert['alert_type'] = constants.ALERT_TYPE
        alert['significance'] = constants.SIGNIFICANCE_HIGH
        alert['pid'] = utils.find_grafana_pid()
        alert['source'] = constants.ALERT_SOURCE
        alert['tags']['cluster_name'] = utils.find_cluster_name(
            alert['tags']['integration_id'])
        # Grafana encodes '.' in fqdn as '_' and '/' in the brick path
        # with BRICK_PATH_SEPARATOR; undo/normalize both for the lookup.
        alert["tags"]["volume_name"] = utils.find_volume_name(
            alert['tags']['integration_id'],
            alert['tags']['fqdn'].replace('_', '.'),
            alert['tags']['brick_path'].strip(":").replace(
                grafana_constants.BRICK_PATH_SEPARATOR, '_'))
        if alert_json['State'] == constants.GRAFANA_ALERT:
            # Panel name encodes which threshold fired.
            if "critical" in alert_json['Name'].lower():
                alert['severity'] = \
                    constants.TENDRL_SEVERITY_MAP['critical']
            else:
                alert['severity'] = \
                    constants.TENDRL_SEVERITY_MAP['warning']
            # Modify brick path symbol to slash(/) in alert message
            alert['tags']['message'] = (
                "Brick utilization on %s:%s in %s "
                "at %s %% and nearing full capacity" % (
                    alert['tags']['fqdn'],
                    alert['tags']['brick_path'].replace(
                        grafana_constants.BRICK_PATH_SEPARATOR, "/"),
                    alert["tags"]["volume_name"],
                    alert['current_value']))
        elif alert_json['State'] == constants.GRAFANA_CLEAR_ALERT:
            # Identifying clear alert from which panel critical/warning
            if "critical" in alert_json['Name'].lower():
                alert['tags']['clear_alert'] = \
                    constants.TENDRL_SEVERITY_MAP['critical']
            elif "warning" in alert_json['Name'].lower():
                alert['tags']['clear_alert'] = \
                    constants.TENDRL_SEVERITY_MAP['warning']
            alert['severity'] = constants.TENDRL_SEVERITY_MAP['info']
            # Modify brick path symbol to slash(/) in alert message
            alert['tags']['message'] = (
                "Brick utilization of %s:%s in %s "
                "back to normal" % (
                    alert['tags']['fqdn'],
                    alert['tags']['brick_path'].replace(
                        grafana_constants.BRICK_PATH_SEPARATOR, "/"),
                    alert["tags"]["volume_name"]))
        else:
            logger.log(
                "error",
                NS.publisher_id,
                {
                    "message": "Unsupported alert %s "
                               "severity" % alert_json
                })
            raise InvalidAlertSeverity
        return alert
    except (KeyError, CalledProcessError, EtcdKeyNotFound,
            NodeNotFound, InvalidAlertSeverity) as ex:
        Event(
            ExceptionMessage(
                "debug",
                NS.publisher_id,
                {
                    "message": "Error in converting grafana"
                               "alert into tendrl alert %s" % alert_json,
                    "exception": ex
                }))
def run(self):
    """Main gluster state sync loop (lsblk/lvs variant).

    Every ``sync_interval`` seconds until ``self._complete`` is set:
    dump glusterd state via ``gluster get-state``, persist peers and
    volumes with TTLs so out-of-band deletes expire, raise peer/brick
    alerts on connectivity transitions, and -- on the provisioner node
    -- aggregate cluster-wide status, utilization, geo-rep, rebalance,
    snapshot and alert-count details.
    """
    logger.log(
        "info",
        NS.publisher_id,
        {"message": "%s running" % self.__class__.__name__}
    )
    gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
    gluster_brick_dir.save()
    # One-time, best-effort detection of the cluster network from the
    # first known node network.
    cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id
    ).load()
    if cluster.cluster_network in [None, ""]:
        try:
            node_networks = NS.tendrl.objects.NodeNetwork().load_all()
            cluster.cluster_network = node_networks[0].subnet
            cluster.save()
        except etcd.EtcdKeyNotFound as ex:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": "Failed to sync cluster network details"}
            )
    _sleep = 0
    while not self._complete.is_set():
        # To detect out of band deletes
        # refresh gluster object inventory at config['sync_interval']
        SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100
        NS.node_context = NS.node_context.load()
        NS.tendrl_context = NS.tendrl_context.load()
        # Ramp up to the configured interval after a few fast iterations.
        if _sleep > 5:
            _sleep = int(NS.config.data.get("sync_interval", 10))
        else:
            _sleep += 1
        try:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id
            ).load()
            # Skip the sync while the cluster is in a transient state.
            if (_cluster.status == "importing" and (
                _cluster.current_job['status'] == 'failed')) or \
                _cluster.status == "unmanaging" or \
                _cluster.status == "set_volume_profiling":
                time.sleep(_sleep)
                continue
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=NS.node_context.node_id
            ).load()
            _cnc.is_managed = "yes"
            _cnc.save()
            # Dump glusterd state (details and volume options) to files,
            # parse them, and remove the temp files afterwards.
            subprocess.call(
                [
                    'gluster', 'get-state', 'glusterd', 'odir',
                    '/var/run', 'file', 'glusterd-state', 'detail'
                ]
            )
            raw_data = ini2json.ini_to_dict(
                '/var/run/glusterd-state'
            )
            subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
            subprocess.call(
                [
                    'gluster', 'get-state', 'glusterd', 'odir',
                    '/var/run', 'file', 'glusterd-state-vol-opts',
                    'volumeoptions'
                ]
            )
            raw_data_options = ini2json.ini_to_dict(
                '/var/run/glusterd-state-vol-opts'
            )
            subprocess.call(
                [
                    'rm', '-rf', '/var/run/glusterd-state-vol-opts'
                ]
            )
            sync_object = NS.gluster.objects.\
                SyncObject(data=json.dumps(raw_data))
            sync_object.save()
            # Peers are flat keys peer<N>.<field>; iterate until a
            # missing index raises KeyError.
            if "Peers" in raw_data:
                index = 1
                peers = raw_data["Peers"]
                disconnected_hosts = []
                while True:
                    try:
                        peer = NS.tendrl.\
                            objects.GlusterPeer(
                                peer_uuid=peers['peer%s.uuid' % index],
                                hostname=peers[
                                    'peer%s.primary_hostname' % index
                                ],
                                state=peers['peer%s.state' % index],
                                connected=peers['peer%s.connected' % index]
                            )
                        try:
                            stored_peer_status = None
                            # find peer detail using hostname
                            ip = socket.gethostbyname(
                                peers['peer%s.primary_hostname' % index]
                            )
                            node_id = etcd_utils.read(
                                "/indexes/ip/%s" % ip
                            ).value
                            stored_peer = NS.tendrl.objects.GlusterPeer(
                                peer_uuid=peers['peer%s.uuid' % index],
                                node_id=node_id
                            ).load()
                            stored_peer_status = stored_peer.connected
                            current_status = peers[
                                'peer%s.connected' % index
                            ]
                            # Alert only on a state transition.
                            if stored_peer_status and \
                                current_status != stored_peer_status:
                                msg = (
                                    "Peer %s in cluster %s "
                                    "is %s"
                                ) % (
                                    peers[
                                        'peer%s.primary_hostname' %
                                        index
                                    ],
                                    _cluster.short_name,
                                    current_status
                                )
                                instance = "peer_%s" % peers[
                                    'peer%s.primary_hostname' % index
                                ]
                                event_utils.emit_event(
                                    "peer_status",
                                    current_status,
                                    msg,
                                    instance,
                                    'WARNING'
                                    if current_status != 'Connected'
                                    else 'INFO'
                                )
                                # save current status in actual peer
                                # directory also
                                stored_peer.connected = current_status
                                stored_peer.save()
                                # Disconnected host name to
                                # raise brick alert
                                if current_status.lower() == \
                                    "disconnected":
                                    disconnected_hosts.append(
                                        peers[
                                            'peer%s.primary_hostname' %
                                            index
                                        ]
                                    )
                        except etcd.EtcdKeyNotFound:
                            pass
                        SYNC_TTL += 5
                        peer.save(ttl=SYNC_TTL)
                        index += 1
                    except KeyError:
                        break
                # Raise an alert for bricks when peer disconnected
                # or node goes down
                for disconnected_host in disconnected_hosts:
                    brick_status_alert(
                        disconnected_host
                    )
            if "Volumes" in raw_data:
                # create devicetree using lsblk
                devicetree = get_device_tree()
                # find lvs
                lvs = brick_utilization.get_lvs()
                index = 1
                volumes = raw_data['Volumes']
                total_brick_count = 0
                while True:
                    try:
                        b_count = sync_volumes(
                            volumes, index,
                            raw_data_options.get('Volume Options'),
                            SYNC_TTL + VOLUME_TTL,
                            _cluster.short_name,
                            devicetree,
                            lvs
                        )
                        index += 1
                        SYNC_TTL += 1
                        total_brick_count += b_count - 1
                    except KeyError:
                        global VOLUME_TTL
                        # from second sync volume ttl is
                        # SYNC_TTL + (no.volumes) * 20 +
                        # (no.of.bricks) * 10 + 160
                        if index > 1:
                            volume_count = index - 1
                            # When all nodes are down we are updating all
                            # volumes are down, node status TTL is 160,
                            # So make sure volumes are present in etcd
                            # while raising volume down alert
                            VOLUME_TTL = (volume_count * 20) + (
                                total_brick_count * 10) + 160
                        break
                # populate the volume specific options
                reg_ex = re.compile("^volume[0-9]+.options+")
                options = {}
                for key in volumes.keys():
                    if reg_ex.match(key):
                        options[key] = volumes[key]
                for key in options.keys():
                    volname = key.split('.')[0]
                    vol_id = volumes['%s.id' % volname]
                    dict1 = {}
                    for k, v in options.items():
                        if k.startswith('%s.options' % volname):
                            dict1['.'.join(k.split(".")[2:])] = v
                            options.pop(k, None)
                    volume = NS.tendrl.objects.GlusterVolume(
                        NS.tendrl_context.integration_id,
                        vol_id=vol_id
                    ).load()
                    if volume.options is not None:
                        dest = dict(volume.options)
                        dest.update(dict1)
                        volume.options = dest
                        volume.save()
            # Sync cluster global details -- only the provisioner node
            # aggregates cluster-wide state.
            if "provisioner/%s" % NS.tendrl_context.integration_id \
                in NS.node_context.tags:
                all_volumes = NS.tendrl.objects.GlusterVolume(
                    NS.tendrl_context.integration_id
                ).load_all() or []
                volumes = []
                for volume in all_volumes:
                    if not str(volume.deleted).lower() == "true" and \
                        volume.current_job.get('status', '') \
                        in ['', 'finished', 'failed'] and \
                        volume.vol_id not in [None, ''] and \
                        volume.name not in [None, '']:
                        # only for first sync refresh volume TTL
                        # It will increase TTL based on no.of volumes
                        if _cnc.first_sync_done in [None, "no", ""]:
                            etcd_utils.refresh(
                                volume.value,
                                SYNC_TTL + VOLUME_TTL
                            )
                        volumes.append(volume)
                cluster_status.sync_cluster_status(
                    volumes, SYNC_TTL + VOLUME_TTL
                )
                utilization.sync_utilization_details(volumes)
                client_connections.sync_volume_connections(volumes)
                georep_details.aggregate_session_status()
                try:
                    evt.process_events()
                except etcd.EtcdKeyNotFound:
                    pass
                rebalance_status.sync_volume_rebalance_status(volumes)
                rebalance_status.sync_volume_rebalance_estimated_time(
                    volumes
                )
                snapshots.sync_volume_snapshots(
                    raw_data['Volumes'],
                    int(NS.config.data.get(
                        "sync_interval", 10
                    )) + len(volumes) * 4
                )
                # update alert count
                update_cluster_alert_count()
            # check and enable volume profiling
            if "provisioner/%s" % NS.tendrl_context.integration_id in \
                NS.node_context.tags:
                self._update_volume_profiling()
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id
            ).load()
            if _cluster.exists():
                _cluster = _cluster.load()
                _cluster.last_sync = str(tendrl_now())
                # Mark the first sync done flag
                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=NS.node_context.node_id
                ).load()
                if _cnc.first_sync_done in [None, "no"]:
                    _cnc.first_sync_done = "yes"
                    _cnc.save()
                if _cluster.current_job.get(
                    'status', ''
                ) in ['', 'finished', 'failed'] and \
                    _cluster.status in [None, ""]:
                    _cluster.save()
        except Exception as ex:
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": "gluster sds state sync error",
                             "exception": ex
                             }
                )
            )
        # A _sync_now marker requests an immediate re-sync, so skip the
        # sleep for this iteration.
        try:
            etcd_utils.read(
                '/clusters/%s/_sync_now' %
                NS.tendrl_context.integration_id
            )
            continue
        except etcd.EtcdKeyNotFound:
            pass
        time.sleep(_sleep)
    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "%s complete" % self.__class__.__name__}
    )
def update_last_seen_at():
    """Record this node's heartbeat timestamp in etcd."""
    heartbeat_key = '/monitoring/nodes/%s/last_seen_at' % \
        NS.node_context.node_id
    NS._int.wclient.write(heartbeat_key, tendrl_now().isoformat())