def get_net_storage_utilization(self, node):
    """Compute aggregate disk utilization for *node*, push the percent
    used to the time-series db, and return a summary dict.

    Returns a dict with stringified 'used', 'total', 'percent_used'
    and an ISO 'updated_at' timestamp, or None when stats are not yet
    available or total capacity is zero.
    """
    def _sum_valid(stats):
        # Sum only non-NaN samples; the df-* wildcard can include
        # mounts whose latest data point is missing (NaN).
        total = 0.0
        for stat in stats:
            value = float(stat)
            if not math.isnan(value):
                total += value
        return total

    try:
        used = _sum_valid(get_latest_stats(node, 'df-*.df_complex-used'))
        free = _sum_valid(get_latest_stats(node, 'df-*.df_complex-free'))
        if free + used == 0:
            # No capacity info yet; avoid ZeroDivisionError below.
            return None
        percent_used = float(used * 100) / float(free + used)
        node_name = central_store_util.get_node_name_from_id(node)
        NS.time_series_db_manager.get_plugin().push_metrics(
            NS.time_series_db_manager.get_timeseriesnamefromresource(
                underscored_node_name=node_name.replace('.', '_'),
                resource_name=pm_consts.STORAGE,
                utilization_type=pm_consts.PERCENT_USED
            ),
            percent_used
        )
        return {
            'used': str(used),
            'total': str(used + free),
            'percent_used': str(percent_used),
            'updated_at': datetime.datetime.now().isoformat()
        }
    except TendrlPerformanceMonitoringException:
        # Exception already handled (logged by get_latest_stats)
        return None
def get_latest_node_stat(node, resource):
    """Return the latest stat for *resource* on the node with id *node*.

    May raise ValueError, urllib3.exceptions.HTTPError or
    TendrlPerformanceMonitoringException from the underlying calls.
    """
    # The original wrapped these calls in try/except only to do
    # 'raise ex' with the same exceptions — a no-op that additionally
    # resets the traceback in Python 2. Letting the exceptions
    # propagate naturally is behaviorally identical and keeps the
    # original traceback.
    node_name = central_store_util.get_node_name_from_id(node)
    return get_latest_stat(node_name, resource)
def get_nodestats(node_id, resource_name):
    """Flask endpoint: return time-series stats for one resource of a
    node as a JSON Response.

    Optional query params: start_time, end_time, interval. Any known
    failure is reported as a 500 response carrying the error text.
    """
    try:
        node_name = central_store_util.get_node_name_from_id(node_id)
        start_time = None
        end_time = None
        time_interval = None
        # Iterate the query params directly — the original
        # 'len(...) > 0' guard was redundant (an empty loop is a no-op).
        # As before, the last occurrence of a repeated param wins.
        for param_name, param_value in request.args.items():
            if param_name == "start_time":
                start_time = param_value
            elif param_name == "end_time":
                end_time = param_value
            elif param_name == "interval":
                time_interval = param_value
        return Response(
            NS.time_series_db_manager.get_plugin().get_metric_stats(
                node_name,
                resource_name,
                time_interval=time_interval,
                start_time=start_time,
                end_time=end_time
            ),
            status=200,
            mimetype='application/json'
        )
    except (ValueError, etcd.EtcdKeyNotFound, etcd.EtcdConnectionFailed,
            SyntaxError, etcd.EtcdException, TypeError,
            TendrlPerformanceMonitoringException) as ex:
        return Response(str(ex), status=500, mimetype='application/json')
def get_latest_stats(node, resource):
    """Return the latest sample values for *resource* on *node*.

    Raises TendrlPerformanceMonitoringException when stats are not yet
    present in the time-series db, after logging a debug event.
    """
    try:
        node_name = central_store_util.get_node_name_from_id(node)
        stats = NS.time_series_db_manager.get_plugin().get_metric_stats(
            node_name,
            resource,
            'latest'
        )
        if stats == "[]" or not stats:
            raise TendrlPerformanceMonitoringException(
                'Stats not yet available in time series db'
            )
        # The 'latest' (cactiStyle) payload embeds values between
        # 'Current:' and 'Max'; extract every such value.
        return re.findall('Current:(.+?)Max', stats)
    except TendrlPerformanceMonitoringException as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={"message": 'Failed to get latest stats of %s of '
                         'node %s for node summary.' % (resource, node),
                         "exception": ex
                         }
            )
        )
        # Bare raise preserves the original traceback; the original
        # 'raise ex' resets it in Python 2.
        raise
def get_latest_node_stat(node, resource):
    """Return the latest stat for *resource* on the node with id *node*.

    TendrlPerformanceMonitoringException from the underlying calls
    propagates to the caller.
    """
    # The original caught TendrlPerformanceMonitoringException only to
    # 'raise ex' again — a no-op that resets the traceback in
    # Python 2. Direct calls are behaviorally identical and preserve
    # the original traceback.
    node_name = central_store_util.get_node_name_from_id(node)
    return get_latest_stat(node_name, resource)
def get_stat_types(node_id):
    """Flask endpoint: list the metric names available for a node.

    Returns a 200 JSON Response with the metrics on success, or a 500
    Response carrying the error text on any known failure.
    """
    known_errors = (
        ValueError,
        etcd.EtcdKeyNotFound,
        etcd.EtcdConnectionFailed,
        SyntaxError,
        etcd.EtcdException,
        TypeError,
        TendrlPerformanceMonitoringException,
    )
    try:
        node_name = central_store_util.get_node_name_from_id(node_id)
        metrics = NS.time_series_db_manager.get_plugin().get_metrics(
            node_name)
    except known_errors as ex:
        return Response(str(ex), status=500, mimetype='application/json')
    return Response(metrics, status=200, mimetype='application/json')
def get_node_disk_iops_stats(self, node_id, time_interval=None,
                             start_time=None, end_time=None):
    """Fetch combined read+write disk IOPS for a node from graphite.

    time_interval may be 'latest' (cactiStyle single sample) or a
    relative interval used as the start time. Returns the JSON body
    with [null, ts] points stripped; raises
    TendrlPerformanceMonitoringException on any failure.
    """
    node_name = central_store_util.get_node_name_from_id(node_id)
    node_name = node_name.replace('.', '_')
    target = Template(
        'sumSeries(averageSeries($prefix.$node_name.disk-*.disk_ops.write'
        '), averageSeries($prefix.$node_name.disk-*.disk_ops.read))'
    ).substitute(
        prefix=self.prefix,
        node_name=node_name,
    )
    target = urllib.quote(target)
    if time_interval:
        if time_interval == 'latest':
            target = "cactiStyle(%s)" % target
        else:
            start_time = self.parse_time(time_interval)
    # NOTE(review): when time_interval set start_time just above, it is
    # passed through parse_time a second time here — presumably
    # parse_time is idempotent; confirm.
    if start_time:
        start_time = self.parse_time(start_time)
    if end_time:
        end_time = self.parse_time(end_time)
    url = 'http://%s:%s/render?target=%s&format=json' % (
        self.host, str(self.port), target)
    if start_time:
        url = "%s&from=%s" % (url, start_time)
    if end_time:
        url = "%s&until=%s" % (url, end_time)
    try:
        stats = self.http.request('GET', url, timeout=5)
        if stats.status == 200:
            # TODO(Anmol): remove nulls from graphite data before returning
            # data. Explore the possibility of achieving this using some
            # tuning factor in graphite.
            data = re.sub(r'\[null, [0-9]+\], ', '', stats.data)
            data = re.sub(r', \[null, [0-9]+\]', '', data)
            return data
        else:
            # Bug fix: the exception was constructed but never raised,
            # so a non-200 response silently returned None.
            raise TendrlPerformanceMonitoringException(
                'Request status code: %s' % str(stats.status))
    except Exception as ex:
        # Bug fix: the log format had three %s but only two arguments,
        # which raised TypeError here and masked the real error.
        Event(
            ExceptionMessage(priority="debug",
                             publisher=NS.publisher_id,
                             payload={
                                 "message": 'Failed to fetch %s stats '
                                 'using url %s. Error %s' % (
                                     target, url, str(ex)),
                                 "exception": ex
                             }))
        raise TendrlPerformanceMonitoringException(str(ex))
def get_node_brick_status_counts(self, node_id):
    """Return brick counts for a node: 'stopped', 'total', and the
    number of critical/warning brick alerts attributed to the node.

    Best-effort: any failure while counting is logged and the counts
    gathered so far (possibly all zeros) are returned.
    """
    node_name = central_store_util.get_node_name_from_id(node_id)
    ip_indexes = etcd_read_key('/indexes/ip')
    # Bricks may be registered under the node's ip instead of its
    # name; resolve the first ip indexed to this node id (break added
    # for consistency with the other variant of this method).
    node_ip = ''
    for ip, indexed_node_id in ip_indexes.iteritems():
        if node_id == indexed_node_id:
            node_ip = ip
            break
    brick_status_wise_counts = {
        'stopped': 0,
        'total': 0,
        pm_consts.WARNING_ALERTS: 0,
        pm_consts.CRITICAL_ALERTS: 0
    }
    try:
        cluster_id = central_store_util.get_node_cluster_id(node_id)
        if cluster_id:
            volumes_det = self.get_cluster_volumes(cluster_id)
            for volume_id, volume_det in volumes_det.iteritems():
                for brick_path, brick_det in volume_det.get(
                        'Bricks', {}).iteritems():
                    if (brick_det['hostname'] == node_name or
                            brick_det['hostname'] == node_ip):
                        # .get guards bricks lacking a 'status' key,
                        # matching the hardened variant of this method;
                        # previously a missing key aborted all counting
                        # via the broad except below.
                        if brick_det.get('status') == 'Stopped':
                            brick_status_wise_counts['stopped'] += 1
                        brick_status_wise_counts['total'] += 1
            # Alerts are fetched cluster-wide, then filtered to this node.
            crit_alerts, warn_alerts = parse_resource_alerts(
                'brick',
                pm_consts.CLUSTER,
                cluster_id=cluster_id
            )
            brick_status_wise_counts[pm_consts.CRITICAL_ALERTS] = len(
                [a for a in crit_alerts if a['node_id'] == node_id])
            brick_status_wise_counts[pm_consts.WARNING_ALERTS] = len(
                [a for a in warn_alerts if a['node_id'] == node_id])
    except Exception as ex:
        Event(
            Message(priority="info",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Exception caught fetching node brick"
                        " status wise counts",
                        "exception": ex
                    }))
    return brick_status_wise_counts
def get_net_host_cpu_utilization(self, node):
    """Push and return the node's net CPU utilization.

    Returns a dict with stringified 'percent_used' and an ISO
    'updated_at' timestamp, or None when stats are not yet available.
    """
    try:
        user = get_latest_node_stat(node, 'cpu.percent-user')
        system = get_latest_node_stat(node, 'cpu.percent-system')
        total_used = user + system
        node_name = central_store_util.get_node_name_from_id(node)
        metric_name = NS.time_series_db_manager.\
            get_timeseriesnamefromresource(
                underscored_node_name=node_name.replace('.', '_'),
                resource_name=pm_consts.CPU,
                utilization_type=pm_consts.PERCENT_USED
            )
        NS.time_series_db_manager.get_plugin().push_metrics(
            metric_name,
            total_used
        )
        return {
            'percent_used': str(total_used),
            'updated_at': datetime.datetime.now().isoformat()
        }
    except TendrlPerformanceMonitoringException:
        # Exception already handled
        return None
def calculate_host_summary(self, node):
    """Compute and persist a NodeSummary for one node.

    Gathers cpu/memory/storage/swap utilization, alert counts and sds
    details; any utilization probe that returned None falls back to
    the previously persisted value from etcd. Save failures are logged
    as debug events, never raised.
    """
    gevent.sleep(0.1)  # yield so long summary sweeps don't starve peers
    cpu_usage = self.get_net_host_cpu_utilization(node)
    memory_usage = self.get_net_host_memory_utilization(node)
    storage_usage = self.get_net_storage_utilization(node)
    swap_usage = self.get_net_host_swap_utilization(node)
    alert_count = self.get_alert_count(node)
    sds_det = NS.sds_monitoring_manager.get_node_summary(node)
    # Blank placeholder summary; replaced by the persisted one below
    # when it exists, and used as the fallback source for any probe
    # above that returned None.
    old_summary = NodeSummary(
        node_id=node,
        name='',
        status='',
        role='',
        cluster_name='',
        cpu_usage={
            'percent_used': '',
            'updated_at': ''
        },
        memory_usage={
            'percent_used': '',
            'updated_at': '',
            'used': '',
            'total': ''
        },
        storage_usage={
            'percent_used': '',
            'total': '',
            'used': '',
            'updated_at': ''
        },
        swap_usage={
            'percent_used': '',
            'updated_at': '',
            'used': '',
            'total': ''
        },
        sds_det={},
        alert_count=alert_count
    )
    try:
        old_summary = old_summary.load()
    except EtcdKeyNotFound:
        # First run for this node: nothing persisted yet, keep blanks.
        pass
    except Exception as ex:
        # Originally 'except (EtcdConnectionFailed, Exception)' —
        # Exception already subsumes EtcdConnectionFailed, so the
        # tuple was redundant; behavior is unchanged.
        Event(
            ExceptionMessage(priority="debug",
                             publisher=NS.publisher_id,
                             payload={
                                 "message": 'Failed to fetch previously computed '
                                 'summary from etcd.',
                                 "exception": ex
                             }))
        return
    if cpu_usage is None:
        cpu_usage = old_summary.cpu_usage
    if memory_usage is None:
        memory_usage = old_summary.memory_usage
    if storage_usage is None:
        storage_usage = old_summary.storage_usage
    if swap_usage is None:
        swap_usage = old_summary.swap_usage
    try:
        summary = NodeSummary(
            name=central_store_util.get_node_name_from_id(node),
            node_id=node,
            status=self.get_node_status(node),
            role=central_store_util.get_node_role(node),
            cluster_name=central_store_util.get_node_cluster_name(node),
            cpu_usage=cpu_usage,
            memory_usage=memory_usage,
            storage_usage=storage_usage,
            swap_usage=swap_usage,
            selinux_mode=central_store_util.get_node_selinux_mode(node),
            sds_det=sds_det,
            alert_count=alert_count)
        summary.save(update=False)
    except Exception as ex:
        Event(
            ExceptionMessage(priority="debug",
                             publisher=NS.publisher_id,
                             payload={
                                 "message": 'Exception caught while trying to '
                                 'save summary for node %s' % str(node),
                                 "exception": ex
                             }))
def get_node_brick_status_counts(self, node_id):
    """Return brick counts for a node: 'stopped', 'total', plus the
    number of critical and warning brick alerts attributed to it.

    Best-effort: each lookup failure is logged as an event and the
    counts gathered so far (possibly all zeros) are returned.
    """
    brick_status_wise_counts = {
        'stopped': 0,
        'total': 0,
        pm_consts.WARNING_ALERTS: 0,
        pm_consts.CRITICAL_ALERTS: 0
    }
    try:
        node_name = central_store_util.get_node_name_from_id(node_id)
    except EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Error fetching node name for node "
                    "%s" % node_id,
                    "exception": ex
                }
            )
        )
        # Without a node name we cannot match bricks; return zeros.
        return brick_status_wise_counts
    try:
        ip_indexes = etcd_read_key('/indexes/ip')
    except EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Error fetching ip indexes",
                    "exception": ex
                }
            )
        )
        return brick_status_wise_counts
    # Bricks may be registered under the node's ip rather than its
    # name; resolve the first ip indexed to this node id.
    node_ip = ''
    for ip, indexed_node_id in ip_indexes.iteritems():
        if node_id == indexed_node_id:
            node_ip = ip
            break
    try:
        cluster_id = central_store_util.get_node_cluster_id(
            node_id
        )
        if cluster_id:
            bricks = self.get_cluster_bricks(cluster_id)
            for brick_path, brick_det in bricks.iteritems():
                # A brick belongs to this node when its hostname
                # matches either the node name or the resolved ip.
                if (
                    brick_det['hostname'] == node_name or
                    brick_det['hostname'] == node_ip
                ):
                    # Membership check guards bricks with no 'status'.
                    if (
                        'status' in brick_det and
                        brick_det['status'] == 'Stopped'
                    ):
                        brick_status_wise_counts['stopped'] = \
                            brick_status_wise_counts['stopped'] + 1
                    brick_status_wise_counts['total'] = \
                        brick_status_wise_counts['total'] + 1
            # Alerts are fetched cluster-wide, then filtered down to
            # this node below.
            crit_alerts, warn_alerts = parse_resource_alerts(
                'brick',
                pm_consts.CLUSTER,
                cluster_id=cluster_id
            )
            count = 0
            for alert in crit_alerts:
                if alert['node_id'] == node_id:
                    count = count + 1
            brick_status_wise_counts[
                pm_consts.CRITICAL_ALERTS
            ] = count
            count = 0
            for alert in warn_alerts:
                if alert['node_id'] == node_id:
                    count = count + 1
            brick_status_wise_counts[
                pm_consts.WARNING_ALERTS
            ] = count
    except (
        TendrlPerformanceMonitoringException,
        AttributeError,
        ValueError,
        KeyError
    ) as ex:
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Exception caught fetching node brick"
                    " status wise counts",
                    "exception": ex
                }
            )
        )
    return brick_status_wise_counts