def get_latest_stat(node, resource):
    try:
        stats = NS.time_series_db_manager.get_plugin().get_metric_stats(
            node, resource, 'latest')
        if stats == "[]" or not stats:
            raise TendrlPerformanceMonitoringException(
                'Stats not yet available in time series db')
        stat = re.search('Current:(.+?)Max', stats)
        if not stat:
            raise TendrlPerformanceMonitoringException(
                'Failed to get latest stat of %s of node %s for summary. '
                'Error: Current utilization not found' % (resource, node))
        # Reuse the match instead of running the same regex a second time.
        stat = stat.group(1)
        if math.isnan(float(stat)):
            raise TendrlPerformanceMonitoringException(
                'Received nan for utilization %s of %s' % (resource, node))
        return float(stat)
    except (ValueError, urllib3.exceptions.HTTPError,
            TendrlPerformanceMonitoringException) as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": 'Failed to get latest stat of %s of '
                               'node %s for node summary.' % (resource, node),
                    "exception": ex
                }
            )
        )
        raise ex
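# Illustrative sketch (assumption, not original code): with the 'latest'
# interval the time-series plugin returns Graphite cactiStyle output, whose
# target names embed 'Current:<value> Max:<value>' -- that is the text the
# regex in get_latest_stat extracts. The payload below is a hypothetical
# example of that shape.
_example_cactistyle_stats = (
    '[{"target": "node1.cpu.percent-user Current:12.30 Max:80.10 Min:0.40",'
    ' "datapoints": [[12.3, 1490000000]]}]'
)
# re.search('Current:(.+?)Max', _example_cactistyle_stats).group(1) -> '12.30 '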
def get_aggregated_stats(self, aggregation_type, entity_names, metric_name,
                         time_interval=None, start_time=None, end_time=None):
    target = ''
    for entity_name in entity_names:
        target = '%s%s.%s.%s,' % (target, self.prefix,
                                  entity_name.replace('.', '_'), metric_name)
    # Drop the trailing comma left by the loop above.
    target = target[:-1]
    if aggregation_type == pm_consts.AVERAGE:
        target = 'averageSeries(%s)' % target
    if time_interval:
        if time_interval == 'latest':
            target = "cactiStyle(%s)" % target
        else:
            start_time = self.parse_time(time_interval)
    if start_time:
        start_time = self.parse_time(start_time)
    if end_time:
        end_time = self.parse_time(end_time)
    url = 'http://%s:%s/render?target=%s&format=json' % (
        self.host, str(self.port), target)
    if start_time:
        url = "%s&from=%s" % (url, start_time)
    if end_time:
        url = "%s&until=%s" % (url, end_time)
    try:
        stats = self.http.request('GET', url, timeout=5)
        if stats.status == 200:
            # TODO(Anmol): remove nulls from graphite data before returning
            # data. Explore the possibility of achieving this using some
            # tuning factor in graphite.
            data = re.sub(r'\[null, [0-9]+\], ', '', stats.data)
            data = re.sub(r', \[null, [0-9]+\]', '', data)
            return data
        else:
            raise TendrlPerformanceMonitoringException(
                'Request status code: %s' % str(stats.status))
    except (ValueError, Exception) as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": 'Failed to fetch stats for metric %s of %s '
                               'using url %s' % (metric_name, entity_name,
                                                 url),
                    "exception": ex
                }
            )
        )
        raise TendrlPerformanceMonitoringException(str(ex))
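# Illustrative sketch (assumed host/port/prefix values, not original code):
# the kind of Graphite render URL get_aggregated_stats builds for an AVERAGE
# query across two nodes, before any from/until parameters are appended.
def _example_aggregated_target():
    prefix = 'tendrl'                      # assumed series prefix
    entities = ['node1.example.com', 'node2.example.com']
    metric = 'cpu.percent-user'
    target = ','.join('%s.%s.%s' % (prefix, e.replace('.', '_'), metric)
                      for e in entities)
    target = 'averageSeries(%s)' % target  # the pm_consts.AVERAGE path
    return 'http://localhost:10080/render?target=%s&format=json' % target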
def get_node_disk_iops_stats(self, node_id, time_interval=None,
                             start_time=None, end_time=None):
    node_name = central_store_util.get_node_name_from_id(node_id)
    node_name = node_name.replace('.', '_')
    target = Template(
        'sumSeries(averageSeries($prefix.$node_name.disk-*.disk_ops.write'
        '), averageSeries($prefix.$node_name.disk-*.disk_ops.read))'
    ).substitute(
        prefix=self.prefix,
        node_name=node_name,
    )
    target = urllib.quote(target)
    if time_interval:
        if time_interval == 'latest':
            target = "cactiStyle(%s)" % target
        else:
            start_time = self.parse_time(time_interval)
    if start_time:
        start_time = self.parse_time(start_time)
    if end_time:
        end_time = self.parse_time(end_time)
    url = 'http://%s:%s/render?target=%s&format=json' % (
        self.host, str(self.port), target)
    if start_time:
        url = "%s&from=%s" % (url, start_time)
    if end_time:
        url = "%s&until=%s" % (url, end_time)
    try:
        stats = self.http.request('GET', url, timeout=5)
        if stats.status == 200:
            # TODO(Anmol): remove nulls from graphite data before returning
            # data. Explore the possibility of achieving this using some
            # tuning factor in graphite.
            data = re.sub(r'\[null, [0-9]+\], ', '', stats.data)
            data = re.sub(r', \[null, [0-9]+\]', '', data)
            return data
        else:
            raise TendrlPerformanceMonitoringException(
                'Request status code: %s' % str(stats.status))
    except (ValueError, Exception) as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": 'Failed to fetch %s stats using url %s'
                               % (target, url),
                    "exception": ex
                }
            )
        )
        raise TendrlPerformanceMonitoringException(str(ex))
def get_cluster_summary(cluster_id):
    try:
        summary = ClusterSummary(cluster_id=cluster_id)
        if not summary.exists():
            raise TendrlPerformanceMonitoringException(
                "No summary found for cluster %s" % cluster_id)
        summary = summary.load().to_json()
        for key, value in summary.items():
            if (key.startswith("_") or
                    key in ['hash', 'updated_at', 'value', 'list']):
                del summary[key]
        return summary
    except Exception as ex:
        raise TendrlPerformanceMonitoringException(str(ex))
def get_system_summary(cluster_type):
    try:
        summary = SystemSummary(sds_type=cluster_type)
        if not summary.exists():
            raise TendrlPerformanceMonitoringException(
                "No clusters of type %s found" % cluster_type)
        summary = summary.load().to_json()
        for key, value in summary.items():
            if (key.startswith("_") or
                    key in ['hash', 'updated_at', 'value', 'list']):
                del summary[key]
        return summary
    except Exception as ex:
        raise TendrlPerformanceMonitoringException(str(ex))
def get_most_used_bricks(self, bricks):
    brick_utilizations = []
    if not bricks:
        return brick_utilizations
    for brick_path, brick_det in bricks.iteritems():
        if (
            'utilization' in brick_det and
            'used_percent' in brick_det['utilization']
        ):
            brick_utilizations.append(brick_det['utilization'])
        else:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "No utilization info for brick "
                                   "%s" % brick_path,
                        "exception": TendrlPerformanceMonitoringException(
                            'No utilization info for brick %s' % brick_path
                        )
                    }
                )
            )
    brick_utilizations = sorted(
        brick_utilizations,
        key=lambda k: k['used_percent']
    )
    brick_utilizations.reverse()
    return brick_utilizations[:5]
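# Illustrative sketch (assumed data shape, not original code): the brick map
# that get_most_used_bricks expects and the result it produces -- up to five
# utilization dicts, sorted most-used first.
_example_bricks = {
    '/bricks/brick1': {'utilization': {'used_percent': 42.0}},
    '/bricks/brick2': {'utilization': {'used_percent': 87.5}},
    '/bricks/brick3': {},  # missing utilization info -> logged and skipped
}
# get_most_used_bricks(_example_bricks) would return
# [{'used_percent': 87.5}, {'used_percent': 42.0}]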
def get_metrics(self, entity_name):
    url = 'http://%s:%s/metrics/index.json' % (self.host, str(self.port))
    try:
        gevent.sleep(5)
        resp = self.http.request('GET', url, timeout=5)
        if resp.status != 200:
            raise TendrlPerformanceMonitoringException(
                'Request status code: %s' % str(resp.status))
        data = resp.data
        metrics = ast.literal_eval(data)
        result = []
        prefix = "%s.%s." % (self.prefix, entity_name.replace('.', '_'))
        split_metrics = []
        for metric in metrics:
            if metric.startswith(prefix):
                split_metrics = metric.split(prefix)
                result.append(split_metrics[1])
        return str(result)
    except (ValueError, Exception) as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": 'Failed to get metrics for %s.' % entity_name,
                    "exception": ex
                }
            )
        )
def get_node_name_from_id(node_id):
    try:
        node_name_path = '/nodes/%s/NodeContext/fqdn' % node_id
        return NS._int.client.read(node_name_path).value
    except (EtcdKeyNotFound, EtcdConnectionFailed, ValueError,
            SyntaxError, EtcdException, TypeError) as ex:
        raise TendrlPerformanceMonitoringException(str(ex))
def initiate_config_generation(node_det):
    try:
        job_params = {
            'node_ids': [node_det.get('node_id')],
            "run": 'node_monitoring.flows.ConfigureCollectd',
            'type': 'monitoring',
            "parameters": {
                'plugin_name': node_det['plugin'],
                'plugin_conf_params': json.dumps(
                    node_det['plugin_conf']).encode('utf-8'),
                'Node.fqdn': node_det['fqdn'],
                'Service.name': 'collectd',
            },
        }
        Job(
            job_id=str(uuid.uuid4()),
            status='new',
            payload=job_params,
        ).save()
    except (EtcdException, EtcdConnectionFailed, Exception) as ex:
        raise TendrlPerformanceMonitoringException(
            'Failed to initiate monitoring configuration for plugin '
            '%s on %s with parameters %s. Error: %s' % (
                node_det['plugin'],
                node_det['fqdn'],
                json.dumps(node_det['plugin_conf']),
                str(ex)
            )
        )
def get_latest_stats(node, resource):
    try:
        node_name = central_store_util.get_node_name_from_id(node)
        stats = NS.time_series_db_manager.get_plugin().get_metric_stats(
            node_name,
            resource,
            'latest'
        )
        if stats == "[]" or not stats:
            raise TendrlPerformanceMonitoringException(
                'Stats not yet available in time series db'
            )
        return re.findall('Current:(.+?)Max', stats)
    except TendrlPerformanceMonitoringException as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": 'Failed to get latest stats of %s of '
                               'node %s for node summary.' % (resource, node),
                    "exception": ex
                }
            )
        )
        raise ex
def get_node_summary():
    try:
        # The node list is the only supported filter; anything else is
        # simply ignored.
        summary = []
        ret_code = 200
        exs = ''
        is_filter = (len(request.args) == 1 and
                     request.args.items()[0][0] == 'node_ids')
        if is_filter:
            node_list = (request.args.items()[0][1]).split(",")
            for index, node in enumerate(node_list):
                uuid_string = node_list[index].strip()
                if UUID(uuid_string, version=4).hex == \
                        uuid_string.replace('-', ''):
                    node_list[index] = node_list[index].strip()
                else:
                    raise TendrlPerformanceMonitoringException(
                        'Node id %s in the parameter is not a valid uuid'
                        % uuid_string)
            summary, ret_code, exs = \
                central_store_util.get_node_summary(node_list)
        else:
            summary, ret_code, exs = \
                central_store_util.get_node_summary()
        return Response(json.dumps(summary), status=ret_code,
                        mimetype='application/json')
    except (etcd.EtcdKeyNotFound, etcd.EtcdConnectionFailed, ValueError,
            SyntaxError, etcd.EtcdException,
            TendrlPerformanceMonitoringException, TypeError) as ex:
        return Response(str(ex), status=500, mimetype='application/json')
def get_node_role(node_id):
    try:
        return NS._int.client.read(
            '/nodes/%s/NodeContext/tags' % node_id).value
    except Exception as ex:
        raise TendrlPerformanceMonitoringException(
            "Failed to fetch the role of node %s. Error %s" % (node_id,
                                                               str(ex)))
def get_node_cluster_name(node_id):
    try:
        return NS._int.client.read(
            '/nodes/%s/TendrlContext/cluster_name' % node_id).value
    except Exception as ex:
        raise TendrlPerformanceMonitoringException(
            "Failed to fetch cluster name for node %s. Error: %s" % (
                node_id, str(ex)))
def get_node_ids(self):
    try:
        node_ids = []
        nodes_etcd = tendrl_ns.etcd_orm.client.read('/nodes')
        for node in nodes_etcd._children:
            node_ids.append(node['key'][len('/nodes/'):])
        return node_ids
    except EtcdKeyNotFound:
        return []
    except (EtcdConnectionFailed, ValueError, SyntaxError, TypeError) as ex:
        raise TendrlPerformanceMonitoringException(str(ex))
def get_metric_stats(self, entity_name, metric_name, time_interval=None):
    metric_name = '%s.%s' % (entity_name.replace('.', '_'), metric_name)
    target = '%s.%s' % (self.prefix, metric_name)
    if time_interval == 'latest':
        target = "cactiStyle(%s)" % target
    url = 'http://%s:%s/render?target=%s&format=json' % (
        self.host, str(self.port), target)
    try:
        stats = self.http.request('GET', url, timeout=5)
        if stats.status == 200:
            return stats.data
        else:
            raise TendrlPerformanceMonitoringException(
                'Request status code: %s' % str(stats.status)
            )
    except (ValueError, Exception) as ex:
        LOG.error(
            'Failed to fetch stats for metric %s of %s using url %s. '
            'Error %s' % (metric_name, entity_name, url, str(ex)),
            exc_info=True
        )
        raise TendrlPerformanceMonitoringException(str(ex))
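# Illustrative sketch (assumed host/port/prefix values, not original code):
# the render URL get_metric_stats builds for a 'latest' query, where the
# target is wrapped in cactiStyle so Graphite annotates Current/Max/Min.
def _example_metric_stats_url():
    host, port, prefix = 'localhost', 10080, 'tendrl'  # assumed values
    entity_name, metric_name = 'node1.example.com', 'memory.percent-used'
    target = '%s.%s.%s' % (prefix, entity_name.replace('.', '_'), metric_name)
    target = 'cactiStyle(%s)' % target
    return 'http://%s:%s/render?target=%s&format=json' % (host, port, target)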
def get_cluster_ids():
    try:
        cluster_ids = []
        clusters_etcd = NS._int.client.read('/clusters')
        for cluster in clusters_etcd.leaves:
            cluster_key_contents = cluster.key.split('/')
            if len(cluster_key_contents) == 3:
                cluster_ids.append(cluster_key_contents[2])
        return cluster_ids
    except EtcdKeyNotFound:
        return []
    except (EtcdConnectionFailed, ValueError, SyntaxError, TypeError) as ex:
        raise TendrlPerformanceMonitoringException(str(ex))
def get_node_alert_ids(node_id=None):
    alert_ids = []
    try:
        alerts = NS._int.client.read('/alerting/nodes/%s' % node_id)
        for alert in alerts.leaves:
            key_contents = alert.key.split('/')
            if len(key_contents) == 5:
                alert_ids.append(key_contents[4])
    except EtcdKeyNotFound:
        return alert_ids
    except (EtcdConnectionFailed, EtcdException) as ex:
        raise TendrlPerformanceMonitoringException(str(ex))
    return alert_ids
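# Illustrative note (assumed key layout, not taken from the source): alert
# keys are expected to look like '/alerting/nodes/<node_id>/<alert_id>'.
# Splitting on '/' yields ['', 'alerting', 'nodes', '<node_id>', '<alert_id>'],
# i.e. five parts with the alert id at index 4, which is what the length
# check in get_node_alert_ids relies on.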
def get_node_ids():
    try:
        node_ids = []
        nodes_etcd = NS._int.client.read('/nodes')
        for node in nodes_etcd.leaves:
            node_key_contents = node.key.split('/')
            if len(node_key_contents) == 3:
                node_ids.append(node_key_contents[2])
        return node_ids
    except EtcdKeyNotFound:
        return []
    except (EtcdConnectionFailed, ValueError, SyntaxError, TypeError) as ex:
        raise TendrlPerformanceMonitoringException(str(ex))
def get_sdsthroughput(sds_type, network_type):
    try:
        start_time = None
        end_time = None
        time_interval = None
        if len(request.args.items()) > 0:
            for request_param in request.args.items():
                if request_param[0] == "start_time":
                    start_time = request_param[1]
                elif request_param[0] == "end_time":
                    end_time = request_param[1]
                elif request_param[0] == "interval":
                    time_interval = request_param[1]
        # validate sds-type
        if sds_type not in NS.sds_monitoring_manager.supported_sds:
            raise TendrlPerformanceMonitoringException(
                'Unsupported sds %s' % sds_type
            )
        entity_name, metric_name = NS.time_series_db_manager.\
            get_timeseriesnamefromresource(
                sds_type=sds_type,
                network_type=network_type,
                resource_name=pm_consts.SYSTEM_THROUGHPUT,
                utilization_type=pm_consts.USED
            ).split(
                NS.time_series_db_manager.get_plugin().get_delimeter(),
                1
            )
        return Response(
            NS.time_series_db_manager.get_plugin().get_metric_stats(
                entity_name,
                metric_name,
                time_interval=time_interval,
                start_time=start_time,
                end_time=end_time
            ),
            status=200,
            mimetype='application/json'
        )
    except (
        ValueError,
        etcd.EtcdKeyNotFound,
        etcd.EtcdConnectionFailed,
        SyntaxError,
        etcd.EtcdException,
        TypeError,
        TendrlPerformanceMonitoringException
    ) as ex:
        return Response(str(ex), status=500, mimetype='application/json')
def get_sdsutilization(sds_type, utiliation_type):
    try:
        start_time = None
        end_time = None
        time_interval = None
        if len(request.args.items()) > 0:
            for request_param in request.args.items():
                if request_param[0] == "start_time":
                    start_time = request_param[1]
                elif request_param[0] == "end_time":
                    end_time = request_param[1]
                elif request_param[0] == "interval":
                    time_interval = request_param[1]
        # validate sds-type
        if sds_type not in NS.sds_monitoring_manager.supported_sds:
            raise TendrlPerformanceMonitoringException(
                'Unsupported sds %s' % sds_type
            )
        entity_name, metric_name = NS.time_series_db_manager.\
            get_timeseriesnamefromresource(
                resource_name=pm_consts.SYSTEM_UTILIZATION,
                utilization_type=utiliation_type,
                sds_type=sds_type
            ).split(
                NS.time_series_db_manager.get_plugin().get_delimeter(),
                1
            )
        return Response(
            NS.time_series_db_manager.get_plugin().get_metric_stats(
                entity_name,
                metric_name,
                time_interval=time_interval,
                start_time=start_time,
                end_time=end_time
            ),
            status=200,
            mimetype='application/json'
        )
    except (
        AttributeError,
        ValueError,
        etcd.EtcdException,
        SyntaxError,
        urllib3.exceptions.HTTPError,
        TypeError,
        TendrlPerformanceMonitoringException
    ) as ex:
        return Response(str(ex), status=500, mimetype='application/json')
def get_system_summary(cluster_type):
    try:
        if cluster_type not in NS.sds_monitoring_manager.supported_sds:
            raise TendrlPerformanceMonitoringException(
                'Unsupported sds %s' % cluster_type)
        summary = central_store_util.get_system_summary(cluster_type)
        return Response(json.dumps(summary), status=200,
                        mimetype='application/json')
    except TendrlPerformanceMonitoringException as ex:
        return Response(
            'Failed to fetch %s system summary. Error: %s' % (cluster_type,
                                                              str(ex)),
            status=500,
            mimetype='application/json'
        )
def get_clusters_iops():
    try:
        cluster_list = None
        start_time = None
        end_time = None
        time_interval = None
        if len(request.args.items()) > 0:
            for request_param in request.args.items():
                if request_param[0] == "start_time":
                    start_time = request_param[1]
                elif request_param[0] == "end_time":
                    end_time = request_param[1]
                elif request_param[0] == "interval":
                    time_interval = request_param[1]
                elif request_param[0] == "cluster_ids":
                    # Read the ids from the matched parameter itself rather
                    # than assuming cluster_ids is the first query argument.
                    cluster_list = request_param[1].split(",")
        iops = []
        ret_code = 200
        exs = ''
        if cluster_list:
            for index, node in enumerate(cluster_list):
                uuid_string = cluster_list[index].strip()
                if UUID(uuid_string, version=4).hex == \
                        uuid_string.replace('-', ''):
                    cluster_list[index] = cluster_list[index].strip()
                else:
                    raise TendrlPerformanceMonitoringException(
                        'Cluster id %s in the parameter is not a valid '
                        'uuid' % uuid_string)
            iops, ret_code, exs = \
                central_store_util.get_cluster_iops(
                    cluster_list,
                    time_interval=time_interval,
                    start_time=start_time,
                    end_time=end_time
                )
        else:
            iops, ret_code, exs = \
                central_store_util.get_cluster_iops(
                    time_interval=time_interval,
                    start_time=start_time,
                    end_time=end_time
                )
        return Response(json.dumps(iops), status=ret_code,
                        mimetype='application/json')
    except (etcd.EtcdKeyNotFound, etcd.EtcdConnectionFailed, ValueError,
            SyntaxError, etcd.EtcdException,
            TendrlPerformanceMonitoringException, TypeError) as ex:
        return Response(str(ex), status=500, mimetype='application/json')
def get_configs(self):
    # TODO(Anmol) : Attempt reading:
    # /_tendrl/config/performance_monitoring/clusters/{cluster-id} and if
    # not already present, default back to defaults in:
    # /_tendrl/config/performance_monitoring
    try:
        configs = ''
        conf = tendrl_ns.etcd_orm.client.read(
            '/_tendrl/config/performance_monitoring')
        configs = conf.value
        return yaml.safe_load(configs)
    except (EtcdKeyNotFound, EtcdConnectionFailed, ValueError,
            SyntaxError, EtcdException) as ex:
        LOG.error('Fetching monitoring configurations failed. Error %s'
                  % ex)
        raise TendrlPerformanceMonitoringException(str(ex))
def get_nodes_details():
    nodes_dets = []
    try:
        nodes = NS._int.client.read('/nodes/')
        for node in nodes.leaves:
            if node.key.startswith('/nodes/'):
                node_id = (node.key.split('/')[2]).encode('ascii', 'ignore')
                fqdn = (NS._int.client.read(
                    '/nodes/%s/NodeContext/fqdn' % node_id
                ).value).encode('ascii', 'ignore')
                nodes_dets.append({'node_id': node_id, 'fqdn': fqdn})
        return nodes_dets
    except EtcdKeyNotFound:
        return nodes_dets
    except EtcdConnectionFailed as ex:
        raise TendrlPerformanceMonitoringException(str(ex))
def get_nodes_details(self):
    nodes_dets = []
    try:
        nodes = tendrl_ns.etcd_orm.client.read('/nodes/', recursive=True)
        for node in nodes._children:
            if node['key'].startswith('/nodes/'):
                node_id = (node['key'][len('/nodes/'):]).encode(
                    'ascii', 'ignore')
                fqdn = (tendrl_ns.etcd_orm.client.read(
                    '%s/NodeContext/fqdn' % (node['key']),
                    recursive=True
                ).value).encode('ascii', 'ignore')
                nodes_dets.append({'node_id': node_id, 'fqdn': fqdn})
        return nodes_dets
    except EtcdKeyNotFound:
        return nodes_dets
    except EtcdConnectionFailed as ex:
        raise TendrlPerformanceMonitoringException(str(ex))
def initiate_config_generation(node_det):
    try:
        plugin = NodeMonitoringPlugin(
            plugin_name=node_det['plugin'],
            node_id=node_det.get('node_id')
        )
        if plugin.exists():
            # More powers, such as fixed retrials, can be added here. This
            # is the common point through which all monitoring plugin
            # configuration jobs land in etcd, so any action here is
            # reflected in all of them.
            return
        job_params = {
            'node_ids': [node_det.get('node_id')],
            "run": 'node_monitoring.flows.ConfigureCollectd',
            'type': 'monitoring',
            "parameters": {
                'plugin_name': node_det['plugin'],
                'plugin_conf_params': json.dumps(
                    node_det['plugin_conf']
                ).encode('utf-8'),
                'Node.fqdn': node_det['fqdn'],
                'Service.name': 'collectd',
            },
        }
        job_id = str(uuid.uuid4())
        Job(
            job_id=job_id,
            status='new',
            payload=job_params,
        ).save()
        NodeMonitoringPlugin(
            plugin_name=node_det['plugin'],
            node_id=node_det.get('node_id'),
            job_id=job_id
        ).save(update=False)
    except (EtcdException, EtcdConnectionFailed, Exception) as ex:
        raise TendrlPerformanceMonitoringException(
            'Failed to initiate monitoring configuration for plugin '
            '%s on %s with parameters %s. Error: %s' % (
                node_det['plugin'],
                node_det['fqdn'],
                json.dumps(node_det['plugin_conf']),
                str(ex)
            )
        )
def get_configs():
    # TODO(Anmol) : Attempt reading:
    # /_tendrl/config/performance_monitoring/clusters/{cluster-id} and if
    # not already present, default back to defaults in:
    # /_tendrl/config/performance_monitoring
    try:
        configs = ''
        conf = NS._int.client.read('_NS/performance_monitoring/config')
        configs = conf.value
        return yaml.safe_load(configs)
    except (EtcdKeyNotFound, EtcdConnectionFailed, ValueError,
            SyntaxError, EtcdException) as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": 'Fetching monitoring configurations failed.',
                    "exception": ex
                }
            )
        )
        raise TendrlPerformanceMonitoringException(str(ex))
def get_timeseriesnamefromresource(self, **kwargs):
    # If in future this function starts to appear more plugin
    # specific, move it from here to the respective TimeSeriesDBPlugin
    delimeter = self.get_plugin().get_delimeter()
    resource_name = kwargs['resource_name']
    if 'utilization_type' in kwargs:
        kwargs['utilization_type'] = self.get_plugin().get_utilizationtype(
            resource_name,
            kwargs['utilization_type']
        )
    pattern = {
        pm_consts.SYSTEM_UTILIZATION: '$sds_type{0}utilization{0}'
                                      '$utilization_type',
        pm_consts.CLUSTER_UTILIZATION: 'cluster_$cluster_id{0}'
                                       'cluster_utilization{0}'
                                       '$utilization_type',
        pm_consts.CLUSTER_THROUGHPUT: 'cluster_$cluster_id{0}'
                                      'throughput{0}$network_type{0}'
                                      '$utilization_type',
        pm_consts.SYSTEM_THROUGHPUT: '$sds_type{0}'
                                     'throughput{0}$network_type{0}'
                                     '$utilization_type',
        pm_consts.NODE_THROUGHPUT: '$node_name{0}'
                                   'network_throughput-$network_type{0}'
                                   '$utilization_type',
        pm_consts.LATENCY: 'ping{0}ping-$underscored_monitoring_node_name',
        pm_consts.IOPS: 'cluster_$cluster_id{0}cluster_iops_read_write{0}'
                        '$utilization_type',
        pm_consts.SWAP: 'swap{0}$utilization_type',
        pm_consts.SWAP_TOTAL: '$utilization_type',
        pm_consts.CPU: '$underscored_node_name{0}cpu{0}cpu_system_user{0}'
                       '$utilization_type',
        pm_consts.STORAGE: '$underscored_node_name{0}storage{0}'
                           '$utilization_type',
        pm_consts.CLUSTER_IOPS: 'cluster_$cluster_id{0}'
                                'cluster_iops_read_write{0}gauge-total'
    }
    if not pattern.get(resource_name):
        raise TendrlPerformanceMonitoringException(
            'No pattern found for the requested resource %s.'
            % resource_name
        )
    return Template(
        pattern.get(resource_name).format(delimeter)
    ).substitute(kwargs)
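# Illustrative sketch (not part of the original module): how one of the
# patterns above resolves to a concrete series name. The delimiter and the
# keyword values are assumed examples, not values read from the real plugin
# or configuration.
from string import Template as _Template

def _example_resolve_series_name():
    delimeter = '.'  # assumed Graphite-style delimiter
    pattern = 'cluster_$cluster_id{0}cluster_utilization{0}$utilization_type'
    return _Template(pattern.format(delimeter)).substitute(
        cluster_id='c1',                       # hypothetical cluster id
        utilization_type='percent_bytes-used'  # already-mapped utilization field
    )
    # -> 'cluster_c1.cluster_utilization.percent_bytes-used'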
def get_metrics(self, entity_name):
    url = 'http://%s:%s/metrics/index.json' % (self.host, str(self.port))
    try:
        time.sleep(5)
        resp = self.http.request('GET', url, timeout=5)
        if resp.status != 200:
            raise TendrlPerformanceMonitoringException(
                'Request status code: %s' % str(resp.status)
            )
        data = resp.data
        metrics = ast.literal_eval(data)
        result = []
        prefix = "%s.%s." % (self.prefix, entity_name.replace('.', '_'))
        split_metrics = []
        for metric in metrics:
            if metric.startswith(prefix):
                split_metrics = metric.split(prefix)
                result.append(split_metrics[1])
        return str(result)
    except (ValueError, Exception) as ex:
        LOG.error('Failed to get metrics for %s. Error: %s' % (entity_name,
                                                               ex),
                  exc_info=True)
def initiate_config_generation(self, conf_name, data, node_det):
    try:
        job = {
            'node_ids': [node_det.get('node_id')],
            "run": 'tendrl.node_monitoring.flows.'
                   'configure_collectd.ConfigureCollectd',
            'status': 'new',
            'type': 'monitoring',
            'integration_id': tendrl_ns.tendrl_context.integration_id,
            "parameters": {
                'plugin_name': conf_name,
                'plugin_conf_params': json.dumps(data),
                'Node.fqdn': node_det['fqdn'],
                'Service.name': 'collectd',
            },
        }
        tendrl_ns.etcd_orm.client.write("/queue/%s" % str(uuid.uuid4()),
                                        json.dumps(job))
    except (EtcdException, EtcdConnectionFailed) as ex:
        LOG.error(
            'Failed to initiate monitoring configuration for plugin '
            '%s on %s with parameters %s. Error: %s' % (
                conf_name, node_det['fqdn'], data, ex)
        )
        raise TendrlPerformanceMonitoringException(str(ex))