예제 #1
0
 def get_net_storage_utilization(self, node):
     """Compute the overall storage utilization of ``node``.

     Reads the latest ``df-*.df_complex-used`` and ``df-*.df_complex-free``
     stats through ``get_latest_stats``, adds up every reading that is not
     NaN, pushes the resulting percentage to the time series plugin and
     returns a summary dict.

     :param node: node id, resolved to a name via ``central_store_util``.
     :returns: dict with string values under ``used``, ``total``,
         ``percent_used`` and ``updated_at``; ``None`` when used + free is
         zero or when ``TendrlPerformanceMonitoringException`` is raised.
     """
     def _sum_valid(readings):
         # Add up the readings one by one, ignoring NaN samples.
         running = 0.0
         for reading in readings:
             value = float(reading)
             if not math.isnan(value):
                 running += value
         return running

     try:
         used = _sum_valid(get_latest_stats(node, 'df-*.df_complex-used'))
         free = _sum_valid(get_latest_stats(node, 'df-*.df_complex-free'))
         capacity = free + used
         if capacity == 0:
             # Nothing to divide by; no utilization can be reported.
             return None
         percent_used = (used * 100) / capacity
         node_name = central_store_util.get_node_name_from_id(node)
         plugin = NS.time_series_db_manager.get_plugin()
         series_name = \
             NS.time_series_db_manager.get_timeseriesnamefromresource(
                 underscored_node_name=node_name.replace('.', '_'),
                 resource_name=pm_consts.STORAGE,
                 utilization_type=pm_consts.PERCENT_USED)
         plugin.push_metrics(series_name, percent_used)
         summary = {
             'used': str(used),
             'total': str(capacity),
             'percent_used': str(percent_used),
             'updated_at': datetime.datetime.now().isoformat()
         }
         return summary
     except TendrlPerformanceMonitoringException:
         # Already reported where it was raised; signal "no data" here.
         return None
예제 #2
0
def get_latest_node_stat(node, resource):
    """Return the latest stat of ``resource`` for the node with id ``node``.

    The node id is resolved to a node name with
    ``central_store_util.get_node_name_from_id`` and the lookup itself is
    delegated to ``get_latest_stat``.

    :param node: node id.
    :param resource: resource identifier handed to ``get_latest_stat``.
    :returns: whatever ``get_latest_stat`` returns.
    :raises: any error raised by the two calls (e.g. ``ValueError``,
        ``urllib3.exceptions.HTTPError``,
        ``TendrlPerformanceMonitoringException``) propagates unchanged.
    """
    # Deliberately no try/except: a handler that only does ``raise ex`` adds
    # nothing and, on Python 2, replaces the original traceback with one
    # that starts at the re-raise.
    node_name = central_store_util.get_node_name_from_id(node)
    return get_latest_stat(node_name, resource)
예제 #3
0
def get_nodestats(node_id, resource_name):
    """Serve the stats of ``resource_name`` for a node as an HTTP response.

    The optional query parameters ``start_time``, ``end_time`` and
    ``interval`` are read from ``request.args`` (missing ones stay ``None``)
    and forwarded to the time series plugin's ``get_metric_stats``.

    :param node_id: node id, resolved to a node name before the query.
    :param resource_name: resource whose stats are requested.
    :returns: ``Response`` with status 200 carrying the plugin output, or a
        ``Response`` with status 500 carrying ``str(ex)`` when one of the
        listed exceptions is raised.
    """
    try:
        node_name = central_store_util.get_node_name_from_id(node_id)
        query = request.args
        start_time = query.get("start_time")
        end_time = query.get("end_time")
        time_interval = query.get("interval")
        plugin = NS.time_series_db_manager.get_plugin()
        stats = plugin.get_metric_stats(
            node_name,
            resource_name,
            time_interval=time_interval,
            start_time=start_time,
            end_time=end_time
        )
        return Response(stats, status=200, mimetype='application/json')
    except (ValueError, etcd.EtcdKeyNotFound, etcd.EtcdConnectionFailed,
            SyntaxError, etcd.EtcdException, TypeError,
            TendrlPerformanceMonitoringException) as ex:
        return Response(str(ex), status=500, mimetype='application/json')
예제 #4
0
def get_latest_stats(node, resource):
    """Return the latest readings of ``resource`` for the node ``node``.

    Queries the time series plugin with ``'latest'`` as the third argument
    and extracts every substring found between ``Current:`` and ``Max`` in
    the returned text.

    :param node: node id, resolved to a node name before the query.
    :param resource: resource identifier passed to the plugin.
    :returns: list of the captured substrings (as produced by
        ``re.findall``).
    :raises TendrlPerformanceMonitoringException: when the plugin returns
        an empty value or ``"[]"``, or when the plugin itself raises it; a
        debug event is emitted before the exception is re-raised.
    """
    try:
        name = central_store_util.get_node_name_from_id(node)
        plugin = NS.time_series_db_manager.get_plugin()
        raw_stats = plugin.get_metric_stats(name, resource, 'latest')
        nothing_yet = raw_stats == "[]" or not raw_stats
        if nothing_yet:
            raise TendrlPerformanceMonitoringException(
                'Stats not yet available in time series db'
            )
        return re.findall('Current:(.+?)Max', raw_stats)
    except TendrlPerformanceMonitoringException as ex:
        publisher = NS.publisher_id
        failure_note = ('Failed to get latest stats of %s of '
                        'node %s for node summary.' % (resource, node))
        details = {"message": failure_note, "exception": ex}
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=publisher,
                payload=details
            )
        )
        raise ex
예제 #5
0
def get_latest_node_stat(node, resource):
    """Return the latest stat of ``resource`` for the node with id ``node``.

    The node id is resolved to a node name with
    ``central_store_util.get_node_name_from_id`` and the lookup itself is
    delegated to ``get_latest_stat``.

    :param node: node id.
    :param resource: resource identifier handed to ``get_latest_stat``.
    :returns: whatever ``get_latest_stat`` returns.
    :raises: any error raised by the two calls (e.g.
        ``TendrlPerformanceMonitoringException``) propagates unchanged.
    """
    # Deliberately no try/except: a handler that only does ``raise ex`` adds
    # nothing and, on Python 2, replaces the original traceback with one
    # that starts at the re-raise.
    node_name = central_store_util.get_node_name_from_id(node)
    return get_latest_stat(node_name, resource)
예제 #6
0
def get_stat_types(node_id):
    """Serve the metrics known to the time series plugin for a node.

    :param node_id: node id, resolved to a node name before the query.
    :returns: ``Response`` with status 200 carrying the output of the
        plugin's ``get_metrics``, or a ``Response`` with status 500
        carrying ``str(ex)`` when one of the listed exceptions is raised.
    """
    try:
        name = central_store_util.get_node_name_from_id(node_id)
        plugin = NS.time_series_db_manager.get_plugin()
        metrics = plugin.get_metrics(name)
        reply = Response(metrics, status=200, mimetype='application/json')
    except (ValueError, etcd.EtcdKeyNotFound, etcd.EtcdConnectionFailed,
            SyntaxError, etcd.EtcdException, TypeError,
            TendrlPerformanceMonitoringException) as ex:
        reply = Response(str(ex), status=500, mimetype='application/json')
    return reply
예제 #7
0
 def get_node_disk_iops_stats(self,
                              node_id,
                              time_interval=None,
                              start_time=None,
                              end_time=None):
     """Fetch the combined disk read/write ops stats of a node.

     Builds a ``sumSeries(averageSeries(...write), averageSeries(...read))``
     target over the node's ``disk-*.disk_ops`` series, requests it as JSON
     from ``http://<host>:<port>/render`` and removes ``[null, <digits>]``
     entries from the response body before returning it.

     :param node_id: node id; resolved to a node name whose dots are
         replaced by underscores.
     :param time_interval: ``'latest'`` wraps the target in
         ``cactiStyle(...)``; any other truthy value is parsed with
         ``self.parse_time`` and used as the start time.
     :param start_time: optional start, parsed with ``self.parse_time`` and
         sent as ``from``.
     :param end_time: optional end, parsed with ``self.parse_time`` and
         sent as ``until``.
     :returns: the response body with the null entries stripped.
     :raises TendrlPerformanceMonitoringException: when the request fails
         or the response status is not 200.
     """
     node_name = central_store_util.get_node_name_from_id(node_id)
     node_name = node_name.replace('.', '_')
     target = Template(
         'sumSeries(averageSeries($prefix.$node_name.disk-*.disk_ops.write'
         '), averageSeries($prefix.$node_name.disk-*.disk_ops.read))'
     ).substitute(
         prefix=self.prefix,
         node_name=node_name,
     )
     target = urllib.quote(target)
     if time_interval:
         if time_interval == 'latest':
             target = "cactiStyle(%s)" % target
         else:
             start_time = self.parse_time(time_interval)
     # NOTE(review): a start time derived from time_interval goes through
     # parse_time a second time below; kept as in the original, confirm
     # that parse_time tolerates already-parsed values.
     if start_time:
         start_time = self.parse_time(start_time)
     if end_time:
         end_time = self.parse_time(end_time)
     url = 'http://%s:%s/render?target=%s&format=json' % (
         self.host, str(self.port), target)
     if start_time:
         url = "%s&from=%s" % (url, start_time)
     if end_time:
         url = "%s&until=%s" % (url, end_time)
     try:
         stats = self.http.request('GET', url, timeout=5)
         if stats.status != 200:
             # The exception used to be constructed but never raised, so a
             # failed request silently returned None.
             raise TendrlPerformanceMonitoringException(
                 'Request status code: %s' % str(stats.status))
         # TODO(Anmol): remove nulls from graphite data before returning
         # data. Explore the possibility of achieving this using some
         # tuning factor in graphite.
         data = re.sub(r'\[null, [0-9]+\], ', '', stats.data)
         return re.sub(r', \[null, [0-9]+\]', '', data)
     except Exception as ex:
         # The message has three placeholders; the error text was missing
         # from the arguments, which made the formatting itself fail and
         # hide the real error.
         Event(
             ExceptionMessage(priority="debug",
                              publisher=NS.publisher_id,
                              payload={
                                  "message":
                                  'Failed to fetch %s stats using url %s'
                                  '. Error %s' % (target, url, str(ex)),
                                  "exception":
                                  ex
                              }))
         raise TendrlPerformanceMonitoringException(str(ex))
예제 #8
0
 def get_node_brick_status_counts(self, node_id):
     """Count the bricks and brick alerts that belong to a node.

     A brick is attributed to the node when its ``hostname`` equals the
     node name or the ip indexed for the node under ``/indexes/ip``.

     :param node_id: id of the node to summarise.
     :returns: dict with the keys ``stopped``, ``total``,
         ``pm_consts.WARNING_ALERTS`` and ``pm_consts.CRITICAL_ALERTS``.
         Any exception raised while counting is reported as an info event
         and the counts gathered so far are returned.
     """
     node_name = central_store_util.get_node_name_from_id(node_id)
     ip_indexes = etcd_read_key('/indexes/ip')
     node_ip = ''
     # No early exit: when several ips map to the node the last one wins.
     for candidate_ip, indexed_node_id in ip_indexes.iteritems():
         if node_id == indexed_node_id:
             node_ip = candidate_ip
     counts = {
         'stopped': 0,
         'total': 0,
         pm_consts.WARNING_ALERTS: 0,
         pm_consts.CRITICAL_ALERTS: 0
     }
     try:
         cluster_id = central_store_util.get_node_cluster_id(node_id)
         if cluster_id:
             volumes = self.get_cluster_volumes(cluster_id)
             for volume in volumes.itervalues():
                 for brick in volume.get('Bricks', {}).itervalues():
                     host = brick['hostname']
                     if not (host == node_name or host == node_ip):
                         continue
                     if brick['status'] == 'Stopped':
                         counts['stopped'] += 1
                     counts['total'] += 1
         crit_alerts, warn_alerts = parse_resource_alerts(
             'brick', pm_consts.CLUSTER, cluster_id=cluster_id)
         counts[pm_consts.CRITICAL_ALERTS] = sum(
             1 for alert in crit_alerts if alert['node_id'] == node_id)
         counts[pm_consts.WARNING_ALERTS] = sum(
             1 for alert in warn_alerts if alert['node_id'] == node_id)
     except Exception as ex:
         details = {
             "message": "Exception caught fetching node brick"
             " status wise counts",
             "exception": ex
         }
         Event(
             Message(priority="info",
                     publisher=NS.publisher_id,
                     payload=details))
     return counts
예제 #9
0
 def get_net_host_cpu_utilization(self, node):
     """Compute the CPU utilization of ``node`` as user + system percent.

     Reads the latest ``cpu.percent-user`` and ``cpu.percent-system`` stats,
     pushes their sum to the time series plugin and returns a summary dict.

     :param node: node id.
     :returns: dict with string values under ``percent_used`` and
         ``updated_at``; ``None`` when
         ``TendrlPerformanceMonitoringException`` is raised.
     """
     try:
         user_share = get_latest_node_stat(node, 'cpu.percent-user')
         system_share = get_latest_node_stat(node, 'cpu.percent-system')
         node_name = central_store_util.get_node_name_from_id(node)
         plugin = NS.time_series_db_manager.get_plugin()
         series_name = \
             NS.time_series_db_manager.get_timeseriesnamefromresource(
                 underscored_node_name=node_name.replace('.', '_'),
                 resource_name=pm_consts.CPU,
                 utilization_type=pm_consts.PERCENT_USED)
         combined = user_share + system_share
         plugin.push_metrics(series_name, combined)
         summary = {
             'percent_used': str(combined),
             'updated_at': datetime.datetime.now().isoformat()
         }
         return summary
     except TendrlPerformanceMonitoringException:
         # Already reported where it was raised; signal "no data" here.
         return None
예제 #10
0
 def calculate_host_summary(self, node):
     """Recompute and persist the ``NodeSummary`` of ``node``.

     Collects cpu, memory, storage and swap utilization, the alert count
     and the sds summary. Every utilization that came back as ``None`` is
     replaced by the value from the previously stored summary (or by blank
     placeholders when none is stored). The new summary is then saved.

     :param node: node id.
     :returns: ``None``. Returns early, after emitting a debug event, when
         loading the previous summary fails with anything other than
         ``EtcdKeyNotFound``. Failures while building or saving the new
         summary are reported as a debug event and not re-raised.
     """
     gevent.sleep(0.1)
     cpu = self.get_net_host_cpu_utilization(node)
     memory = self.get_net_host_memory_utilization(node)
     storage = self.get_net_storage_utilization(node)
     swap = self.get_net_host_swap_utilization(node)
     alert_count = self.get_alert_count(node)
     sds_det = NS.sds_monitoring_manager.get_node_summary(node)

     # Blank stand-in used when no summary has been stored yet.
     previous = NodeSummary(
         node_id=node,
         name='',
         status='',
         role='',
         cluster_name='',
         cpu_usage={'percent_used': '', 'updated_at': ''},
         memory_usage={
             'percent_used': '', 'updated_at': '', 'used': '', 'total': ''
         },
         storage_usage={
             'percent_used': '', 'total': '', 'used': '', 'updated_at': ''
         },
         swap_usage={
             'percent_used': '', 'updated_at': '', 'used': '', 'total': ''
         },
         sds_det={},
         alert_count=alert_count)
     try:
         previous = previous.load()
     except EtcdKeyNotFound:
         # First run for this node: keep the blank stand-in.
         pass
     except (EtcdConnectionFailed, Exception) as ex:
         load_failure = {
             "message": 'Failed to fetch previously computed '
                        'summary from etcd.',
             "exception": ex
         }
         Event(
             ExceptionMessage(priority="debug",
                              publisher=NS.publisher_id,
                              payload=load_failure))
         return

     # Fall back to the stored values for anything that was not available.
     if cpu is None:
         cpu = previous.cpu_usage
     if memory is None:
         memory = previous.memory_usage
     if storage is None:
         storage = previous.storage_usage
     if swap is None:
         swap = previous.swap_usage

     try:
         NodeSummary(
             name=central_store_util.get_node_name_from_id(node),
             node_id=node,
             status=self.get_node_status(node),
             role=central_store_util.get_node_role(node),
             cluster_name=central_store_util.get_node_cluster_name(node),
             cpu_usage=cpu,
             memory_usage=memory,
             storage_usage=storage,
             swap_usage=swap,
             selinux_mode=central_store_util.get_node_selinux_mode(node),
             sds_det=sds_det,
             alert_count=alert_count
         ).save(update=False)
     except Exception as ex:
         save_failure = {
             "message": 'Exception caught while trying to '
                        'save summary for node %s' % str(node),
             "exception": ex
         }
         Event(
             ExceptionMessage(priority="debug",
                              publisher=NS.publisher_id,
                              payload=save_failure))
예제 #11
0
 def get_node_brick_status_counts(self, node_id):
     """Count the bricks and brick alerts that belong to a node.

     A brick is attributed to the node when its ``hostname`` equals the
     node name or the first ip found for the node under ``/indexes/ip``.

     :param node_id: id of the node to summarise.
     :returns: dict with the keys ``stopped``, ``total``,
         ``pm_consts.WARNING_ALERTS`` and ``pm_consts.CRITICAL_ALERTS``.
         All-zero counts are returned when the node name or the ip index
         cannot be read (``EtcdKeyNotFound``); the listed exceptions raised
         while counting are reported as an info event and the counts
         gathered so far are returned.
     """
     def _report_error(text, error):
         # Emit an error-priority event for a failed etcd lookup.
         Event(
             ExceptionMessage(
                 priority="error",
                 publisher=NS.publisher_id,
                 payload={"message": text, "exception": error}
             )
         )

     counts = {
         'stopped': 0,
         'total': 0,
         pm_consts.WARNING_ALERTS: 0,
         pm_consts.CRITICAL_ALERTS: 0
     }
     try:
         node_name = central_store_util.get_node_name_from_id(node_id)
     except EtcdKeyNotFound as ex:
         _report_error(
             "Error fetching node name for node %s" % node_id, ex)
         return counts
     try:
         ip_indexes = etcd_read_key('/indexes/ip')
     except EtcdKeyNotFound as ex:
         _report_error("Error fetching ip indexes", ex)
         return counts

     # First ip whose index entry points at this node, '' when none does.
     node_ip = next(
         (ip for ip, indexed_node_id in ip_indexes.iteritems()
          if node_id == indexed_node_id),
         ''
     )
     try:
         cluster_id = central_store_util.get_node_cluster_id(node_id)
         if cluster_id:
             bricks = self.get_cluster_bricks(cluster_id)
             for brick in bricks.itervalues():
                 host = brick['hostname']
                 if not (host == node_name or host == node_ip):
                     continue
                 if 'status' in brick and brick['status'] == 'Stopped':
                     counts['stopped'] += 1
                 counts['total'] += 1
         crit_alerts, warn_alerts = parse_resource_alerts(
             'brick',
             pm_consts.CLUSTER,
             cluster_id=cluster_id
         )
         counts[pm_consts.CRITICAL_ALERTS] = sum(
             1 for alert in crit_alerts if alert['node_id'] == node_id)
         counts[pm_consts.WARNING_ALERTS] = sum(
             1 for alert in warn_alerts if alert['node_id'] == node_id)
     except (
         TendrlPerformanceMonitoringException,
         AttributeError,
         ValueError,
         KeyError
     ) as ex:
         details = {
             "message": "Exception caught fetching node brick"
             " status wise counts",
             "exception": ex
         }
         Event(
             Message(
                 priority="info",
                 publisher=NS.publisher_id,
                 payload=details
             )
         )
     return counts