Example #1
    def spawn_alerter_process(self, alert, metric, context):
        """
        Spawn a process to trigger an alert.

        This is used by smtp alerters so that matplotlib objects are cleared
        down when the spawned process terminates, as plt.savefig keeps its
        objects in memory until the process exits; this stops the alerter
        from leaking memory.  Seeing as data is surfaced and processed in the
        alert_smtp context, multiprocessing the alert creation and handling
        prevents any memory leaks in the parent.

        Added 20160814 relating to:

        * Bug #1558: Memory leak in Analyzer
        * Issue #21 Memory leak in Analyzer see https://github.com/earthgecko/skyline/issues/21

        Parameters as per :py:func:`skyline.analyzer.alerters.trigger_alert
        <analyzer.alerters.trigger_alert>`

        """

        trigger_alert(alert, metric, context)
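
The method body only calls trigger_alert; the process isolation described in
the docstring happens at the call site. A minimal sketch of how a caller
inside the Analyzer class might invoke it (variable names illustrative, not
from the Skyline source):

from multiprocessing import Process

p = Process(target=self.spawn_alerter_process, args=(alert, metric, context))
p.start()
p.join(30)  # give the alerter up to 30 seconds to run
if p.is_alive():
    p.terminate()  # memory held by plt.savefig is released with the process
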
Example #2
from optparse import OptionParser
import sys

# Project-local imports (assumed: this snippet comes from a script inside the
# Skyline tree, with settings and the alerters module importable)
import settings
from alerters import trigger_alert

parser = OptionParser()
parser.add_option("-t", "--trigger", dest="trigger", default=False,
                  help="Actually trigger the appropriate alerts (default is False)")

parser.add_option("-m", "--metric", dest="metric", default='skyline.horizon.queue_size',
                  help="Pass the metric to test (default is skyline.horizon.queue_size)")

(options, args) = parser.parse_args()

try:
    alerts_enabled = settings.ENABLE_ALERTS
    alerts = settings.ALERTS
except:
    print "Exception: Check your settings file for the existence of ENABLE_ALERTS and ALERTS"
    sys.exit()

print('Verifying alerts for: "' + options.metric + '"')

# Send alerts
if alerts_enabled:
    for alert in alerts:
        if alert[0] in options.metric:
            print('    Testing against "' + alert[0] + '" to send via ' + alert[1] + "...triggered")
            if options.trigger:
                trigger_alert(alert, options.metric)
        else:
            print('    Testing against "' + alert[0] + '" to send via ' + alert[1] + "...")
else:
    print('Alerts are disabled')
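
For reference, the matching above is a plain substring test against each
ALERTS tuple, not a regex (compare Example #5 below, which switches to
re.match). A minimal illustration, assuming tuples of the
(pattern, alerter, expiration) form used by settings.ALERTS:

alert = ('horizon', 'smtp', 1800)   # assumed (pattern, alerter, expiration)
metric = 'skyline.horizon.queue_size'
print(alert[0] in metric)           # True - matches anywhere in the name
print('queue' in metric)            # True - substrings need not be namespaces
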
Example #3
    def run(self):
        """
        Called when the process initializes.
        """
        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error(
                    'cloudbrain can\'t connect to redis at socket path %s' %
                    settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(
                    unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Discover unique metrics
            unique_metrics = list(
                self.redis_conn.smembers(settings.FULL_NAMESPACE +
                                         'unique_metrics'))

            pretty_unique_metrics = []  # metric without the FULL_NAMESPACE
            for metric in unique_metrics:
                pretty_unique_metrics.append(
                    metric.replace(settings.FULL_NAMESPACE, ""))

            # Write unique metrics to static webapp directory
            filename = path.abspath(
                path.join(path.dirname(__file__), '..',
                          settings.UNIQUE_METRICS))
            with open(filename, 'w') as fh:
                # Write the unique metrics list as plain JSON
                fh.write(json.dumps(pretty_unique_metrics))

            if len(unique_metrics) == 0:
                logger.info(
                    'no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info(
                        'WARNING: cloudbrain is set for more cores than needed.'
                    )
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                p.start()

            # Wait for the spawned processes to finish
            for p in pids:
                p.join()

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.anomaly_breakdown_q.get_nowait()
                    if key not in anomaly_breakdown.keys():
                        anomaly_breakdown[key] = value
                    else:
                        anomaly_breakdown[key] += value
                except Empty:
                    break

            while 1:
                try:
                    key, value = self.exceptions_q.get_nowait()
                    if key not in exceptions.keys():
                        exceptions[key] = value
                    else:
                        exceptions[key] += value
                except Empty:
                    break

            # Send alerts
            if settings.ENABLE_ALERTS:
                for alert in settings.ALERTS:
                    for metric in self.anomalous_metrics:
                        if alert[0] in metric[1]:
                            cache_key = 'last_alert.%s.%s' % (alert[1],
                                                              metric[1])
                            try:
                                last_alert = self.redis_conn.get(cache_key)
                                if not last_alert:
                                    self.redis_conn.setex(
                                        cache_key, alert[2], packb(metric[0]))
                                    trigger_alert(alert, metric)

                            except Exception as e:
                                logger.error("couldn't send alert: %s" % e)
            '''
            # Write anomalous_metrics to static webapp directory
            filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP))
            with open(filename, 'w') as fh:
                anomalous_metrics.sort(key=operator.itemgetter(1))
                fh.write('handle_data(%s)' % anomalous_metrics)

            '''

            # Read the unique metrics back from the static webapp file
            filename = path.abspath(
                path.join(path.dirname(__file__), '..',
                          settings.UNIQUE_METRICS))
            with open(filename, 'r') as f:
                json_data = f.read()
                unique_metrics = json.loads(json_data)

                anomalous_metrics_names = [
                    metric[1] for metric in self.anomalous_metrics
                ]
                for metric in unique_metrics:
                    if metric not in anomalous_metrics_names:
                        self.anomalous_metrics.append([0, str(metric)])

                # Write anomalous_metrics to static webapp directory
                filename = path.abspath(
                    path.join(path.dirname(__file__), '..',
                              settings.ANOMALY_DUMP))
                with open(filename, 'w') as fh:
                    # Make it JSONP with a handle_data() function
                    anomalous_metrics = list(self.anomalous_metrics)
                    anomalous_metrics.sort(key=operator.itemgetter(1))
                    fh.write('handle_data(%s)' % repr(anomalous_metrics))

            # Log progress
            logger.info('seconds to run    :: %.2f' % (time() - now))
            logger.info('total metrics     :: %d' % len(unique_metrics))
            logger.info('total analyzed    :: %d' %
                        (len(unique_metrics) - sum(exceptions.values())))
            logger.info('total anomalies   :: %d' %
                        len(self.anomalous_metrics))
            logger.info('exception stats   :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            self.send_graphite_metric('cloudbrain.analyzer.run_time',
                                      '%.2f' % (time() - now))
            self.send_graphite_metric(
                'cloudbrain.analyzer.total_analyzed',
                '%.2f' % (len(unique_metrics) - sum(exceptions.values())))

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE +
                                             settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration   :: %.2f' % time_human)
                self.send_graphite_metric('cloudbrain.analyzer.duration',
                                          '%.2f' % time_human)
                self.send_graphite_metric('cloudbrain.analyzer.projected',
                                          '%.2f' % projected)

            # Reset counters
            self.anomalous_metrics[:] = []

            # Sleep if it went too fast
            if time() - now < 5:
                logger.info('sleeping due to low run time...')
                sleep(10)
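
The anomaly dump written in this loop is JSONP rather than plain JSON: the
file body is a handle_data(...) call that the webapp loads with a script tag,
so the page receives the data without an XHR. The snippet interpolates the
Python repr of the list, which is close to but not strictly JSON; a sketch of
the equivalent construction with json.dumps (data illustrative):

import json

anomalous_metrics = [[3.2, 'stats.timers.foo'], [0, 'skyline.horizon.queue_size']]
anomalous_metrics.sort(key=lambda m: m[1])  # sort by metric name, as above
payload = 'handle_data(%s)' % json.dumps(anomalous_metrics)
# payload == 'handle_data([[0, "skyline.horizon.queue_size"], [3.2, "stats.timers.foo"]])'
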
Example #4
try:
    syslog_enabled = settings.SYSLOG_ENABLED
except:
    syslog_enabled = False

print('Verifying alerts for: "' + options.metric + '"')

# Send alerts
context = 'Analyzer'
if alerts_enabled:
    print('Testing Analyzer alerting on ' + options.metric)
    for alert in alerts:
        if alert[0] == options.metric:
            if options.trigger and options.trigger in alert[1]:
                print('    Testing Analyzer alerting - against "' + alert[0] + '" to send via ' + alert[1] + "...triggered")
                metric = (0, options.metric, 12345)
                trigger_alert(alert, metric, context)
                if syslog_enabled:
                    print('    Testing Analyzer alerting - against "' + alert[0] + '" to send via syslog ' + "...triggered")
                    alert = (alert[0], 'syslog')
                    trigger_alert(alert, metric, context)
else:
    print('Alerts are disabled')

# Mirage alerts
try:
    mirage_enabled = settings.ENABLE_MIRAGE
    mirage_alerts_enabled = settings.MIRAGE_ENABLE_ALERTS
except:
    mirage_alerts_enabled = False

if mirage_alerts_enabled:
Example #5
    def run(self):
        """
        - Called when the process initializes.

        - Determine if Redis is up and discover the number of `unique metrics`.

        - Divide the `unique_metrics` between the number of `ANALYZER_PROCESSES`
          and assign each process a set of metrics to analyse for anomalies.

        - Wait for the processes to finish.

        - Determine whether any anomalous metrics require:

            - Alerting on (and setting the `EXPIRATION_TIME` key in Redis for
              the alert).
            - Feeding to another module, e.g. mirage.
            - Alerting to syslog.

        - Populate the webapp json with the anomalous_metrics details.

        - Log the details about the run to the skyline analyzer log.

        - Send skyline.analyzer metrics to `GRAPHITE_HOST`
        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error('error - bin/%s.d log management seems to have failed, continuing' % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        if not os.path.exists(settings.SKYLINE_TMP_DIR):
            if python_version == 2:
                mode_arg = int('0755', 8)
            if python_version == 3:
                mode_arg = 0o755
            os.makedirs(settings.SKYLINE_TMP_DIR, mode_arg)

        # Initiate the algorithm timings if Analyzer is configured to send the
        # algorithm_breakdown metrics with ENABLE_ALGORITHM_RUN_METRICS
        algorithm_tmp_file_prefix = settings.SKYLINE_TMP_DIR + '/' + skyline_app + '.'
        algorithms_to_time = []
        if send_algorithm_run_metrics:
            algorithms_to_time = settings.ALGORITHMS

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Report app up
            self.redis_conn.setex(skyline_app, 120, now)

            # Discover unique metrics
            unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Use count files rather than multiprocessing.Value to enable
            # metrics for algorithm run times, etc
            for algorithm in algorithms_to_time:
                algorithm_count_file = algorithm_tmp_file_prefix + algorithm + '.count'
                algorithm_timings_file = algorithm_tmp_file_prefix + algorithm + '.timings'
                # with open(algorithm_count_file, 'a') as f:
                with open(algorithm_count_file, 'w') as f:
                    pass
                with open(algorithm_timings_file, 'w') as f:
                    pass

            # Remove any existing algorithm.error files from any previous runs
            # that did not cleanup for any reason
            pattern = '%s.*.algorithm.error' % skyline_app
            try:
                for f in os.listdir(settings.SKYLINE_TMP_DIR):
                    if re.search(pattern, f):
                        try:
                            os.remove(os.path.join(settings.SKYLINE_TMP_DIR, f))
                            logger.info('cleaning up old error file - %s' % (str(f)))
                        except OSError:
                            pass
            except:
                logger.error('failed to cleanup algorithm.error files ' + traceback.format_exc())

            # Spawn processes
            pids = []
            spawned_pids = []
            pid_count = 0
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info('WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                pid_count += 1
                logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(settings.ANALYZER_PROCESSES)))
                p.start()
                spawned_pids.append(p.pid)

            # Send wait signal to zombie processes
            # for p in pids:
            #     p.join()
            # Self monitor processes and terminate if any spin_process has run
            # for longer than 180 seconds - 20160512 @earthgecko
            p_starts = time()
            while time() - p_starts <= settings.MAX_ANALYZER_PROCESS_RUNTIME:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info('%s :: %s spin_process/es completed in %.2f seconds' % (skyline_app, str(settings.ANALYZER_PROCESSES), time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info('%s :: timed out, killing all spin_process processes' % (skyline_app))
                for p in pids:
                    p.terminate()
                    p.join()

            # Log the last reported error by any algorithms that errored in the
            # spawned processes from algorithms.py
            for completed_pid in spawned_pids:
                logger.info('spin_process with pid %s completed' % (str(completed_pid)))
                for algorithm in settings.ALGORITHMS:
                    algorithm_error_file = '%s/%s.%s.%s.algorithm.error' % (
                        settings.SKYLINE_TMP_DIR, skyline_app,
                        str(completed_pid), algorithm)
                    if os.path.isfile(algorithm_error_file):
                        logger.info(
                            'error - spin_process with pid %s has reported an error with the %s algorithm' % (
                                str(completed_pid), algorithm))
                        try:
                            with open(algorithm_error_file, 'r') as f:
                                error_string = f.read()
                            logger.error('%s' % str(error_string))
                        except:
                            logger.error('failed to read %s error file' % algorithm)
                        try:
                            os.remove(algorithm_error_file)
                        except OSError:
                            pass

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.anomaly_breakdown_q.get_nowait()
                    if key not in anomaly_breakdown.keys():
                        anomaly_breakdown[key] = value
                    else:
                        anomaly_breakdown[key] += value
                except Empty:
                    break

            while 1:
                try:
                    key, value = self.exceptions_q.get_nowait()
                    if key not in exceptions.keys():
                        exceptions[key] = value
                    else:
                        exceptions[key] += value
                except Empty:
                    break

            # Send alerts
            if settings.ENABLE_ALERTS:
                for alert in settings.ALERTS:
                    for metric in self.anomalous_metrics:
                        ALERT_MATCH_PATTERN = alert[0]
                        METRIC_PATTERN = metric[1]
                        alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                        pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                        if pattern_match:
                            cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
                            try:
                                last_alert = self.redis_conn.get(cache_key)
                                if not last_alert:
                                    try:
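                                        # alert[3] (the Mirage SECOND_ORDER_RESOLUTION
                                        # duration) only exists on Mirage-enabled alert
                                        # tuples; if it is missing, the lookup raises
                                        # IndexError and control falls through to the
                                        # except below, which sends a normal full
                                        # duration Analyzer alert instead.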
                                        SECOND_ORDER_RESOLUTION_FULL_DURATION = alert[3]
                                        logger.info('mirage check      :: %s' % (metric[1]))
                                        # Write anomalous metric to test at second
                                        # order resolution by crucible to the check
                                        # file
                                        metric_timestamp = int(time())
                                        anomaly_check_file = '%s/%s.%s.txt' % (settings.MIRAGE_CHECK_PATH, metric_timestamp, metric[1])
                                        with open(anomaly_check_file, 'w') as fh:
                                            # metric_name, anomalous datapoint, hours to resolve, timestamp
                                            fh.write('metric = "%s"\nvalue = "%s"\nhours_to_resolve = "%s"\nmetric_timestamp = "%s"\n' % (metric[1], metric[0], alert[3], metric_timestamp))
                                        if python_version == 2:
                                            mode_arg = int('0644', 8)
                                        if python_version == 3:
                                            mode_arg = 0o644
                                        os.chmod(anomaly_check_file, mode_arg)

                                        logger.info('added mirage check :: %s,%s,%s' % (metric[1], metric[0], alert[3]))
                                        # Add to the mirage_metrics list
                                        base_name = METRIC_PATTERN.replace(settings.FULL_NAMESPACE, '', 1)
                                        metric = [metric[0], base_name]
                                        self.mirage_metrics.append(metric)
                                        # Alert for analyzer if enabled
                                        if settings.ENABLE_FULL_DURATION_ALERTS:
                                            self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                            trigger_alert(alert, metric)
                                    except:
                                        self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                        trigger_alert(alert, metric)
                            except Exception as e:
                                logger.error('error :: could not send alert: %s' % e)

            # Push to crucible
#            if len(self.crucible_anomalous_metrics) > 0:
#                logger.info('to do - push to crucible')

            # Write anomalous_metrics to static webapp directory
            if len(self.anomalous_metrics) > 0:
                filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP))
                with open(filename, 'w') as fh:
                    # Make it JSONP with a handle_data() function
                    anomalous_metrics = list(self.anomalous_metrics)
                    anomalous_metrics.sort(key=operator.itemgetter(1))
                    fh.write('handle_data(%s)' % anomalous_metrics)

            # Use count files rather than multiprocessing.Value to enable
            # metrics for algorithm run times, etc
            for algorithm in algorithms_to_time:
                algorithm_count_file = algorithm_tmp_file_prefix + algorithm + '.count'
                algorithm_timings_file = algorithm_tmp_file_prefix + algorithm + '.timings'

                try:
                    algorithm_count_array = []
                    with open(algorithm_count_file, 'r') as f:
                        for line in f:
                            value_string = line.replace('\n', '')
                            unquoted_value_string = value_string.replace("'", '')
                            float_value = float(unquoted_value_string)
                            algorithm_count_array.append(float_value)
                except:
                    algorithm_count_array = False

                if not algorithm_count_array:
                    continue

                number_of_times_algorithm_run = len(algorithm_count_array)
                logger.info(
                    'algorithm run count - %s run %s times' % (
                        algorithm, str(number_of_times_algorithm_run)))
                if number_of_times_algorithm_run == 0:
                    continue

                try:
                    algorithm_timings_array = []
                    with open(algorithm_timings_file, 'r') as f:
                        for line in f:
                            value_string = line.replace('\n', '')
                            unquoted_value_string = value_string.replace("'", '')
                            float_value = float(unquoted_value_string)
                            algorithm_timings_array.append(float_value)
                except:
                    algorithm_timings_array = False

                if not algorithm_timings_array:
                    continue

                number_of_algorithm_timings = len(algorithm_timings_array)
                logger.info(
                    'algorithm timings count - %s has %s timings' % (
                        algorithm, str(number_of_algorithm_timings)))

                if number_of_algorithm_timings == 0:
                    continue

                try:
                    _sum_of_algorithm_timings = sum(algorithm_timings_array)
                except:
                    logger.error("sum error: " + traceback.format_exc())
                    _sum_of_algorithm_timings = round(0.0, 6)
                    logger.error('error - sum_of_algorithm_timings - %s' % (algorithm))
                    continue

                sum_of_algorithm_timings = round(_sum_of_algorithm_timings, 6)
                # logger.info('sum_of_algorithm_timings - %s - %.16f seconds' % (algorithm, sum_of_algorithm_timings))

                try:
                    _median_algorithm_timing = determine_median(algorithm_timings_array)
                except:
                    _median_algorithm_timing = round(0.0, 6)
                    logger.error('error - _median_algorithm_timing - %s' % (algorithm))
                    continue
                median_algorithm_timing = round(_median_algorithm_timing, 6)
                # logger.info('median_algorithm_timing - %s - %.16f seconds' % (algorithm, median_algorithm_timing))

                logger.info(
                    'algorithm timing - %s - total: %.6f - median: %.6f' % (
                        algorithm, sum_of_algorithm_timings,
                        median_algorithm_timing))
                use_namespace = skyline_app_graphite_namespace + '.algorithm_breakdown.' + algorithm
                send_metric_name = use_namespace + '.timing.times_run'
                send_graphite_metric(skyline_app, send_metric_name, str(number_of_algorithm_timings))
                send_metric_name = use_namespace + '.timing.total_time'
                send_graphite_metric(skyline_app, send_metric_name, str(sum_of_algorithm_timings))
                send_metric_name = use_namespace + '.timing.median_time'
                send_graphite_metric(skyline_app, send_metric_name, str(median_algorithm_timing))

            run_time = time() - now
            total_metrics = str(len(unique_metrics))
            total_analyzed = str(len(unique_metrics) - sum(exceptions.values()))
            total_anomalies = str(len(self.anomalous_metrics))

            # Log progress
            logger.info('seconds to run    :: %.2f' % run_time)
            logger.info('total metrics     :: %s' % total_metrics)
            logger.info('total analyzed    :: %s' % total_analyzed)
            logger.info('total anomalies   :: %s' % total_anomalies)
            logger.info('exception stats   :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            graphite_run_time = '%.2f' % run_time
            send_metric_name = skyline_app_graphite_namespace + '.run_time'
            send_graphite_metric(skyline_app, send_metric_name, graphite_run_time)

            send_metric_name = skyline_app_graphite_namespace + '.total_analyzed'
            send_graphite_metric(skyline_app, send_metric_name, total_analyzed)

            send_metric_name = skyline_app_graphite_namespace + '.total_anomalies'
            send_graphite_metric(skyline_app, send_metric_name, total_anomalies)

            send_metric_name = skyline_app_graphite_namespace + '.total_metrics'
            send_graphite_metric(skyline_app, send_metric_name, total_metrics)
            for key, value in exceptions.items():
                send_metric_name = '%s.exceptions.%s' % (skyline_app_graphite_namespace, key)
                send_graphite_metric(skyline_app, send_metric_name, str(value))
            for key, value in anomaly_breakdown.items():
                send_metric_name = '%s.anomaly_breakdown.%s' % (skyline_app_graphite_namespace, key)
                send_graphite_metric(skyline_app, send_metric_name, str(value))

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration   :: %.2f' % time_human)
                send_metric_name = skyline_app_graphite_namespace + '.duration'
                send_graphite_metric(skyline_app, send_metric_name, str(time_human))

                send_metric_name = skyline_app_graphite_namespace + '.projected'
                send_graphite_metric(skyline_app, send_metric_name, str(projected))

            # Reset counters
            self.anomalous_metrics[:] = []

            # Sleep if it went too fast
            # if time() - now < 5:
            #    logger.info('sleeping due to low run time...')
            #    sleep(10)
            # @modified 20160504 - @earthgecko - development internal ref #1338, #1340
            # Etsy's original used a fixed value of 5 seconds, which is not
            # very efficient for installations where 100s of 1000s of metrics
            # are being analyzed.  Because Analyzer only slept when a run took
            # less than 5 seconds, it would analyse a few 1000 metrics in 9
            # seconds and then immediately do so again and again within the
            # same minute, running over the same metrics multiple times and
            # always working.  The ANALYZER_OPTIMUM_RUN_DURATION setting was
            # therefore added to let this self optimise in cases where skyline
            # is NOT deployed to analyze 100s of 1000s of metrics - i.e. to
            # optimise performance for deployments in the few 1000s of metrics
            # and 60 second resolution area, e.g. smaller and local
            # deployments.
            process_runtime = time() - now
            analyzer_optimum_run_duration = settings.ANALYZER_OPTIMUM_RUN_DURATION
            if process_runtime < analyzer_optimum_run_duration:
                sleep_for = (analyzer_optimum_run_duration - process_runtime)
                logger.info('sleeping for %.2f seconds due to low run time...' % sleep_for)
                sleep(sleep_for)
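
spin_process itself is not shown in these examples; on the worker side the
count-file mechanism referenced above reduces to appending one line per
algorithm invocation, which the parent loop then reads back and aggregates
after the processes are reaped. A minimal sketch of the assumed writer side
(path, algorithm name and timing value are all illustrative):

from timeit import default_timer as timer

algorithm = 'median_absolute_deviation'  # illustrative algorithm name
algorithm_timings_file = '/opt/skyline/tmp/analyzer.' + algorithm + '.timings'

start = timer()
# ... run the algorithm against the timeseries here ...
duration = timer() - start

# One small append per run: no multiprocessing.Value or locking is needed,
# the parent simply sums and medians the lines after the workers exit.
with open(algorithm_timings_file, 'a') as f:
    f.write('%.6f\n' % duration)
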
Example #6
    def run(self):
        """
        Called when the process initializes.
        """
        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Discover unique metrics
            unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info('WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                p.start()

            # Wait for the spawned processes to finish
            for p in pids:
                p.join()

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.anomaly_breakdown_q.get_nowait()
                    if key not in anomaly_breakdown.keys():
                        anomaly_breakdown[key] = value
                    else:
                        anomaly_breakdown[key] += value
                except Empty:
                    break

            while 1:
                try:
                    key, value = self.exceptions_q.get_nowait()
                    if key not in exceptions.keys():
                        exceptions[key] = value
                    else:
                        exceptions[key] += value
                except Empty:
                    break

            # Send alerts
            if settings.ENABLE_ALERTS:
                for alert in settings.ALERTS:
                    for metric in self.anomalous_metrics:
                        if alert[0] in metric[1]:
                            cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
                            try:
                                last_alert = self.redis_conn.get(cache_key)
                                if not last_alert:
                                    self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                    trigger_alert(alert, metric)

                            except Exception as e:
                                logger.error("couldn't send alert: %s" % e)

            # Write anomalous_metrics to static webapp directory
            filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP))
            with open(filename, 'w') as fh:
                # Make it JSONP with a handle_data() function
                anomalous_metrics = list(self.anomalous_metrics)
                anomalous_metrics.sort(key=operator.itemgetter(1))
                fh.write('handle_data(%s)' % anomalous_metrics)

            # Log progress
            logger.info('seconds to run    :: %.2f' % (time() - now))
            logger.info('total metrics     :: %d' % len(unique_metrics))
            logger.info('total analyzed    :: %d' % (len(unique_metrics) - sum(exceptions.values())))
            logger.info('total anomalies   :: %d' % len(self.anomalous_metrics))
            logger.info('exception stats   :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            self.send_graphite_metric('skyline.analyzer.run_time', '%.2f' % (time() - now))
            self.send_graphite_metric('skyline.analyzer.total_analyzed', '%.2f' % (len(unique_metrics) - sum(exceptions.values())))

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration   :: %.2f' % time_human)
                self.send_graphite_metric('skyline.analyzer.duration', '%.2f' % time_human)
                self.send_graphite_metric('skyline.analyzer.projected', '%.2f' % projected)

            # Reset counters
            self.anomalous_metrics[:] = []

            # Sleep if it went too fast
            if time() - now < 5:
                logger.info('sleeping due to low run time...')
                sleep(10)
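
The last_alert cache key in the alert loop above is what rate-limits alerts:
an alert for a given (alerter, metric) pair is only sent when no unexpired key
exists, and setex gives the key a TTL of alert[2], the expiration seconds from
the settings.ALERTS tuple. A standalone sketch of the same pattern, assuming a
local Redis socket and illustrative values:

from msgpack import packb
from redis import StrictRedis

redis_conn = StrictRedis(unix_socket_path='/tmp/redis.sock')  # path illustrative

alert = ('skyline', 'smtp', 1800)              # (pattern, alerter, expiration)
metric = (3.2, 'skyline.horizon.queue_size')   # (datapoint, metric name)

cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
if not redis_conn.get(cache_key):
    # No unexpired key: record the datapoint and alert; further anomalies on
    # this metric are suppressed until the key expires in alert[2] seconds.
    redis_conn.setex(cache_key, alert[2], packb(metric[0]))
    # trigger_alert(alert, metric)  # the smtp alerter would fire here
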
Example #7
    def run(self):
        """
        Called when the process initializes.
        """
        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Discover unique metrics
            unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Reset boundary_metrics
            boundary_metrics = []

            # Build boundary metrics
            for metric_name in unique_metrics:
                for metric in BOUNDARY_METRICS:
                    CHECK_MATCH_PATTERN = metric[0]
                    check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
                    base_name = metric_name.replace(FULL_NAMESPACE, '', 1)
                    pattern_match = check_match_pattern.match(base_name)
                    if pattern_match:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info("debug - boundary metric - pattern MATCHED - " + metric[0] + " | " + base_name)
                        boundary_metrics.append([metric_name, metric[1]])

            if ENABLE_BOUNDARY_DEBUG:
                logger.info("debug - boundary metrics - " + str(boundary_metrics))

            if len(boundary_metrics) == 0:
                logger.info('no boundary metrics matched - check BOUNDARY_METRICS in settings - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.BOUNDARY_PROCESSES + 1):
                if i > len(boundary_metrics):
                    logger.info('WARNING: skyline boundary is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, boundary_metrics))
                pids.append(p)
                p.start()

            # Wait for the spawned processes to finish
            for p in pids:
                p.join()

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.anomaly_breakdown_q.get_nowait()
                    if key not in anomaly_breakdown.keys():
                        anomaly_breakdown[key] = value
                    else:
                        anomaly_breakdown[key] += value
                except Empty:
                    break

            while 1:
                try:
                    key, value = self.exceptions_q.get_nowait()
                    if key not in exceptions.keys():
                        exceptions[key] = value
                    else:
                        exceptions[key] += value
                except Empty:
                    break

            # Send alerts
            if settings.BOUNDARY_ENABLE_ALERTS:
                for anomalous_metric in self.anomalous_metrics:
                    datapoint = str(anomalous_metric[0])
                    metric_name = anomalous_metric[1]
                    base_name = metric_name.replace(FULL_NAMESPACE, '', 1)
                    expiration_time = str(anomalous_metric[2])
                    metric_trigger = str(anomalous_metric[5])
                    alert_threshold = int(anomalous_metric[6])
                    metric_alerters = anomalous_metric[7]
                    algorithm = anomalous_metric[8]
                    if ENABLE_BOUNDARY_DEBUG:
                        logger.info("debug - anomalous_metric - " + str(anomalous_metric))

                    # Determine how many times the anomaly has been seen if the
                    # ALERT_THRESHOLD is set to > 1 and create a cache key in
                    # redis to keep count so that alert_threshold can be honoured
                    times_seen = 0  # ensure defined even if the count lookup below fails
                    if alert_threshold == 0:
                        times_seen = 1
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info("debug - alert_threshold - " + str(alert_threshold))

                    if alert_threshold == 1:
                        times_seen = 1
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info("debug - alert_threshold - " + str(alert_threshold))

                    if alert_threshold > 1:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info("debug - alert_threshold - " + str(alert_threshold))
                        anomaly_cache_key_count_set = False
                        anomaly_cache_key_expiration_time = (int(alert_threshold) + 1) * 60
                        anomaly_cache_key = 'anomaly_seen.%s.%s' % (algorithm, base_name)
                        try:
                            anomaly_cache_key_count = self.redis_conn.get(anomaly_cache_key)
                            if not anomaly_cache_key_count:
                                try:
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info("debug - redis no anomaly_cache_key - " + str(anomaly_cache_key))
                                    times_seen = 1
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info("debug - redis setex anomaly_cache_key - " + str(anomaly_cache_key))
                                    self.redis_conn.setex(anomaly_cache_key, anomaly_cache_key_expiration_time, packb(int(times_seen)))
                                    logger.info('set anomaly seen key :: %s seen %s' % (anomaly_cache_key, str(times_seen)))
                                except Exception as e:
                                    logger.error('redis setex failed :: %s' % str(anomaly_cache_key))
                                    logger.error("couldn't set key: %s" % e)
                            else:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug - redis anomaly_cache_key retrieved OK - " + str(anomaly_cache_key))
                                anomaly_cache_key_count_set = True
                        except:
                            if ENABLE_BOUNDARY_DEBUG:
                                logger.info("debug - redis failed - anomaly_cache_key retrieval failed - " + str(anomaly_cache_key))
                            anomaly_cache_key_count_set = False

                        if anomaly_cache_key_count_set:
                            unpacker = Unpacker(use_list=False)
                            unpacker.feed(anomaly_cache_key_count)
                            raw_times_seen = list(unpacker)
                            times_seen = int(raw_times_seen[0]) + 1
                            try:
                                self.redis_conn.setex(anomaly_cache_key, anomaly_cache_key_expiration_time, packb(int(times_seen)))
                                logger.info('set anomaly seen key :: %s seen %s' % (anomaly_cache_key, str(times_seen)))
                            except:
                                times_seen = 1
                                logger.error('set anomaly seen key failed :: %s seen %s' % (anomaly_cache_key, str(times_seen)))

                    # Alert the alerters if times_seen >= alert_threshold
                    if times_seen >= alert_threshold:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info("debug - times_seen %s is greater than or equal to alert_threshold %s" % (str(times_seen), str(alert_threshold)))
                        for alerter in metric_alerters.split("|"):
                            # Determine alerter limits
                            send_alert = False
                            alerts_sent = 0
                            if ENABLE_BOUNDARY_DEBUG:
                                logger.info("debug - checking alerter - %s" % alerter)
                            try:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug - determining alerter_expiration_time for settings")
                                alerter_expiration_time_setting = settings.BOUNDARY_ALERTER_OPTS['alerter_expiration_time'][alerter]
                                alerter_expiration_time = int(alerter_expiration_time_setting)
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug - determined alerter_expiration_time from settings - %s" % str(alerter_expiration_time))
                            except:
                                # Set an arbitrary expiry time if not set
                                alerter_expiration_time = 160
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug - could not determine alerter_expiration_time from settings")
                            try:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug - determining alerter_limit from settings")
                                alerter_limit_setting = settings.BOUNDARY_ALERTER_OPTS['alerter_limit'][alerter]
                                alerter_limit = int(alerter_limit_setting)
                                alerter_limit_set = True
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug - determined alerter_limit from settings - %s" % str(alerter_limit))
                            except:
                                alerter_limit_set = False
                                send_alert = True
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug - could not determine alerter_limit from settings")

                            # If the alerter_limit is set determine how many
                            # alerts the alerter has sent
                            if alerter_limit_set:
                                alerter_sent_count_key = 'alerts_sent.%s' % (alerter)
                                try:
                                    alerter_sent_count_key_data = self.redis_conn.get(alerter_sent_count_key)
                                    if not alerter_sent_count_key_data:
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info("debug - redis no alerter key, no alerts sent for - " + str(alerter_sent_count_key))
                                        alerts_sent = 0
                                        send_alert = True
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info("debug - alerts_sent set to %s" % str(alerts_sent))
                                            logger.info("debug - send_alert set to %s" % str(sent_alert))
                                    else:
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info("debug - redis alerter key retrieved, unpacking" + str(alerter_sent_count_key))
                                        unpacker = Unpacker(use_list=False)
                                        unpacker.feed(alerter_sent_count_key_data)
                                        raw_alerts_sent = list(unpacker)
                                        alerts_sent = int(raw_alerts_sent[0])
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info("debug - alerter %s alerts sent %s " % (str(alerter), str(alerts_sent)))
                                except:
                                    logger.info("No key set - %s" % alerter_sent_count_key)
                                    alerts_sent = 0
                                    send_alert = True
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info("debug - alerts_sent set to %s" % str(alerts_sent))
                                        logger.info("debug - send_alert set to %s" % str(send_alert))

                                if alerts_sent < alerter_limit:
                                    send_alert = True
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info("debug - alerts_sent %s is less than alerter_limit %s" % (str(alerts_sent), str(alerter_limit)))
                                        logger.info("debug - send_alert set to %s" % str(send_alert))

                            # Send alert
                            alerter_alert_sent = False
                            if send_alert:
                                cache_key = 'last_alert.boundary.%s.%s.%s' % (alerter, base_name, algorithm)
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug - checking cache_key - %s" % cache_key)
                                try:
                                    last_alert = self.redis_conn.get(cache_key)
                                    if not last_alert:
                                        try:
                                            self.redis_conn.setex(cache_key, int(anomalous_metric[2]), packb(int(anomalous_metric[0])))
                                            if ENABLE_BOUNDARY_DEBUG:
                                                logger.info('debug - key setex OK - %s' % (cache_key))
                                            trigger_alert(alerter, datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                            logger.info('alert sent :: %s - %s - via %s - %s' % (base_name, datapoint, alerter, algorithm))
                                            trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                            logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))
                                            alerter_alert_sent = True
                                        except Exception as e:
                                            logger.error('alert failed :: %s - %s - via %s - %s' % (base_name, datapoint, alerter, algorithm))
                                            logger.error("couldn't send alert: %s" % str(e))
                                            trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                    else:
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info("debug - cache_key exists not alerting via %s for %s is less than alerter_limit %s" % (alerter, cache_key))
                                        trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                        logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))
                                except:
                                    trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                    logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))
                            else:
                                trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))

                            # Update the alerts sent for the alerter cache key,
                            # to allow for alert limiting
                            if alerter_alert_sent and alerter_limit_set:
                                try:
                                    alerter_sent_count_key = 'alerts_sent.%s' % (alerter)
                                    new_alerts_sent = int(alerts_sent) + 1
                                    self.redis_conn.setex(alerter_sent_count_key, alerter_expiration_time, packb(int(new_alerts_sent)))
                                    logger.info('set %s - %s' % (alerter_sent_count_key, str(new_alerts_sent)))
                                except:
                                    logger.error('failed to set %s - %s' % (alerter_sent_count_key, str(new_alerts_sent)))

                    else:
                        # Always alert to syslog, even if alert_threshold is not
                        # breached or if send_alert is not True
                        trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                        logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))

            # Write anomalous_metrics to static webapp directory
            if len(self.anomalous_metrics) > 0:
                filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP))
                with open(filename, 'w') as fh:
                    # Make it JSONP with a handle_data() function
                    anomalous_metrics = list(self.anomalous_metrics)
                    anomalous_metrics.sort(key=operator.itemgetter(1))
                    fh.write('handle_data(%s)' % anomalous_metrics)

            # Log progress
            logger.info('seconds to run    :: %.2f' % (time() - now))
            logger.info('total metrics     :: %d' % len(boundary_metrics))
            logger.info('total analyzed    :: %d' % (len(boundary_metrics) - sum(exceptions.values())))
            logger.info('total anomalies   :: %d' % len(self.anomalous_metrics))
            logger.info('exception stats   :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'run_time', '%.2f' % (time() - now))
            self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'total_analyzed', '%.2f' % (len(boundary_metrics) - sum(exceptions.values())))
            self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'total_anomalies', '%d' % len(self.anomalous_metrics))
            self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'total_metrics', '%d' % len(boundary_metrics))
            for key, value in exceptions.items():
                send_metric = 'skyline.boundary.' + SERVER_METRIC_PATH + 'exceptions.%s' % key
                self.send_graphite_metric(send_metric, '%d' % value)
            for key, value in anomaly_breakdown.items():
                send_metric = 'skyline.boundary.' + SERVER_METRIC_PATH + 'anomaly_breakdown.%s' % key
                self.send_graphite_metric(send_metric, '%d' % value)

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration   :: %.2f' % time_human)
                self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'duration', '%.2f' % time_human)
                self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'projected', '%.2f' % projected)

            # Reset counters
            self.anomalous_metrics[:] = []

            # Only run once per minute
            seconds_to_run = int((time() - now))
            if seconds_to_run < 60:
                sleep_for_seconds = 60 - seconds_to_run
            else:
                sleep_for_seconds = 0
            if sleep_for_seconds > 0:
                logger.info('sleeping for %s seconds' % sleep_for_seconds)
                sleep(sleep_for_seconds)
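
The run loop above reports its timings through a send_graphite_metric helper that is not part of this excerpt. A minimal sketch of such a helper, assuming Graphite's plaintext protocol on port 2003 (the same protocol later examples drive with echo ... | nc) and the settings import used throughout these examples:

import socket
from time import time


def send_graphite_metric(name, value):
    # Sketch only: send one datapoint to GRAPHITE_HOST:2003 using
    # Graphite's plaintext protocol - "metric value timestamp\n"
    if settings.GRAPHITE_HOST != '':
        host = settings.GRAPHITE_HOST.replace('http://', '')
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(3)
        try:
            sock.connect((host, 2003))
            sock.sendall('%s %s %i\n' % (name, value, time()))
        finally:
            sock.close()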
Exemplo n.º 8
0
    syslog_enabled = False

print 'Verifying alerts for: "' + options.metric + '"'

# Send alerts
context = 'Analyzer'
if alerts_enabled:
    for alert in alerts:
        if alert[0] in options.metric:
            print '    Testing Analyzer alerting - against "' + alert[0] + '" to send via ' + alert[1] + "...triggered"
            if options.trigger:
                print '    Testing Analyzer alerting - with options.trigger - ' + str(options.trigger)
                metric = (0, options.metric)
                trigger_alert(alert, metric, context)
                if syslog_enabled:
                    print '    Testing Analyzer alerting - against "' + alert[0] + '" to send via syslog ' + "...triggered"
                    alert = (alert[0], 'syslog')
                    trigger_alert(alert, metric, context)
        else:
            print '    Testing Analyzer alerting - against "' + alert[0] + '" to send via ' + alert[1] + "..."
else:
    print 'Alerts are disabled'

# Mirage alerts
context = 'Mirage'
try:
    mirage_enabled = settings.ENABLE_MIRAGE
Exemplo n.º 9
0
    def run(self):
        """
        Called when the process initializes.
        """
        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error(
                    'skyline can\'t connect to redis at socket path %s' %
                    settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(
                    unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Discover unique metrics
            unique_metrics = list(
                self.redis_conn.smembers(settings.FULL_NAMESPACE +
                                         'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info(
                    'no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info(
                        'WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                p.start()

            # Send wait signal to zombie processes
            for p in pids:
                p.join()

            # Send alerts
            if settings.ENABLE_ALERTS:
                for alert in settings.ALERTS:
                    for metric in self.anomalous_metrics:
                        if alert[0] in metric[1]:
                            cache_key = 'last_alert.%s.%s' % (alert[1],
                                                              metric[1])
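                            # alert[2] is the alert expiration in seconds;
                            # while this key exists the metric is not
                            # re-alerted on, which rate limits alerting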
                            try:
                                last_alert = self.redis_conn.get(cache_key)
                                if not last_alert:
                                    self.redis_conn.setex(
                                        cache_key, alert[2], packb(metric[0]))
                                    trigger_alert(alert, metric)

                            except Exception as e:
                                logger.error("couldn't send alert: %s" % e)

            # Write anomalous_metrics to static webapp directory
            filename = path.abspath(
                path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP))
            with open(filename, 'w') as fh:
                # Make it JSONP with a handle_data() function
                anomalous_metrics = list(self.anomalous_metrics)
                anomalous_metrics.sort(key=operator.itemgetter(1))
                fh.write('handle_data(%s)' % anomalous_metrics)

            # Log progress
            logger.info('seconds to run    :: %.2f' % (time() - now))
            logger.info('total metrics     :: %d' % len(unique_metrics))
            logger.info('total analyzed    :: %d' %
                        (len(unique_metrics) - sum(self.exceptions.values())))
            logger.info('total anomalies   :: %d' %
                        len(self.anomalous_metrics))
            logger.info('exception stats   :: %s' % self.exceptions)
            logger.info('anomaly breakdown :: %s' % self.anomaly_breakdown)

            # Log to Graphite
            if settings.GRAPHITE_HOST != '':
                host = settings.GRAPHITE_HOST.replace('http://', '')
                system(
                    'echo skyline.analyzer.run_time %.2f %s | nc -w 3 %s 2003'
                    % ((time() - now), now, host))
                system(
                    'echo skyline.analyzer.total_analyzed %d %s | nc -w 3 %s 2003'
                    % ((len(unique_metrics) - sum(self.exceptions.values())),
                       now, host))

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE +
                                             settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration   :: %.2f' % time_human)
                if settings.GRAPHITE_HOST != '':
                    host = settings.GRAPHITE_HOST.replace('http://', '')
                    system(
                        'echo skyline.analyzer.duration %.2f %s | nc -w 3 %s 2003'
                        % (time_human, now, host))
                    system(
                        'echo skyline.analyzer.projected %.2f %s | nc -w 3 %s 2003'
                        % (projected, now, host))

            # Reset counters
            self.anomalous_metrics[:] = []
            self.exceptions = Manager().dict()
            self.anomaly_breakdown = Manager().dict()

            # Sleep if it went too fast
            if time() - now < 5:
                logger.info('sleeping due to low run time...')
                sleep(10)
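
The alert loop above indexes each alert tuple by position: alert[0] is the metric namespace to match (by substring here, by regex in example 14), alert[1] the alerter strategy and alert[2] the expiration in seconds used as the setex TTL on the last_alert key. A hedged illustration of what such tuples look like - hypothetical values, the real ones live in settings.py:

# Hypothetical ALERTS tuples for illustration only
ALERTS = (
    ('skyline', 'smtp', 1800),
    ('stats.web', 'pagerduty', 3600),
)

for alert in ALERTS:
    namespace, strategy, expiration = alert[0], alert[1], alert[2]
    print 'alert %s metrics via %s, re-alert after %d seconds' % (
        namespace, strategy, expiration)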
Exemplo n.º 10
0
    "-m",
    "--metric",
    dest="metric",
    default='skyline.horizon.queue_size',
    help="Pass the metric to test (default is skyline.horizon.queue_size)")

(options, args) = parser.parse_args()

try:
    alerts_enabled = settings.ENABLE_ALERTS
    alerts = settings.ALERTS
except:
    print "Exception: Check your settings file for the existence of ENABLE_ALERTS and ALERTS"
    sys.exit()

print 'Verifying alerts for: "' + options.metric + '"'

# Send alerts
if alerts_enabled:
    for alert in alerts:
        if alert[0] in options.metric:
            print '    Testing against "' + alert[0] + '" to send via ' + alert[1] + "...triggered"
            if options.trigger:
                trigger_alert(alert, options.metric)
        else:
            print '    Testing against "' + alert[0] + '" to send via ' + alert[1] + "..."
else:
    print 'Alerts are disabled'
Exemplo n.º 11
0
    def run(self):
        """
        Called when the process initializes.
        """
        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error(
                    'skyline can\'t connect to redis at socket path %s' %
                    settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(
                    unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Discover unique metrics
            unique_metrics = list(
                self.redis_conn.smembers(settings.FULL_NAMESPACE +
                                         'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info(
                    'no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Reset boundary_metrics
            boundary_metrics = []

            # Build boundary metrics
            for metric_name in unique_metrics:
                for metric in BOUNDARY_METRICS:
                    CHECK_MATCH_PATTERN = metric[0]
                    check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
                    base_name = metric_name.replace(FULL_NAMESPACE, '', 1)
                    pattern_match = check_match_pattern.match(base_name)
                    if pattern_match:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info(
                                "debug - boundary metric - pattern MATCHED - "
                                + metric[0] + " | " + base_name)
                        boundary_metrics.append([metric_name, metric[1]])

            if ENABLE_BOUNDARY_DEBUG:
                logger.info("debug - boundary metrics - " +
                            str(boundary_metrics))

            if len(boundary_metrics) == 0:
                logger.info(
                    'no boundary metrics found - check BOUNDARY_METRICS in settings')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.BOUNDARY_PROCESSES + 1):
                if i > len(boundary_metrics):
                    logger.info(
                        'WARNING: skyline boundary is set for more cores than needed.'
                    )
                    break

                p = Process(target=self.spin_process,
                            args=(i, boundary_metrics))
                pids.append(p)
                p.start()

            # Send wait signal to zombie processes
            for p in pids:
                p.join()

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.anomaly_breakdown_q.get_nowait()
                    if key not in anomaly_breakdown.keys():
                        anomaly_breakdown[key] = value
                    else:
                        anomaly_breakdown[key] += value
                except Empty:
                    break

            while 1:
                try:
                    key, value = self.exceptions_q.get_nowait()
                    if key not in exceptions.keys():
                        exceptions[key] = value
                    else:
                        exceptions[key] += value
                except Empty:
                    break

            # Send alerts
            if settings.BOUNDARY_ENABLE_ALERTS:
                for anomalous_metric in self.anomalous_metrics:
                    datapoint = str(anomalous_metric[0])
                    metric_name = anomalous_metric[1]
                    base_name = metric_name.replace(FULL_NAMESPACE, '', 1)
                    expiration_time = str(anomalous_metric[2])
                    metric_trigger = str(anomalous_metric[5])
                    alert_threshold = int(anomalous_metric[6])
                    metric_alerters = anomalous_metric[7]
                    algorithm = anomalous_metric[8]
                    if ENABLE_BOUNDARY_DEBUG:
                        logger.info("debug - anomalous_metric - " +
                                    str(anomalous_metric))

                    # Determine how many times the anomaly has been seen if the
                    # ALERT_THRESHOLD is set to > 1, and create a cache key in
                    # redis to keep count so that alert_threshold can be honored
                    if alert_threshold == 0:
                        times_seen = 1
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info("debug - alert_threshold - " +
                                        str(alert_threshold))

                    if alert_threshold == 1:
                        times_seen = 1
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info("debug - alert_threshold - " +
                                        str(alert_threshold))

                    if alert_threshold > 1:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info("debug - alert_threshold - " +
                                        str(alert_threshold))
                        anomaly_cache_key_count_set = False
                        anomaly_cache_key_expiration_time = (
                            int(alert_threshold) + 1) * 60
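                        # let the seen-count key live one minute longer than
                        # alert_threshold minutes so the count survives
                        # between once-per-minute runs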
                        anomaly_cache_key = 'anomaly_seen.%s.%s' % (algorithm,
                                                                    base_name)
                        try:
                            anomaly_cache_key_count = self.redis_conn.get(
                                anomaly_cache_key)
                            if not anomaly_cache_key_count:
                                try:
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info(
                                            "debug - redis no anomaly_cache_key - "
                                            + str(anomaly_cache_key))
                                    times_seen = 1
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info(
                                            "debug - redis setex anomaly_cache_key - "
                                            + str(anomaly_cache_key))
                                    self.redis_conn.setex(
                                        anomaly_cache_key,
                                        anomaly_cache_key_expiration_time,
                                        packb(int(times_seen)))
                                    logger.info(
                                        'set anomaly seen key :: %s seen %s' %
                                        (anomaly_cache_key, str(times_seen)))
                                except Exception as e:
                                    logger.error('redis setex failed :: %s' %
                                                 str(anomaly_cache_key))
                                    logger.error("couldn't set key: %s" % e)
                            else:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info(
                                        "debug - redis anomaly_cache_key retrieved OK - "
                                        + str(anomaly_cache_key))
                                anomaly_cache_key_count_set = True
                        except:
                            if ENABLE_BOUNDARY_DEBUG:
                                logger.info(
                                    "debug - redis failed - anomaly_cache_key retrieval failed - "
                                    + str(anomaly_cache_key))
                            anomaly_cache_key_count_set = False

                        if anomaly_cache_key_count_set:
                            unpacker = Unpacker(use_list=False)
                            unpacker.feed(anomaly_cache_key_count)
                            raw_times_seen = list(unpacker)
                            times_seen = int(raw_times_seen[0]) + 1
                            try:
                                self.redis_conn.setex(
                                    anomaly_cache_key,
                                    anomaly_cache_key_expiration_time,
                                    packb(int(times_seen)))
                                logger.info(
                                    'set anomaly seen key :: %s seen %s' %
                                    (anomaly_cache_key, str(times_seen)))
                            except:
                                times_seen = 1
                                logger.error(
                                    'set anomaly seen key failed :: %s seen %s'
                                    % (anomaly_cache_key, str(times_seen)))

                    # Alert the alerters if times_seen >= alert_threshold
                    if times_seen >= alert_threshold:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info(
                                "debug - times_seen %s is greater than or equal to alert_threshold %s"
                                % (str(times_seen), str(alert_threshold)))
                        for alerter in metric_alerters.split("|"):
                            # Determine alerter limits
                            send_alert = False
                            alerts_sent = 0
                            if ENABLE_BOUNDARY_DEBUG:
                                logger.info("debug - checking alerter - %s" %
                                            alerter)
                            try:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info(
                                        "debug - determining alerter_expiration_time for settings"
                                    )
                                alerter_expiration_time_setting = settings.BOUNDARY_ALERTER_OPTS[
                                    'alerter_expiration_time'][alerter]
                                alerter_expiration_time = int(
                                    alerter_expiration_time_setting)
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info(
                                        "debug - determined alerter_expiration_time from settings - %s"
                                        % str(alerter_expiration_time))
                            except:
                                # Set an arbitrary expiry time if not set
                                alerter_expiration_time = 160
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info(
                                        "debug - could not determine alerter_expiration_time from settings"
                                    )
                            try:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info(
                                        "debug - determining alerter_limit from settings"
                                    )
                                alerter_limit_setting = settings.BOUNDARY_ALERTER_OPTS[
                                    'alerter_limit'][alerter]
                                alerter_limit = int(alerter_limit_setting)
                                alerter_limit_set = True
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info(
                                        "debug - determined alerter_limit from settings - %s"
                                        % str(alerter_limit))
                            except:
                                alerter_limit_set = False
                                send_alert = True
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info(
                                        "debug - could not determine alerter_limit from settings"
                                    )

                            # If the alerter_limit is set determine how many
                            # alerts the alerter has sent
                            if alerter_limit_set:
                                alerter_sent_count_key = 'alerts_sent.%s' % (
                                    alerter)
                                try:
                                    alerter_sent_count_key_data = self.redis_conn.get(
                                        alerter_sent_count_key)
                                    if not alerter_sent_count_key_data:
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info(
                                                "debug - redis no alerter key, no alerts sent for - "
                                                + str(alerter_sent_count_key))
                                        alerts_sent = 0
                                        send_alert = True
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info(
                                                "debug - alerts_sent set to %s"
                                                % str(alerts_sent))
                                            logger.info(
                                                "debug - send_alert set to %s"
                                                % str(send_alert))
                                    else:
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info(
                                                "debug - redis alerter key retrieved, unpacking"
                                                + str(alerter_sent_count_key))
                                        unpacker = Unpacker(use_list=False)
                                        unpacker.feed(
                                            alerter_sent_count_key_data)
                                        raw_alerts_sent = list(unpacker)
                                        alerts_sent = int(raw_alerts_sent[0])
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info(
                                                "debug - alerter %s alerts sent %s "
                                                % (str(alerter),
                                                   str(alerts_sent)))
                                except:
                                    logger.info("No key set - %s" %
                                                alerter_sent_count_key)
                                    alerts_sent = 0
                                    send_alert = True
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info(
                                            "debug - alerts_sent set to %s" %
                                            str(alerts_sent))
                                        logger.info(
                                            "debug - send_alert set to %s" %
                                            str(send_alert))

                                if alerts_sent < alerter_limit:
                                    send_alert = True
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info(
                                            "debug - alerts_sent %s is less than alerter_limit %s"
                                            % (str(alerts_sent),
                                               str(alerter_limit)))
                                        logger.info(
                                            "debug - send_alert set to %s" %
                                            str(send_alert))

                            # Send alert
                            alerter_alert_sent = False
                            if send_alert:
                                cache_key = 'last_alert.boundary.%s.%s.%s' % (
                                    alerter, base_name, algorithm)
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info(
                                        "debug - checking cache_key - %s" %
                                        cache_key)
                                try:
                                    last_alert = self.redis_conn.get(cache_key)
                                    if not last_alert:
                                        try:
                                            self.redis_conn.setex(
                                                cache_key,
                                                int(anomalous_metric[2]),
                                                packb(int(
                                                    anomalous_metric[0])))
                                            if ENABLE_BOUNDARY_DEBUG:
                                                logger.info(
                                                    'debug - key setex OK - %s'
                                                    % (cache_key))
                                            trigger_alert(
                                                alerter, datapoint, base_name,
                                                expiration_time,
                                                metric_trigger, algorithm)
                                            logger.info(
                                                'alert sent :: %s - %s - via %s - %s'
                                                % (base_name, datapoint,
                                                   alerter, algorithm))
                                            trigger_alert(
                                                "syslog", datapoint, base_name,
                                                expiration_time,
                                                metric_trigger, algorithm)
                                            logger.info(
                                                'alert sent :: %s - %s - via syslog - %s'
                                                % (base_name, datapoint,
                                                   algorithm))
                                            alerter_alert_sent = True
                                        except Exception as e:
                                            logger.error(
                                                'alert failed :: %s - %s - via %s - %s'
                                                % (base_name, datapoint,
                                                   alerter, algorithm))
                                            logger.error(
                                                "couldn't send alert: %s" %
                                                str(e))
                                            trigger_alert(
                                                "syslog", datapoint, base_name,
                                                expiration_time,
                                                metric_trigger, algorithm)
                                    else:
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info(
                                                "debug - cache_key exists - not alerting via %s for %s"
                                                % (alerter, cache_key))
                                        trigger_alert("syslog", datapoint,
                                                      base_name,
                                                      expiration_time,
                                                      metric_trigger,
                                                      algorithm)
                                        logger.info(
                                            'alert sent :: %s - %s - via syslog - %s'
                                            %
                                            (base_name, datapoint, algorithm))
                                except:
                                    trigger_alert("syslog", datapoint,
                                                  base_name, expiration_time,
                                                  metric_trigger, algorithm)
                                    logger.info(
                                        'alert sent :: %s - %s - via syslog - %s'
                                        % (base_name, datapoint, algorithm))
                            else:
                                trigger_alert("syslog", datapoint, base_name,
                                              expiration_time, metric_trigger,
                                              algorithm)
                                logger.info(
                                    'alert sent :: %s - %s - via syslog - %s' %
                                    (base_name, datapoint, algorithm))

                            # Update the alerts sent for the alerter cache key,
                            # to allow for alert limiting
                            if alerter_alert_sent and alerter_limit_set:
                                try:
                                    alerter_sent_count_key = 'alerts_sent.%s' % (
                                        alerter)
                                    new_alerts_sent = int(alerts_sent) + 1
                                    self.redis_conn.setex(
                                        alerter_sent_count_key,
                                        alerter_expiration_time,
                                        packb(int(new_alerts_sent)))
                                    logger.info('set %s - %s' %
                                                (alerter_sent_count_key,
                                                 str(new_alerts_sent)))
                                except:
                                    logger.error('failed to set %s - %s' %
                                                 (alerter_sent_count_key,
                                                  str(new_alerts_sent)))

                    else:
                        # Always alert to syslog, even if alert_threshold is not
                        # breached or if send_alert is not True
                        trigger_alert("syslog", datapoint, base_name,
                                      expiration_time, metric_trigger,
                                      algorithm)
                        logger.info('alert sent :: %s - %s - via syslog - %s' %
                                    (base_name, datapoint, algorithm))

            # Write anomalous_metrics to static webapp directory
            if len(self.anomalous_metrics) > 0:
                filename = path.abspath(
                    path.join(path.dirname(__file__), '..',
                              settings.ANOMALY_DUMP))
                with open(filename, 'w') as fh:
                    # Make it JSONP with a handle_data() function
                    anomalous_metrics = list(self.anomalous_metrics)
                    anomalous_metrics.sort(key=operator.itemgetter(1))
                    fh.write('handle_data(%s)' % anomalous_metrics)

            # Log progress
            logger.info('seconds to run    :: %.2f' % (time() - now))
            logger.info('total metrics     :: %d' % len(boundary_metrics))
            logger.info('total analyzed    :: %d' %
                        (len(boundary_metrics) - sum(exceptions.values())))
            logger.info('total anomalies   :: %d' %
                        len(self.anomalous_metrics))
            logger.info('exception stats   :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            self.send_graphite_metric(
                'skyline.boundary.' + SERVER_METRIC_PATH + 'run_time',
                '%.2f' % (time() - now))
            self.send_graphite_metric(
                'skyline.boundary.' + SERVER_METRIC_PATH + 'total_analyzed',
                '%.2f' % (len(boundary_metrics) - sum(exceptions.values())))
            self.send_graphite_metric(
                'skyline.boundary.' + SERVER_METRIC_PATH + 'total_anomalies',
                '%d' % len(self.anomalous_metrics))
            self.send_graphite_metric(
                'skyline.boundary.' + SERVER_METRIC_PATH + 'total_metrics',
                '%d' % len(boundary_metrics))
            for key, value in exceptions.items():
                send_metric = 'skyline.boundary.' + SERVER_METRIC_PATH + 'exceptions.%s' % key
                self.send_graphite_metric(send_metric, '%d' % value)
            for key, value in anomaly_breakdown.items():
                send_metric = 'skyline.boundary.' + SERVER_METRIC_PATH + 'anomaly_breakdown.%s' % key
                self.send_graphite_metric(send_metric, '%d' % value)

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE +
                                             settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration   :: %.2f' % time_human)
                self.send_graphite_metric(
                    'skyline.boundary.' + SERVER_METRIC_PATH + 'duration',
                    '%.2f' % time_human)
                self.send_graphite_metric(
                    'skyline.boundary.' + SERVER_METRIC_PATH + 'projected',
                    '%.2f' % projected)

            # Reset counters
            self.anomalous_metrics[:] = []

            # Only run once per minute
            seconds_to_run = int((time() - now))
            if seconds_to_run < 60:
                sleep_for_seconds = 60 - seconds_to_run
            else:
                sleep_for_seconds = 0
            if sleep_for_seconds > 0:
                logger.info('sleeping for %s seconds' % sleep_for_seconds)
                sleep(sleep_for_seconds)
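
The boundary alert loop in the example above reads anomalous_metric purely by position. As a reading aid - not part of skyline - the positions it relies on can be named; indices 3 and 4 are never referenced in this excerpt, so their names below are guesses:

from collections import namedtuple

# Field names inferred from the indexing in the loop above
BoundaryAnomaly = namedtuple('BoundaryAnomaly', [
    'datapoint',        # anomalous_metric[0]
    'metric_name',      # anomalous_metric[1]
    'expiration_time',  # anomalous_metric[2]
    'unknown_3',        # anomalous_metric[3] - not read in this excerpt
    'unknown_4',        # anomalous_metric[4] - not read in this excerpt
    'metric_trigger',   # anomalous_metric[5]
    'alert_threshold',  # anomalous_metric[6]
    'metric_alerters',  # anomalous_metric[7]
    'algorithm',        # anomalous_metric[8]
])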
Exemplo n.º 12
0
    def run(self):
        """
        Called when the process initializes.
        """
        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.info(
                    'skyline cannot connect to redis at socket path %s' %
                    settings.REDIS_SOCKET_PATH)
                sleep(10)
                logger.info('connecting to redis at socket path %s' %
                            settings.REDIS_SOCKET_PATH)
                self.redis_conn = StrictRedis(
                    unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue
            """
            Determine if any metric to analyze
            """
            while True:

                metric_var_files = [
                    f for f in listdir(settings.MIRAGE_CHECK_PATH)
                    if isfile(join(settings.MIRAGE_CHECK_PATH, f))
                ]
                if len(metric_var_files) == 0:
                    logger.info('sleeping no metrics...')
                    sleep(10)
                else:
                    sleep(1)

                # Clean up old files
                now_timestamp = time()
                stale_age = now_timestamp - settings.MIRAGE_STALE_SECONDS
                for current_file in listdir(settings.MIRAGE_CHECK_PATH):
                    if os.path.isfile(settings.MIRAGE_CHECK_PATH + "/" +
                                      current_file):
                        t = os.stat(settings.MIRAGE_CHECK_PATH + "/" +
                                    current_file)
                        c = t.st_ctime
                        # delete file if older than a week
                        if c < stale_age:
                            os.remove(settings.MIRAGE_CHECK_PATH + "/" +
                                      current_file)
                            logger.info('removed %s' % (current_file))

                # Discover metric to analyze
                metric_var_files = ''
                metric_var_files = [
                    f for f in listdir(settings.MIRAGE_CHECK_PATH)
                    if isfile(join(settings.MIRAGE_CHECK_PATH, f))
                ]
                if len(metric_var_files) > 0:
                    break

            metric_var_files_sorted = sorted(metric_var_files)
            metric_check_file = settings.MIRAGE_CHECK_PATH + "/" + metric_var_files_sorted[0]

            logger.info('processing %s' % metric_var_files_sorted[0])

            # Spawn processes
            pids = []
            MIRAGE_PROCESSES = 1
            run_timestamp = int(now)
            for i in range(1, MIRAGE_PROCESSES + 1):
                p = Process(target=self.spin_process, args=(i, run_timestamp))
                pids.append(p)
                p.start()

            # Send wait signal to zombie processes
            for p in pids:
                p.join()

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.mirage_anomaly_breakdown_q.get_nowait()
                    if key not in anomaly_breakdown.keys():
                        anomaly_breakdown[key] = value
                    else:
                        anomaly_breakdown[key] += value
                except Empty:
                    break

            while 1:
                try:
                    key, value = self.mirage_exceptions_q.get_nowait()
                    if key not in exceptions.keys():
                        exceptions[key] = value
                    else:
                        exceptions[key] += value
                except Empty:
                    break

            for metric_variable in self.metric_variables:
                if metric_variable[0] == 'metric_name':
                    metric_name = metric_variable[1]
                if metric_variable[0] == 'metric_value':
                    metric_value = metric_variable[1]
                if metric_variable[0] == 'hours_to_resolve':
                    hours_to_resolve = metric_variable[1]
                if metric_variable[0] == 'metric_timestamp':
                    metric_timestamp = metric_variable[1]

            logger.info('analysis done - %s' % metric_name)

            # Send alerts
            # Calculate hours second order resolution to seconds
            logger.info('analyzed at %s hours resolution' % hours_to_resolve)
            second_order_resolution_seconds = int(hours_to_resolve) * 3600
            logger.info('analyzed at %s seconds resolution' %
                        second_order_resolution_seconds)

            if settings.MIRAGE_ENABLE_ALERTS:
                for alert in settings.ALERTS:
                    for metric in self.anomalous_metrics:
                        ALERT_MATCH_PATTERN = alert[0]
                        METRIC_PATTERN = metric[1]
                        alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                        pattern_match = alert_match_pattern.match(
                            METRIC_PATTERN)
                        if pattern_match:
                            cache_key = 'mirage.last_alert.%s.%s' % (alert[1],
                                                                     metric[1])
                            try:
                                last_alert = self.redis_conn.get(cache_key)
                                if not last_alert:
                                    self.redis_conn.setex(
                                        cache_key, alert[2], packb(metric[0]))
                                    trigger_alert(
                                        alert, metric,
                                        second_order_resolution_seconds)
                                    logger.info("Sent %s alert: For %s" %
                                                (alert[1], metric[1]))

                            except Exception as e:
                                logger.error(
                                    "could not send %s alert for %s: %s" %
                                    (alert[1], metric[1], e))

            if settings.NEGATE_ANALYZER_ALERTS:
                if len(self.anomalous_metrics) == 0:
                    for negate_alert in settings.ALERTS:
                        for not_anomalous_metric in self.not_anomalous_metrics:
                            NEGATE_ALERT_MATCH_PATTERN = negate_alert[0]
                            NOT_ANOMALOUS_METRIC_PATTERN = not_anomalous_metric[1]
                            alert_match_pattern = re.compile(
                                NEGATE_ALERT_MATCH_PATTERN)
                            negate_pattern_match = alert_match_pattern.match(
                                NOT_ANOMALOUS_METRIC_PATTERN)
                            if negate_pattern_match:
                                try:
                                    logger.info("Negate alert sent: For %s" %
                                                (not_anomalous_metric[1]))
                                    trigger_negater(
                                        negate_alert, not_anomalous_metric,
                                        second_order_resolution_seconds,
                                        metric_value)
                                except Exception as e:
                                    logger.error("couldn't send alert: %s" % e)

            # Log progress

            if len(self.anomalous_metrics) > 0:
                logger.info('seconds since last anomaly :: %.2f' %
                            (time() - now))
                logger.info('total anomalies   :: %d' %
                            len(self.anomalous_metrics))
                logger.info('exception stats   :: %s' % exceptions)
                logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Reset counters
            self.anomalous_metrics[:] = []
            self.not_anomalous_metrics[:] = []

            # Reset metric_variables
            self.metric_variables[:] = []

            # Sleep if it went too fast
            if time() - now < 1:
                logger.info('sleeping due to low run time...')
                sleep(1)
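
Example 14 below shows the Analyzer writing the Mirage check files this loop consumes: plain text files of key = "value" lines. A minimal sketch of a parser producing the metric_variables pairs read above, assuming that file format; the mapping of the file keys 'metric' and 'value' onto 'metric_name' and 'metric_value' is an assumption based on how the two excerpts fit together:

def load_metric_variables(metric_check_file):
    # Sketch only: parse 'key = "value"' lines from a Mirage check file
    # into the (name, value) pairs the run loop above iterates
    key_map = {'metric': 'metric_name', 'value': 'metric_value'}
    metric_variables = []
    with open(metric_check_file) as fh:
        for line in fh:
            if ' = ' not in line:
                continue
            key, _, raw = line.partition(' = ')
            value = raw.strip().strip('"')
            metric_variables.append((key_map.get(key, key), value))
    return metric_variables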
Exemplo n.º 13
0
    def run(self):
        """
        Called when the process initializes.
        """
        while 1:
            now = time()
            # Make sure Redis is up
            try:
                self.ring.check_connections()
            except:
                sleep(10)
                self.ring = RedisRing(settings.REDIS_BACKENDS)
                continue

            # Discover unique metrics
            unique_metrics = list(self.ring.run('smembers', settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info('WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                p.start()

            # Send wait signal to zombie processes
            for p in pids:
                p.join()

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.anomaly_breakdown_q.get_nowait()
                    if key not in anomaly_breakdown.keys():
                        anomaly_breakdown[key] = value
                    else:
                        anomaly_breakdown[key] += value
                except Empty:
                    break

            while 1:
                try:
                    key, value = self.exceptions_q.get_nowait()
                    if key not in exceptions.keys():
                        exceptions[key] = value
                    else:
                        exceptions[key] += value
                except Empty:
                    break

            # Send alerts
            if settings.ENABLE_ALERTS:
                for alert in settings.ALERTS:
                    for metric in self.anomalous_metrics:
                        if alert[0] in metric[1]:
                            cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
                            try:
                                last_alert = self.ring.run('get', cache_key)
                                if not last_alert:
                                    self.ring.run('setex', cache_key, alert[2], packb(metric[0]))
                                    trigger_alert(alert, metric)

                            except Exception as e:
                                logger.error("couldn't send alert: %s" % e)

            # Log progress
            logger.info('seconds to run    :: %.2f' % (time() - now))
            logger.info('total metrics     :: %d' % len(unique_metrics))
            logger.info('total analyzed    :: %d' % (len(unique_metrics) - sum(exceptions.values())))
            logger.info('total anomalies   :: %d' % len(self.anomalous_metrics))
            logger.info('exception stats   :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            if settings.GRAPHITE_HOST != '':
                host = settings.GRAPHITE_HOST.replace('http://', '')
                system('echo skyline.analyzer.run_time %.2f %s | nc -w 3 %s 2003' % ((time() - now), now, host))
                system('echo skyline.analyzer.total_analyzed %d %s | nc -w 3 %s 2003' % ((len(unique_metrics) - sum(exceptions.values())), now, host))

            # Check canary metric
            raw_series = self.ring.run('get', settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration   :: %.2f' % time_human)
                if settings.GRAPHITE_HOST != '':
                    host = settings.GRAPHITE_HOST.replace('http://', '')
                    system('echo skyline.analyzer.duration %.2f %s | nc -w 3 %s 2003' % (time_human, now, host))
                    system('echo skyline.analyzer.projected %.2f %s | nc -w 3 %s 2003' % (projected, now, host))

            # Reset counters
            self.anomalous_metrics[:] = []

            # Sleep if it went too fast
            if time() - now < 5:
                logger.info('sleeping due to low run time...')
                sleep(10)
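
The RedisRing used in the example above is not included in this excerpt. A minimal sketch of such a wrapper, matching the ring.run() and ring.check_connections() calls shown, and assuming settings.REDIS_BACKENDS is a list of (host, port) pairs sharded by hashing the key - the real RedisRing may differ:

from zlib import crc32

from redis import StrictRedis


class RedisRing(object):
    # Sketch only: distribute keys across several Redis backends
    def __init__(self, backends):
        self.nodes = [StrictRedis(host=h, port=p) for h, p in backends]

    def check_connections(self):
        for node in self.nodes:
            node.ping()

    def run(self, command, key, *args):
        # Route the command to the node that owns the key, e.g.
        # ring.run('setex', cache_key, ttl, value)
        node = self.nodes[crc32(key) % len(self.nodes)]
        return getattr(node, command)(key, *args)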
Exemplo n.º 14
0
    def run(self):
        """
        Called when the process initializes.
        """
        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Discover unique metrics
            unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info('WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                p.start()

            # Send wait signal to zombie processes
            for p in pids:
                p.join()

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.anomaly_breakdown_q.get_nowait()
                    if key not in anomaly_breakdown.keys():
                        anomaly_breakdown[key] = value
                    else:
                        anomaly_breakdown[key] += value
                except Empty:
                    break

            while 1:
                try:
                    key, value = self.exceptions_q.get_nowait()
                    if key not in exceptions.keys():
                        exceptions[key] = value
                    else:
                        exceptions[key] += value
                except Empty:
                    break

            # Send alerts
            if settings.ENABLE_ALERTS:
                for alert in settings.ALERTS:
                    for metric in self.anomalous_metrics:
                        ALERT_MATCH_PATTERN = alert[0]
                        METRIC_PATTERN = metric[1]
                        alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                        pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                        if pattern_match:
                            cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
                            try:
                                last_alert = self.redis_conn.get(cache_key)
                                if not last_alert:
                                    try:
                                        SECOND_ORDER_RESOLUTION_FULL_DURATION = alert[3]
                                        logger.info('mirage check      :: %s' % (metric[1]))
                                        # Write anomalous metric to test at second
                                        # order resolution by crucible to the check
                                        # file
                                        metric_timestamp = int(time())
                                        anomaly_check_file = '%s/%s.%s.txt' % (settings.MIRAGE_CHECK_PATH, metric_timestamp, metric[1])
                                        with open(anomaly_check_file, 'w') as fh:
                                            # metric_name, anomalous datapoint, hours to resolve, timestamp
                                            fh.write('metric = "%s"\nvalue = "%s"\nhours_to_resolve = "%s"\nmetric_timestamp = "%s"\n' % (metric[1], metric[0], alert[3], metric_timestamp))
                                            logger.info('added mirage check :: %s,%s,%s' % (metric[1], metric[0], alert[3]))
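                                        # If full duration alerts are enabled,
                                        # alert now as well; otherwise defer to
                                        # Mirage, which alerts after analyzing
                                        # the check file written above at
                                        # second order resolution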
                                        if settings.ENABLE_FULL_DURATION_ALERTS:
                                            self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                            trigger_alert(alert, metric)
                                    except:
                                        self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                        trigger_alert(alert, metric)
                            except Exception as e:
                                logger.error("couldn't send alert: %s" % e)

            # Write anomalous_metrics to static webapp directory
            if len(self.anomalous_metrics) > 0:
                filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP))
                with open(filename, 'w') as fh:
                    # Make it JSONP with a handle_data() function
                    anomalous_metrics = list(self.anomalous_metrics)
                    anomalous_metrics.sort(key=operator.itemgetter(1))
                    fh.write('handle_data(%s)' % anomalous_metrics)

            # Log progress
            logger.info('seconds to run    :: %.2f' % (time() - now))
            logger.info('total metrics     :: %d' % len(unique_metrics))
            logger.info('total analyzed    :: %d' % (len(unique_metrics) - sum(exceptions.values())))
            logger.info('total anomalies   :: %d' % len(self.anomalous_metrics))
            logger.info('exception stats   :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            self.send_graphite_metric('skyline.analyzer.' + SERVER_METRIC_PATH + 'run_time', '%.2f' % (time() - now))
            self.send_graphite_metric('skyline.analyzer.' + SERVER_METRIC_PATH + 'total_analyzed', '%.2f' % (len(unique_metrics) - sum(exceptions.values())))
            self.send_graphite_metric('skyline.analyzer.' + SERVER_METRIC_PATH + 'total_anomalies', '%d' % len(self.anomalous_metrics))
            self.send_graphite_metric('skyline.analyzer.' + SERVER_METRIC_PATH + 'total_metrics', '%d' % len(unique_metrics))
            for key, value in exceptions.items():
                send_metric = 'skyline.analyzer.' + SERVER_METRIC_PATH + 'exceptions.%s' % key
                self.send_graphite_metric(send_metric, '%d' % value)
            for key, value in anomaly_breakdown.items():
                send_metric = 'skyline.analyzer.' + SERVER_METRIC_PATH + 'anomaly_breakdown.%s' % key
                self.send_graphite_metric(send_metric, '%d' % value)

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration   :: %.2f' % time_human)
                self.send_graphite_metric('skyline.analyzer.' + SERVER_METRIC_PATH + 'duration', '%.2f' % time_human)
                self.send_graphite_metric('skyline.analyzer.' + SERVER_METRIC_PATH + 'projected', '%.2f' % projected)

            # Reset counters
            self.anomalous_metrics[:] = []

            # Sleep if it went too fast
            if time() - now < 5:
                logger.info('sleeping due to low run time...')
                sleep(10)
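
Every example above hands the actual notification off to trigger_alert. A minimal sketch of such a dispatcher, assuming alerter functions named alert_<strategy> (e.g. alert_smtp) collected in an alerters module - an assumption based on the strategy names used above, not necessarily skyline's implementation:

def trigger_alert(alert, metric):
    # alert is (namespace, strategy, expiration); metric is the
    # (datapoint, metric_name) tuple built by the run loops above.
    # alerters and logger are assumed module-level names, as in the
    # excerpts above.
    strategy = 'alert_%s' % alert[1]
    try:
        getattr(alerters, strategy)(alert, metric)
    except Exception as e:
        logger.error("couldn't trigger %s alert: %s" % (strategy, e))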