Example #1
File: worker.py Project: nevins-b/skyline
 def __init__(self, parent_pid, context, canary=False):
     super(Worker, self).__init__()
     self.context = context
     self.ring = RedisRing(settings.REDIS_BACKENDS)
     self.parent_pid = parent_pid
     self.daemon = True
     self.canary = canary
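
A minimal construction sketch may help read the signature above. Apart from the Worker(...) call itself, everything here (the zmq context, os.getpid(), the canary flag value) is an illustrative assumption, not something the project prescribes:

# Hypothetical usage sketch -- only the Worker(...) call mirrors the code above.
from os import getpid

import zmq

context = zmq.Context()                   # zmq context shared with the parent process
worker = Worker(parent_pid=getpid(), context=context, canary=True)
worker.start()                            # Process.start() runs Worker.run() in a child process
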
Example #2
 def __init__(self, parent_pid):
     """
     Initialize the Analyzer
     """
     super(Analyzer, self).__init__()
     self.ring = RedisRing(settings.REDIS_BACKENDS)
     self.daemon = True
     self.parent_pid = parent_pid
     self.current_pid = getpid()
     self.anomalous_metrics = Manager().list()
     self.exceptions_q = Queue()
     self.anomaly_breakdown_q = Queue()
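
The Manager().list() and the two Queue()s are there because the per-metric analysis runs in separate multiprocessing processes (see spin_process further below), so ordinary attributes set in a child would never reach the parent. A small self-contained sketch of the same pattern, with made-up names:

# Illustrative only: a Manager list and a Queue survive the process boundary,
# while a plain Python list would not.
from multiprocessing import Manager, Process, Queue

def child(shared, q):
    shared.append('visible to the parent')   # proxied back through the Manager
    q.put(('Boring', 1))                     # same (key, count) style as exceptions_q

if __name__ == '__main__':
    shared = Manager().list()
    q = Queue()
    p = Process(target=child, args=(shared, q))
    p.start()
    p.join()
    print(list(shared), q.get())
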
Example #3
File: worker.py Project: nevins-b/skyline
    def run(self):
        """
        Called when the process initializes.
        """
        logger.info('started worker')

        FULL_NAMESPACE = settings.FULL_NAMESPACE
        MINI_NAMESPACE = settings.MINI_NAMESPACE
        MAX_RESOLUTION = settings.MAX_RESOLUTION

        self.conn = self.context.socket(zmq.PULL)
        self.conn.connect(
            '{0}:{1}'.format(settings.RELAY_HOST, settings.RELAY_PUBLISH_PORT))
        self.poller = zmq.Poller()
        self.poller.register(self.conn, zmq.POLLIN)

        while 1:
            sockets = dict(self.poller.poll())
            if self.conn in sockets and sockets[self.conn] == zmq.POLLIN:
                # Make sure Redis is up
                try:
                    self.ring.check_connections()
                except:
                    sleep(10)
                    self.ring = RedisRing(settings.REDIS_BACKENDS)
                    continue

                try:
                    # Get a chunk from the queue with a 15 second timeout
                    chunk = self.q.get(True, 15)
                    now = time()

                    for metric in chunk:

                        # Check if we should skip it
                        if self.in_skip_list(metric[0]):
                            continue

                        # Bad data coming in
                        if metric[1][0] < now - MAX_RESOLUTION:
                            continue

                        for ns in [FULL_NAMESPACE, MINI_NAMESPACE]:
                            key = ''.join((ns, metric[0]))
                            self.ring.run('append', key, packb(metric[1]))
                            ukey = ''.join((ns, 'unique_metrics.', metric[0]))
                            self.ring.run('sadd', ukey, key)

                except Exception as e:
                    logger.error("worker error: " + str(e))
Example #4
class Analyzer(Thread):
    def __init__(self, parent_pid):
        """
        Initialize the Analyzer
        """
        super(Analyzer, self).__init__()
        self.ring = RedisRing(settings.REDIS_BACKENDS)
        self.daemon = True
        self.parent_pid = parent_pid
        self.current_pid = getpid()
        self.anomalous_metrics = Manager().list()
        self.exceptions_q = Queue()
        self.anomaly_breakdown_q = Queue()

    def check_if_parent_is_alive(self):
        """
        Self explanatory
        """
        try:
            kill(self.current_pid, 0)
            kill(self.parent_pid, 0)
        except:
            exit(0)

    def spin_process(self, i, unique_metrics):
        """
        Assign a bunch of metrics for a process to analyze.
        """
        process_key = '.'.join(['skyline', 'analyzer', socket.gethostname(), str(i)])
        alive_key = '.'.join([process_key, 'alive'])
        self.ring.run('set', alive_key, 1)
        self.ring.run('expire', alive_key, 30)
        # Prune index entries whose alive keys have expired
        processes = list(self.ring.run('zrange', settings.ANALYZER_PROCESS_KEY, 0, -1))
        for key in processes:
            value = self.ring.run('get', key)
            if not value:
                self.ring.run('zrem', settings.ANALYZER_PROCESS_KEY, key)

        # Add current process to index and determine position
        if not self.ring.run('zscore', settings.ANALYZER_PROCESS_KEY, alive_key):
            self.ring.run('zadd', settings.ANALYZER_PROCESS_KEY, time(), alive_key)
        self.ring.run('expire', settings.ANALYZER_PROCESS_KEY, 60)
        process_position = self.ring.run('zrank', settings.ANALYZER_PROCESS_KEY, alive_key) + 1
        process_count = self.ring.run('zcard', settings.ANALYZER_PROCESS_KEY)

        # If there are fewer processes than we expect to be running, assume
        # the others will start
        if process_count < settings.ANALYZER_PROCESSES:
            process_count = settings.ANALYZER_PROCESSES

        # Discover assigned metrics
        keys_per_processor = int(ceil(float(len(unique_metrics)) / float(process_count)))
        if process_position == process_count:
            assigned_max = len(unique_metrics)
        else:
            assigned_max = process_position * keys_per_processor
        assigned_min = assigned_max - keys_per_processor
        assigned_keys = range(assigned_min, assigned_max)

        # Compile assigned metrics
        assigned_metrics = [unique_metrics[index] for index in assigned_keys]

        # Check if this process is unnecessary
        if len(assigned_metrics) == 0:
            return

        # Multi get series
        raw_assigned = self.ring.run('mget', assigned_metrics)

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        # Distill timeseries strings into lists
        for i, metric_name in enumerate(assigned_metrics):
            self.check_if_parent_is_alive()

            try:
                raw_series = raw_assigned[i]
                unpacker = Unpacker(use_list = False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)

                anomalous, ensemble, datapoint = run_selected_algorithm(timeseries, metric_name)

                # If it's anomalous, add it to list
                if anomalous:
                    base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
                    metric = [datapoint, base_name]
                    self.anomalous_metrics.append(metric)

                    # Get the anomaly breakdown - who returned True?
                    for index, value in enumerate(ensemble):
                        if value:
                            algorithm = settings.ALGORITHMS[index]
                            anomaly_breakdown[algorithm] += 1

            # It could have been deleted by the Roomba
            except TypeError:
                exceptions['DeletedByRoomba'] += 1
            except TooShort:
                exceptions['TooShort'] += 1
            except Stale:
                exceptions['Stale'] += 1
            except Incomplete:
                exceptions['Incomplete'] += 1
            except Boring:
                exceptions['Boring'] += 1
            except:
                exceptions['Other'] += 1
                logger.info(traceback.format_exc())

        # If anomalies were detected, pack and write the anomaly data to Redis
        if len(self.anomalous_metrics) > 0:
            packed = Packer().pack(list(self.anomalous_metrics))
            self.ring.run('set', process_key, packed)
            # Expire the key in 30s so anomalies don't show up for too long
            self.ring.run('expire', process_key, 30)
            self.ring.run('sadd', settings.ANALYZER_ANOMALY_KEY, process_key)
            # Expire the key in 60s so anomalies don't show up for too long
            self.ring.run('expire', settings.ANALYZER_ANOMALY_KEY, 60)

        # Add values to the queues so the parent process can collate
        for key, value in anomaly_breakdown.items():
            self.anomaly_breakdown_q.put((key, value))

        for key, value in exceptions.items():
            self.exceptions_q.put((key, value))

    def run(self):
        """
        Called when the process initializes.
        """
        while 1:
            now = time()
            # Make sure Redis is up
            try:
                self.ring.check_connections()
            except:
                sleep(10)
                self.ring = RedisRing(settings.REDIS_BACKENDS)
                continue

            # Discover unique metrics
            unique_metrics = list(self.ring.run('smembers', settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info('WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                p.start()

            # Wait for the spawned processes to finish (keeps them from becoming zombies)
            for p in pids:
                p.join()

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.anomaly_breakdown_q.get_nowait()
                    if key not in anomaly_breakdown.keys():
                        anomaly_breakdown[key] = value
                    else:
                        anomaly_breakdown[key] += value
                except Empty:
                    break

            while 1:
                try:
                    key, value = self.exceptions_q.get_nowait()
                    if key not in exceptions.keys():
                        exceptions[key] = value
                    else:
                        exceptions[key] += value
                except Empty:
                    break

            # Send alerts
            if settings.ENABLE_ALERTS:
                for alert in settings.ALERTS:
                    for metric in self.anomalous_metrics:
                        if alert[0] in metric[1]:
                            cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
                            try:
                                last_alert = self.ring.run('get', cache_key)
                                if not last_alert:
                                    self.ring.run('setex', cache_key, alert[2], packb(metric[0]))
                                    trigger_alert(alert, metric)

                            except Exception as e:
                                logger.error("couldn't send alert: %s" % e)

            # Log progress
            logger.info('seconds to run    :: %.2f' % (time() - now))
            logger.info('total metrics     :: %d' % len(unique_metrics))
            logger.info('total analyzed    :: %d' % (len(unique_metrics) - sum(exceptions.values())))
            logger.info('total anomalies   :: %d' % len(self.anomalous_metrics))
            logger.info('exception stats   :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            if settings.GRAPHITE_HOST != '':
                host = settings.GRAPHITE_HOST.replace('http://', '')
                system('echo skyline.analyzer.run_time %.2f %s | nc -w 3 %s 2003' % ((time() - now), now, host))
                system('echo skyline.analyzer.total_analyzed %d %s | nc -w 3 %s 2003' % ((len(unique_metrics) - sum(exceptions.values())), now, host))

            # Check canary metric
            raw_series = self.ring.run('get', settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list = False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration   :: %.2f' % time_human)
                if settings.GRAPHITE_HOST != '':
                    host = settings.GRAPHITE_HOST.replace('http://', '')
                    system('echo skyline.analyzer.duration %.2f %s | nc -w 3 %s 2003' % (time_human, now, host))
                    system('echo skyline.analyzer.projected %.2f %s | nc -w 3 %s 2003' % (projected, now, host))

            # Reset counters
            self.anomalous_metrics[:] = []

            # Sleep if it went too fast
            if time() - now < 5:
                logger.info('sleeping due to low run time...')
                sleep(10)
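
The index arithmetic in spin_process gives each process one contiguous slice of unique_metrics, with the last process running to the end of the list. A worked example with made-up numbers shows the resulting ranges, and the overlap that appears when the metric count is not an exact multiple of the process count:

# Worked example of the spin_process assignment arithmetic; values are illustrative.
from math import ceil

unique_metrics = ['metric.%d' % n for n in range(10)]
process_count = 3
keys_per_processor = int(ceil(float(len(unique_metrics)) / float(process_count)))   # 4

for process_position in range(1, process_count + 1):
    if process_position == process_count:
        assigned_max = len(unique_metrics)            # last process runs to the end
    else:
        assigned_max = process_position * keys_per_processor
    assigned_min = assigned_max - keys_per_processor
    print('%d -> indexes %d..%d' % (process_position, assigned_min, assigned_max - 1))
# 1 -> indexes 0..3, 2 -> indexes 4..7, 3 -> indexes 6..9
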
Example #5
    def run(self):
        """
        Called when the process initializes.
        """
        while 1:
            now = time()
            # Make sure Redis is up
            try:
                self.ring.check_connections()
            except:
                sleep(10)
                self.ring = RedisRing(settings.REDIS_BACKENDS)
                continue

            # Discover unique metrics
            unique_metrics = list(self.ring.run('smembers', settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info('WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                p.start()

            # Wait for the spawned processes to finish (keeps them from becoming zombies)
            for p in pids:
                p.join()

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.anomaly_breakdown_q.get_nowait()
                    if key not in anomaly_breakdown.keys():
                        anomaly_breakdown[key] = value
                    else:
                        anomaly_breakdown[key] += value
                except Empty:
                    break

            while 1:
                try:
                    key, value = self.exceptions_q.get_nowait()
                    if key not in exceptions.keys():
                        exceptions[key] = value
                    else:
                        exceptions[key] += value
                except Empty:
                    break

            # Send alerts
            if settings.ENABLE_ALERTS:
                for alert in settings.ALERTS:
                    for metric in self.anomalous_metrics:
                        if alert[0] in metric[1]:
                            cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
                            try:
                                last_alert = self.ring.run('get', cache_key)
                                if not last_alert:
                                    self.ring.run('setex', cache_key, alert[2], packb(metric[0]))
                                    trigger_alert(alert, metric)

                            except Exception as e:
                                logger.error("couldn't send alert: %s" % e)

            # Log progress
            logger.info('seconds to run    :: %.2f' % (time() - now))
            logger.info('total metrics     :: %d' % len(unique_metrics))
            logger.info('total analyzed    :: %d' % (len(unique_metrics) - sum(exceptions.values())))
            logger.info('total anomalies   :: %d' % len(self.anomalous_metrics))
            logger.info('exception stats   :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            if settings.GRAPHITE_HOST != '':
                host = settings.GRAPHITE_HOST.replace('http://', '')
                system('echo skyline.analyzer.run_time %.2f %s | nc -w 3 %s 2003' % ((time() - now), now, host))
                system('echo skyline.analyzer.total_analyzed %d %s | nc -w 3 %s 2003' % ((len(unique_metrics) - sum(exceptions.values())), now, host))

            # Check canary metric
            raw_series = self.ring.run('get', settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list = False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration   :: %.2f' % time_human)
                if settings.GRAPHITE_HOST != '':
                    host = settings.GRAPHITE_HOST.replace('http://', '')
                    system('echo skyline.analyzer.duration %.2f %s | nc -w 3 %s 2003' % (time_human, now, host))
                    system('echo skyline.analyzer.projected %.2f %s | nc -w 3 %s 2003' % (projected, now, host))

            # Reset counters
            self.anomalous_metrics[:] = []

            # Sleep if it went too fast
            if time() - now < 5:
                logger.info('sleeping due to low run time...')
                sleep(10)
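
The alerting block above throttles repeat notifications by writing a last_alert key with SETEX, so a metric that keeps misbehaving only alerts once per expiration window. A hedged sketch of the same check outside the class (the redis-py client and the (match substring, strategy, expiration seconds) layout of an ALERTS entry are assumptions inferred from the code above):

# Alert-throttle sketch -- names and values are illustrative.
import redis
from msgpack import packb

r = redis.StrictRedis()
alert = ('stats.web', 'smtp', 3600)                 # assumed (match substring, strategy, expiration seconds)
metric = (42.0, 'stats.web01.errors')               # (datapoint, base_name) as built in spin_process

cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
if not r.get(cache_key):
    r.setex(cache_key, alert[2], packb(metric[0]))  # suppress further alerts for alert[2] seconds
    # trigger_alert(alert, metric) would be called here
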
Example #6
File: worker.py Project: nevins-b/skyline
class Worker(Process):
    """
    The worker processes chunks from the queue and appends
    the latest datapoints to their respective timesteps in Redis.
    """
    def __init__(self, parent_pid, context, canary=False):
        super(Worker, self).__init__()
        self.context = context
        self.ring = RedisRing(settings.REDIS_BACKENDS)
        self.parent_pid = parent_pid
        self.daemon = True
        self.canary = canary

    def check_if_parent_is_alive(self):
        """
        Self explanatory.
        """
        try:
            kill(self.parent_pid, 0)
        except:
            exit(0)

    def in_skip_list(self, metric_name):
        """
        Check if the metric is in SKIP_LIST.
        """
        for to_skip in settings.SKIP_LIST:
            if to_skip in metric_name:
                return True

        return False

    def run(self):
        """
        Called when the process initializes.
        """
        logger.info('started worker')

        FULL_NAMESPACE = settings.FULL_NAMESPACE
        MINI_NAMESPACE = settings.MINI_NAMESPACE
        MAX_RESOLUTION = settings.MAX_RESOLUTION

        self.conn = self.context.socket(zmq.PULL)
        self.conn.connect(
            '{0}:{1}'.format(settings.RELAY_HOST, settings.RELAY_PUBLISH_PORT))
        self.poller = zmq.Poller()
        self.poller.register(self.conn, zmq.POLLIN)

        while 1:
            sockets = dict(self.poller.poll())
            if self.conn in sockets and sockets[self.conn] == zmq.POLLIN:
                # Make sure Redis is up
                try:
                    self.ring.check_connections()
                except:
                    sleep(10)
                    self.ring = RedisRing(settings.REDIS_BACKENDS)
                    continue

                try:
                    # Get a chunk from the queue with a 15 second timeout
                    chunk = self.q.get(True, 15)
                    now = time()

                    for metric in chunk:

                        # Check if we should skip it
                        if self.in_skip_list(metric[0]):
                            continue

                        # Bad data coming in
                        if metric[1][0] < now - MAX_RESOLUTION:
                            continue

                        for ns in [FULL_NAMESPACE, MINI_NAMESPACE]:
                            key = ''.join((ns, metric[0]))
                            self.ring.run('append', key, packb(metric[1]))
                            ukey = ''.join((ns, 'unique_metrics.', metric[0]))
                            self.ring.run('sadd', ukey, key)

                except Exception as e:
                    logger.error("worker error: " + str(e))