def __init__(self):
    """Read API-monitoring configuration from the environment.

    Monitoring is enabled by api_monitor=1; api_monitor_prometheus=1 selects
    the Prometheus backend, otherwise CloudWatch buffers are prepared.
    """
    self.enabled = ('api_monitor' in os.environ
                    and int(os.environ['api_monitor']) == 1)
    self.use_prometheus = ('api_monitor_prometheus' in os.environ
                           and int(os.environ['api_monitor_prometheus']) == 1)
    # BUG FIX: initialize unconditionally; stop_monitor() reads this attribute
    # and previously raised AttributeError when monitoring was disabled or
    # start_monitor() had never scheduled a flush.
    self._push_handle = None
    if self.enabled:
        # BUG FIX: the fallback was 'trews;' -- the trailing semicolon was a
        # typo that leaked into the metric dimension value.
        self.monitor_target = os.environ['db_name'].replace("_", "-") \
            if 'db_name' in os.environ else 'trews'
        if self.use_prometheus:
            self.prometheus = PrometheusMonitor()
        else:
            self._push_period_secs = int(os.environ['api_monitor_cw_push_period']) \
                if 'api_monitor_cw_push_period' in os.environ else 60
            # k8s pods have their pod name set as the hostname.
            stream_id = os.environ.get('HOSTNAME', 'api-testing')
            self.cw_metrics = FluentMetric().with_namespace(
                'OpsDX').with_stream_id(stream_id)
            # Latencies and request counters.
            self._counters = {}
            self._latencies = {}
            # General-purpose metrics.
            self._metrics = {}
            self._metric_specs = {}
def test_adding_timer_starts_timer():
    """A timer registered via with_timer() begins counting immediately."""
    timer_name = 'test_timer'
    metric = FluentMetric()
    metric.with_timer(timer_name)
    time.sleep(1)
    timer = metric.get_timer(timer_name)
    assert timer.start < arrow.utcnow()
    assert 1000 < timer.elapsed_in_ms() < 2000
def with_metric(self, **kwargs):
    """Attach a CloudWatch metric (namespace + name) to this tracker.

    Keyword Args:
        Namespace: CloudWatch namespace for the metric.
        Metric: metric name.
        Clean: when True, do not mark the tracker dirty (used when
            rehydrating persisted state).

    Returns:
        self, for fluent chaining.
    """
    ns = kwargs.get('Namespace')
    m = kwargs.get('Metric')
    clean = kwargs.get('Clean', False)
    # No-op when the metric is unchanged.
    if self.metric_namespace == ns and self.metric_name == m:
        return self
    self.metric_name = m
    self.metric_namespace = ns
    self.metric = FluentMetric().with_namespace(self.metric_namespace)
    if not clean:
        # BUG FIX: was `self.dirty = True`; the tracker's persistence flag is
        # `is_dirty` (the attribute __init__ initializes), so the old code
        # never flagged the tracker for a DB update.
        self.is_dirty = True
    return self
def test_can_push_dimensions():
    """push_dimensions() clears the active set; pop_dimensions() restores it."""
    dim_name = 'test_name'
    metric = FluentMetric()
    metric.with_dimension(dim_name, 'test_value')
    assert metric.does_dimension_exist(dim_name)
    metric.push_dimensions()
    assert not metric.dimensions
    metric.pop_dimensions()
    assert len(metric.dimensions) == 1
def test_removing_dimension_removes_dimension():
    """without_dimension() deletes a previously registered dimension."""
    dim_name = 'test_name'
    metric = FluentMetric()
    metric.with_dimension(dim_name, 'test_value')
    assert metric.does_dimension_exist(dim_name)
    metric.without_dimension(dim_name)
    assert not metric.does_dimension_exist(dim_name)
def test_dimension_does_not_duplicate():
    """Re-adding a dimension name replaces its value rather than duplicating."""
    dim_name = 'test_name'
    metric = FluentMetric()
    for expected in ('test_value1', 'test_value2'):
        metric.with_dimension(dim_name, expected)
        assert metric.get_dimension_value(dim_name) == expected
def __init__(self, source: str):
    """Set up hit/data counters and a CloudWatch metric stream for *source*."""
    self.INTERVAL = 60
    self.hits_counter = AtomicCounter()
    self.data_counter = AtomicCounter()
    self._exit_event = threading.Event()
    # Build the metric fluently: 60s storage resolution, fixed namespace,
    # and the connector source recorded as a dimension.
    metrics = FluentMetric()
    metrics = metrics.with_storage_resolution(60)
    metrics = metrics.with_namespace("SentinelConnectors")
    self.metrics = metrics.with_dimension("Source", source)
def test_can_add_multiple_timers():
    """Timers are tracked independently and keep running once started."""
    first, second = 'test_timer_1', 'test_timer_2'
    metric = FluentMetric()
    metric.with_timer(first)
    time.sleep(1)
    timer_one = metric.get_timer(first)
    assert timer_one.start < arrow.utcnow()
    assert 1000 < timer_one.elapsed_in_ms() < 2000
    metric.with_timer(second)
    time.sleep(1)
    timer_two = metric.get_timer(second)
    assert timer_two.start < arrow.utcnow()
    assert 1000 < timer_two.elapsed_in_ms() < 2000
    # The first timer has kept running through the second sleep.
    assert timer_one.elapsed_in_ms() > 2000
class TrackerBase(object):
    """Base class for hierarchical progress trackers.

    A tracker has an id, optional parent/children links, timing state
    (``TrackerState``), and an optional CloudWatch metric (``FluentMetric``)
    logged on completion. The ``is_dirty`` flag drives persistence through
    ``db_conn``.

    NOTE(review): several members reference attributes not defined in this
    class (``trackers``, ``progress_trackers``, ``has_metric``) -- assumed to
    be provided by a subclass such as ProgressTracker; confirm.
    """

    def __init__(self, **kwargs):
        self.friendly_id = kwargs.get('FriendlyId', None)
        self.id = kwargs.get('Id', str(uuid.uuid4()))
        self.name = kwargs.get('Name', self.id)
        self.children = []
        self.state = TrackerState()
        self.estimated_seconds = kwargs.get('EstimatedSeconds', 0)
        self.parent_id = kwargs.get('ParentId', None)
        self.status_msg = None
        self.last_update = arrow.utcnow()
        self.source = kwargs.get('Source', None)
        self.is_in_progress = False
        self.is_done = False
        self.status = 'Not started'
        self.metric = None
        self.metric_name = None
        self.metric_namespace = None
        # BUG FIX: start_time was never initialized, so with_start_time()
        # raised AttributeError when called before start().
        self.start_time = None
        self.finish_time = None
        # BUG FIX: autosave was never initialized, so with_autosave()
        # raised AttributeError on first use.
        self.autosave = False
        self.db_conn = kwargs.get('DbConnection')
        self.parent = None
        self.is_dirty = True
        self.has_parallel_children = kwargs.get('HasParallelChildren', False)

    def load(self, id):
        """Load this tracker's persisted record by id."""
        self.id = id
        return self.db_conn.get_all_by_id(id)

    def refresh(self):
        # NOTE(review): rebinding the local name `self` has no effect on the
        # caller's object; preserved as-is pending clarification of intent.
        self = self.load(self.id)

    def print_node(self, lvl=0):
        """Pretty-print this node and its subtree as an ASCII tree."""
        # BUG FIX: converted Python 2 `print` statements to the print()
        # function (the old form is a syntax error on Python 3).
        if lvl > 0:
            spc = lvl - 1
            print('{}|'.format(' ' * spc * 2))
            print('{}|'.format(' ' * spc * 2))
            print('{}{}{}'.format(' ' * spc * 2, '-' * 2, self.name))
        else:
            print(self.name)
        for child in self.children:
            child.print_node(lvl + 1)

    def print_tree(self):
        """Print the whole tree rooted at this tracker."""
        self.print_node()

    def get_tracker_progress_total(self, pe=None):
        """Return (current, total) progress summed over progress_trackers."""
        if pe is None:
            pe = self
        current = 0
        total = 0
        # NOTE(review): `progress_trackers` is not defined in this class;
        # assumed subclass-provided -- confirm.
        for key in pe.progress_trackers.keys():
            entry = pe.progress_trackers[key]
            total = total + entry.progress_total
            current = current + entry.current_progress
        return current, total

    def get_progress_remaining(self):
        """Fraction of progress remaining (1.0 when no totals recorded)."""
        c, t = self.get_tracker_progress_total()
        # ROBUSTNESS FIX: guard against ZeroDivisionError and force float
        # division (under Python 2 the old int division truncated to 0/1).
        return 1.0 if t == 0 else 1.0 - (float(c) / t)

    def get_progress_complete(self):
        """Fraction of progress complete (0.0 when no totals recorded)."""
        c, t = self.get_tracker_progress_total()
        return 0.0 if t == 0 else float(c) / t

    def get_full_key(self):
        # Both branches yield the id; the format() branch preserves the
        # historical (string) shape for trackers that have a parent.
        if not self.parent_id:
            return self.id
        return "{}".format(self.id)

    def inc_progress(self, val=1):
        """Delegate a progress increment to the DB connection."""
        self.db_conn.inc_progress(val)

    @property
    def stats(self):
        # NOTE(review): `trackers` is not defined in this class; assumed
        # subclass-provided -- confirm.
        return TrackerStats(Id=self.id, Trackers=self.trackers)

    def get_stats(self, **kwargs):
        """Recursively count descendants of the tracker identified by Id."""
        id = kwargs.get('Id', None)
        tot = 0
        t = self.trackers[id] if id else self.trackers[self.id]
        if len(t.children):
            for k in t.children:
                # BUG FIX: was `self.get_stats(k)` -- a positional argument
                # to a **kwargs-only method always raised TypeError.
                # NOTE(review): children are tracker objects (see with_child),
                # so the child's id is passed -- confirm against callers.
                tot = tot + self.get_stats(Id=k.id)
            tot = tot + len(t.children)
        return tot

    @property
    def total_estimate(self):
        """Estimated seconds for this subtree.

        Parallel children contribute their longest branch; sequential
        children contribute the sum; a leaf reports estimated_seconds.
        """
        if not len(self.children):
            return self.estimated_seconds
        longest = 0
        secs = 0
        for child in self.children:
            child_total = child.total_estimate
            if self.has_parallel_children:
                if child_total > longest:
                    longest = child_total
            else:
                secs = secs + child_total
        return longest if self.has_parallel_children else secs

    def get_children_by_status(self, status):
        """Return all descendants whose status is in *status* ([] = all)."""
        items = []
        for child in self.children:
            if len(status) == 0 or child.status in status:
                items.append(child)
            match = child.get_children_by_status(status)
            if len(match) > 0:
                items.extend(match)
        return items

    def start(self, **kwargs):
        """Mark the tracker (and optionally its ancestors) as started.

        Keyword Args:
            Message: optional status message.
            StartTime: explicit start time (defaults to now).
            Parents: when truthy, start ancestors first.

        Raises:
            Exception: if this tracker has a parent that is not started.
        """
        if self.is_in_progress:
            logging.warning('{} is already started. Ignoring start()'.format(
                self.id))
            return self
        if self.is_done:
            # BUG FIX: the id was never interpolated (missing .format call).
            logging.warning('{} is done. Ignoring start()'.format(self.id))
            return self
        m = kwargs.get('Message', None)
        if m:
            self.with_status_msg(m)
        self.start_time = kwargs.get('StartTime', arrow.utcnow())
        if bool(kwargs.get('Parents', False)):
            if self.parent:
                self.parent.start(Parents=True)
        if self.parent and not self.parent.is_in_progress:
            raise Exception("You can't start a tracker if the parent isn't "
                            'started')
        self.state.start(StartTime=self.start_time,
                         EstimatedSeconds=self.estimated_seconds)
        self.status = 'In Progress'
        self.is_in_progress = True
        self.is_dirty = True
        return self

    def with_status_msg(self, s, clean=False):
        """Set the status message; marks dirty unless *clean*."""
        if not self.status_msg == s:
            self.status_msg = s
            if not clean:
                # BUG FIX: was `self.dirty`; the persistence flag is is_dirty.
                self.is_dirty = True
        return self

    @property
    def remaining_time_in_seconds(self):
        return self.total_estimate - self.elapsed_time_in_seconds

    @property
    def elapsed_time_in_seconds(self):
        return self.state.elapsed_time_in_seconds()

    def update(self, recursive=True):
        """Persist dirty state to the DB (ancestors first, then children)."""
        p = self.parent
        if p:
            p.update(False)
        if self.is_dirty:
            try:
                self.db_conn.update_tracker(self)
            except Exception as e:
                logging.error('Error persisting to DB: {}'.format(str(e)))
                raise
            self.is_dirty = False
        if recursive:
            for c in self.children:
                c.update()
        return self

    def to_update_item(self):
        """Build a DynamoDB-style UpdateExpression and attribute-value map."""
        ue = 'SET TrackerName=:name'
        eav = {':name': self.name}
        if self.estimated_seconds:
            ue = ue + ', EstimatedSeconds=:est_sec'
            eav[':est_sec'] = self.estimated_seconds
        if self.state.start_time:
            ue = ue + ', StartTime=:start'
            eav[':start'] = self.state.start_time.isoformat()
        if self.state.finish_time:
            ue = ue + ', FinishTime=:finish'
            eav[':finish'] = self.state.finish_time.isoformat()
        if self.status_msg:
            ue = ue + ', StatusMessage=:status_msg'
            eav[':status_msg'] = self.status_msg
        if self.friendly_id:
            ue = ue + ', FriendlyId=:fid'
            eav[':fid'] = self.friendly_id
        if self.last_update:
            ue = ue + ', LastUpdate=:l_u'
            eav[':l_u'] = self.last_update.isoformat()
        if self.source:
            ue = ue + ', Source=:source'
            eav[':source'] = self.source
        if self.metric_namespace:
            ue = ue + ', MetricNamespace=:ns'
            eav[':ns'] = self.metric_namespace
        if self.status:
            ue = ue + ', TrackerStatus=:status'
            eav[':status'] = self.status
        if self.metric_name:
            ue = ue + ', MetricName=:m'
            eav[':m'] = self.metric_name
        ue = ue + ', IsInProgress=:i'
        eav[':i'] = self.is_in_progress
        ue = ue + ', HasParallelChildren=:hpc'
        eav[':hpc'] = self.has_parallel_children
        ue = ue + ', IsDone=:d'
        eav[':d'] = self.is_done
        # Round-trip through JSON to coerce values into plain JSON types.
        return ue, json.loads(json.dumps(eav))

    def to_json(self):
        """Serialize to the compact dict format understood by from_json()."""
        j = {'name': self.name}
        if self.estimated_seconds:
            j['est_sec'] = self.estimated_seconds
        if self.state.start_time:
            j['start'] = self.state.start_time.isoformat()
        if self.state.finish_time:
            j['finish'] = self.state.finish_time.isoformat()
        if self.status_msg:
            j['st_msg'] = self.status_msg
        if self.parent_id:
            j['pid'] = self.parent_id
        if self.friendly_id:
            j['fid'] = self.friendly_id
        if self.last_update:
            j['l_u'] = self.last_update.isoformat()
        if self.source:
            j['s'] = self.source
        if self.metric_namespace:
            j['m_ns'] = self.metric_namespace
        if self.metric:
            j['m'] = self.metric_name
        j['in_p'] = self.is_in_progress
        j['has_p'] = self.has_parallel_children
        j['d'] = self.is_done
        if self.status:
            j['st'] = self.status
        return json.loads(json.dumps(j))

    @staticmethod
    def from_json(id, j):
        """Rebuild a ProgressTracker from a to_json() dict (marked clean)."""
        t = ProgressTracker(Id=id)
        if 'name' in j:
            t.with_name(j['name'], True)
        if 'est_sec' in j:
            t.with_estimated_seconds(j['est_sec'], True)
        if 'start' in j:
            t.with_start_time(arrow.get(j['start']), True)
        if 'finish' in j:
            t.with_finish_time(arrow.get(j['finish']), True)
        if 'st_msg' in j:
            # BUG FIX: the status message is plain text; it was wrongly
            # passed through arrow.get() (to_json stores it unconverted).
            t.with_status_msg(j['st_msg'], True)
        if 'fid' in j:
            t.with_friendly_id(j['fid'], True)
        if 'pid' in j:
            t.parent_id = j['pid']
        if 'in_p' in j:
            t.is_in_progress = str(j['in_p']) == 'True'
        if 'st' in j:
            t.status = j['st']
        if 's' in j:
            t.with_source(j['s'], True)
        if 'd' in j:
            t.is_done = str(j['d']) == 'True'
        if 'has_p' in j:
            t.has_parallel_children = str(j['has_p']) == 'True'
        if 'm_ns' in j and 'm' in j:
            t.with_metric(Namespace=j['m_ns'], Metric=j['m'], Clean=True)
        t.is_dirty = False
        return t

    def with_parallel_children(self):
        if not self.has_parallel_children:
            self.has_parallel_children = True
            self.is_dirty = True
        return self

    def without_parallel_children(self):
        if self.has_parallel_children:
            self.has_parallel_children = False
            self.is_dirty = True
        return self

    def with_tracker(self, t):
        """Attach *t* as a child, sharing this tracker's DB connection."""
        t.db_conn = self.db_conn
        t.parent = self
        self.children.append(t)
        self.is_dirty = True
        return self

    def with_child(self, c):
        """Attach *c* as a child (no-op if already attached)."""
        c.parent = self
        if self.children and c in self.children:
            return self
        self.is_dirty = True
        self.children.append(c)
        return self

    def with_estimated_seconds(self, e, clean=False):
        if not self.estimated_seconds == e:
            self.estimated_seconds = e
            if not clean:
                # BUG FIX: was `self.dirty`; the persistence flag is is_dirty.
                self.is_dirty = True
        return self

    def with_start_time(self, s, clean=False):
        if not self.start_time == s:
            self.start_time = s
            self.state.start_time = s
            if not clean:
                # BUG FIX: was `self.dirty`; the persistence flag is is_dirty.
                self.is_dirty = True
        return self

    def with_finish_time(self, f, clean=False):
        if not self.finish_time == f:
            self.finish_time = f
            self.state.finish_time = f
            if not clean:
                # BUG FIX: was `self.dirty`; the persistence flag is is_dirty.
                self.is_dirty = True
        return self

    def with_last_update(self, d):
        self.is_dirty = not self.last_update == d
        self.last_update = d
        return self

    def with_autosave(self):
        if self.autosave:
            return self
        self.is_dirty = True
        self.autosave = True
        # BUG FIX: returns self like the other fluent with_* methods
        # (it previously fell through and returned None).
        return self

    def with_source(self, s, clean=False):
        if not self.source == s:
            self.source = s
            if not clean:
                # BUG FIX: was `self.dirty`; the persistence flag is is_dirty.
                self.is_dirty = True
        return self

    def with_friendly_id(self, f, clean=False):
        if not self.friendly_id == f:
            self.friendly_id = f
            if not clean:
                # BUG FIX: was `self.dirty`; the persistence flag is is_dirty.
                self.is_dirty = True
        return self

    def with_metric(self, **kwargs):
        """Attach a CloudWatch metric (Namespace/Metric); Clean skips dirtying."""
        ns = kwargs.get('Namespace')
        m = kwargs.get('Metric')
        clean = kwargs.get('Clean', False)
        if self.metric_namespace == ns and self.metric_name == m:
            return self
        self.metric_name = m
        self.metric_namespace = ns
        self.metric = FluentMetric().with_namespace(self.metric_namespace)
        if not clean:
            # BUG FIX: was `self.dirty`; the persistence flag is is_dirty.
            self.is_dirty = True
        return self

    def get_pct(self, m):
        """Fraction (2 d.p.) of all descendants represented by count *m*."""
        t = self.all_children_count
        if m == 0 or t == 0:
            return 0
        # float() keeps the division true under Python 2 as well.
        return float("{0:.2f}".format(float(m) / t))

    @property
    def not_started_pct(self):
        return self.get_pct(self.not_started_count)

    @property
    def in_progress_pct(self):
        return self.get_pct(self.in_progress_count)

    @property
    def canceled_pct(self):
        return self.get_pct(self.canceled_count)

    @property
    def succeeded_pct(self):
        return self.get_pct(self.succeeded_count)

    @property
    def failed_pct(self):
        return self.get_pct(self.failed_count)

    @property
    def done_pct(self):
        return self.get_pct(self.done_count)

    @property
    def paused_pct(self):
        return self.get_pct(self.paused_count)

    @property
    def not_started_count(self):
        return len(self.not_started)

    @property
    def in_progress_count(self):
        return len(self.in_progress)

    @property
    def canceled_count(self):
        return len(self.canceled)

    @property
    def succeeded_count(self):
        return len(self.succeeded)

    @property
    def failed_count(self):
        return len(self.failed)

    @property
    def done_count(self):
        return len(self.done)

    @property
    def paused_count(self):
        return len(self.paused)

    @property
    def not_started(self):
        return self.get_children_by_status(['Not started'])

    @property
    def in_progress(self):
        return self.get_children_by_status(['In Progress'])

    @property
    def canceled(self):
        return self.get_children_by_status(['Canceled'])

    @property
    def succeeded(self):
        return self.get_children_by_status(['Succeeded'])

    @property
    def failed(self):
        return self.get_children_by_status(['Failed'])

    @property
    def done(self):
        return self.get_children_by_status(['Succeeded', 'Canceled', 'Failed'])

    @property
    def not_done(self):
        return self.get_children_by_status(['In Progress', 'Paused'])

    @property
    def paused(self):
        return self.get_children_by_status(['Paused'])

    @property
    def all_children(self):
        return self.get_children_by_status([])

    @property
    def all_children_count(self):
        return len(self.all_children)

    def find_id(self, f):
        """Depth-first search of the subtree for a tracker with id == f."""
        found = None
        for c in self.children:
            if c.id == f:
                found = c
            else:
                found = c.find_id(f)
            if found:
                break
        return found

    def find_friendly_id(self, f):
        """Depth-first search of the subtree for friendly_id == f."""
        found = None
        for c in self.children:
            if c.friendly_id == f:
                found = c
            else:
                found = c.find_friendly_id(f)
            if found:
                break
        return found

    def log_done(self):
        """Emit completion metrics (duration + per-status count) if configured."""
        # NOTE(review): `has_metric` is not defined in this class; assumed
        # subclass-provided -- confirm.
        if not self.has_metric:
            logging.debug('No metric defined for {}'.format(self.id))
            return self
        try:
            self.metric.seconds(MetricName=self.metric_name,
                                Value=self.elapsed_time_in_seconds)
            self.metric.count(
                MetricName="{}/{}".format(self.metric_name, self.status))
        except Exception as e:
            # BUG FIX: logging.warn is a deprecated alias of warning.
            logging.warning('Error logging done metric: {}\n{}:{}'.format(
                str(e), self.metric_name, self.elapsed_time_in_seconds))
        return self

    def mark_done(self, status, m=None):
        """Finalize the tracker with *status* and log its metric."""
        if self.is_done:
            # BUG FIX: was '{}'.self.id, which raised AttributeError
            # instead of formatting the id.
            logging.warning('Already done: {}'.format(self.id))
            return self
        if m:
            self.with_status_msg(m)
        self.with_finish_time(arrow.utcnow())
        self.is_done = True
        self.is_in_progress = False
        self.status = status
        self.is_dirty = True
        self.log_done()
        return self

    def succeed(self, **kwargs):
        """Mark the tracker as Succeeded."""
        # BUG FIX: the comparison used the misspelling 'Succeeeded', so the
        # already-done short-circuit could never fire; '{}'.self.id also
        # raised AttributeError when logging.
        if self.status == 'Succeeded' and self.is_done and \
                not self.is_in_progress:
            logging.warning('Already succeeded {}'.format(self.id))
            return self
        m = kwargs.get('Message', None)
        self.mark_done('Succeeded', m)
        return self

    def cancel(self, **kwargs):
        """Mark the tracker as Canceled."""
        if self.status == 'Canceled' and self.is_done and \
                not self.is_in_progress:
            # BUG FIX: was '{}'.self.id (AttributeError).
            logging.warning('Already canceled: {}'.format(self.id))
            return self
        m = kwargs.get('Message', None)
        self.mark_done('Canceled', m)
        return self

    def fail(self, **kwargs):
        """Mark the tracker as Failed."""
        if self.status == 'Failed' and self.is_done and \
                not self.is_in_progress:
            # BUG FIX: was '{}'.self.id (AttributeError).
            logging.warning('Already failed: {}'.format(self.id))
            return self
        m = kwargs.get('Message', None)
        self.mark_done('Failed', m)
        return self
def test_adding_dimension_adds_dimension():
    """with_dimension() registers the dimension under its name."""
    metric = FluentMetric()
    metric.with_dimension('test_name', 'test_value')
    assert metric.does_dimension_exist('test_name')
def test_setting_namespace_sets_namespace():
    """with_namespace() stores the namespace on the metric."""
    metric = FluentMetric()
    metric.with_namespace('test_namespace')
    assert metric.namespace == 'test_namespace'
def test_can_set_resolution():
    """with_storage_resolution() accepts high-resolution (1s) storage."""
    metric = FluentMetric().with_namespace('Performance')
    metric = metric.with_storage_resolution(1)
    assert metric.storage_resolution == 1
def test_can_log_count(fm_log):
    """count() reaches the underlying log call (fm_log is a mock fixture)."""
    metric = FluentMetric().with_namespace('Performance')
    metric.count(MetricName='test', Count=2)
    fm_log.assert_called()
def test_can_disable_stream_id():
    """UseStreamId=False suppresses the automatic stream-id dimension."""
    metric = FluentMetric(UseStreamId=False).with_namespace('Performance')
    assert not metric.dimensions
class APIMonitor:
    """Request and latency monitoring, backed by Prometheus or CloudWatch.

    Enabled via api_monitor=1 in the environment; api_monitor_prometheus=1
    selects the Prometheus backend, otherwise counters/latencies/metrics are
    buffered locally and periodically flushed to CloudWatch via FluentMetric.
    """

    def __init__(self):
        self.enabled = ('api_monitor' in os.environ
                        and int(os.environ['api_monitor']) == 1)
        self.use_prometheus = ('api_monitor_prometheus' in os.environ
                               and int(os.environ['api_monitor_prometheus']) == 1)
        # BUG FIX: initialize unconditionally; stop_monitor() previously
        # raised AttributeError whenever start_monitor() had not scheduled
        # a flush (disabled monitoring or Prometheus backend).
        self._push_handle = None
        if self.enabled:
            # BUG FIX: the fallback was 'trews;' -- the trailing semicolon
            # was a typo that leaked into the CloudWatch 'API' dimension.
            self.monitor_target = os.environ['db_name'].replace("_", "-") \
                if 'db_name' in os.environ else 'trews'
            if self.use_prometheus:
                self.prometheus = PrometheusMonitor()
            else:
                self._push_period_secs = int(os.environ['api_monitor_cw_push_period']) \
                    if 'api_monitor_cw_push_period' in os.environ else 60
                # k8s pods have their pod name set as the hostname.
                stream_id = os.environ.get('HOSTNAME', 'api-testing')
                self.cw_metrics = FluentMetric().with_namespace(
                    'OpsDX').with_stream_id(stream_id)
                # Latencies and request counters.
                self._counters = {}
                self._latencies = {}
                # General-purpose metrics.
                self._metrics = {}
                self._metric_specs = {}

    # Asynchronous stats uploads.
    async def start_monitor(self, app):
        """Schedule the periodic CloudWatch flush (no-op for Prometheus)."""
        if self.enabled and not self.use_prometheus:
            loop = asyncio.get_event_loop()
            self._push_handle = loop.call_later(self._push_period_secs,
                                                self._cw_flush, loop)
        return True

    async def stop_monitor(self, app):
        """Cancel any scheduled flush; safe when none was scheduled."""
        if self._push_handle:
            self._push_handle.cancel()
            self._push_handle = None
        return True

    # Returns a context manager for timing in a 'with' block.
    def time(self, name):
        if not self.enabled:
            return _NullContextManager()
        if self.use_prometheus:
            return self.prometheus.trews_api_request_latency.labels(
                self.monitor_target, name).time()
        return _CloudwatchTimer(self, name)

    # Track a request served/processed. Internally increments a counter.
    def request(self, name, value=1):
        if self.enabled:
            if self.use_prometheus:
                self.prometheus.trews_api_request_counts.labels(
                    self.monitor_target, name).inc()
            else:
                self._request(name, value)

    # Helpers.
    def _latency(self, name):
        """Record the elapsed time (ms) of the named FluentMetric timer."""
        if self.enabled:
            timer = self.cw_metrics.get_timer(name)
            duration_ms = timer.elapsed_in_ms()
            self._latencies.setdefault(name, []).append(duration_ms)

    def _request(self, name, value):
        """Accumulate a request count for route *name*."""
        if self.enabled:
            self._counters[name] = self._counters.get(name, 0) + value

    # Metrics.
    # TODO: Prometheus implementation
    def register_metric(self, metric_name, unit, dimensions):
        """Declare a unit and dimension list used when the metric is flushed."""
        if self.enabled and not self.use_prometheus:
            self._metric_specs[metric_name] = {
                'unit': unit,
                'dimensions': dimensions
            }

    def add_metric(self, name, value=1):
        """Accumulate a scalar metric (flushed as a single value)."""
        if self.enabled and not self.use_prometheus:
            self._metrics[name] = self._metrics.get(name, 0) + value

    def append_metric(self, name, value=1):
        """Append to a list metric (flushed as Count/Sum/Avg)."""
        if self.enabled and not self.use_prometheus:
            self._metrics.setdefault(name, []).append(value)

    # Metrics upload.
    def _cw_flush(self, loop):
        """Push buffered counters/latencies/metrics to CloudWatch, then reschedule."""
        if not self.enabled:
            return
        try:
            logging.info('Flushing CW metrics... %s %s' % (len(self._latencies),
                                                           len(self._counters)))
            self.cw_metrics.with_dimension('API', self.monitor_target)
            for k, v in self._counters.items():
                logging.info('Requests %s %s' % (k, str(v)))
                self.cw_metrics.with_dimension('Route', k)
                self.cw_metrics.count(MetricName='Requests', Count=v)
            for k, v in self._latencies.items():
                self.cw_metrics.with_dimension('Route', k)
                l_cnt = float(len(v))
                # IDIOM FIX: sum() replaces functools.reduce, which raised
                # TypeError on an empty list.
                l_sum = float(sum(v))
                l_avg = l_sum / l_cnt if l_cnt > 0 else 0.0
                logging.info('Latency %s %s %s %s' % (k, l_cnt, l_sum, l_avg))
                self.cw_metrics.count(MetricName='LatencyCount', Count=l_cnt) \
                    .log(MetricName='LatencySum', Value=l_sum, Unit='Milliseconds') \
                    .log(MetricName='LatencyAvg', Value=l_avg, Unit='Milliseconds')
            self.cw_metrics.without_dimension('Route')
            self.cw_metrics.without_dimension('API')
            for k, v in self._metrics.items():
                spec = self._metric_specs.get(k, {})
                unit = spec.get('unit', 'None')
                dimensions = spec.get('dimensions', [])
                self.cw_metrics.push_dimensions()
                for dn, dv in dimensions:
                    self.cw_metrics.with_dimension(dn, dv)
                if isinstance(v, Number):
                    logging.info('NMetric %s %s' % (k, v))
                    self.cw_metrics.log(MetricName=k, Value=v, Unit=unit)
                elif isinstance(v, list):
                    v_cnt = float(len(v))
                    v_sum = float(sum(v))
                    v_avg = v_sum / v_cnt if v_cnt > 0 else 0.0
                    logging.info('LMetric %s %s %s %s' %
                                 (k, v_cnt, v_sum, v_avg))
                    self.cw_metrics.count(MetricName='%sCount' % k, Count=v_cnt) \
                        .log(MetricName='%sSum' % k, Value=v_sum, Unit=unit) \
                        .log(MetricName='%sAvg' % k, Value=v_avg, Unit=unit)
                self.cw_metrics.pop_dimensions()
            self._metrics = {}
            self._counters = {}
            self._latencies = {}
        except Exception as e:
            logging.error(str(e))
            traceback.print_exc()
        finally:
            # ROBUSTNESS FIX: reschedule even after a failed flush; previously
            # an exception permanently stopped all future flushes.
            self._push_handle = loop.call_later(self._push_period_secs,
                                                self._cw_flush, loop)