def update_indicator_values(indicator: Indicator, start: int, end=None, **kwargs):
    """
    Query SLI values for an indicator and store them as indicator values.

    :param indicator: Indicator whose ``name`` and ``source`` are handed to ``query_sli``.
    :param start: Start of the queried range, forwarded to ``query_sli``.
    :param end: Optional end of the queried range, forwarded to ``query_sli``.
    :return: Number of entries in the query result.
    """
    parent_span = extract_span_from_kwargs(**kwargs)
    session = db.session

    result = query_sli(indicator.name, indicator.source, start, end)

    if result:
        # The inserts get their own child span so they show up separately in the trace.
        insert_span = opentracing.tracer.start_span(
            operation_name='insert_indicator_values', child_of=parent_span)
        insert_span.set_tag('indicator', indicator.name).set_tag('indicator_id', indicator.id)
        insert_span.log_kv({'result_count': len(result)})

        with insert_span:
            for minute, raw in result.items():
                # Non-zero values are pushed away from zero to at least MIN_VAL in magnitude;
                # an exact zero is stored unchanged.
                if raw > 0:
                    value = max(raw, MIN_VAL)
                elif raw < 0:
                    value = min(raw, MIN_VAL * -1)
                else:
                    value = raw

                insert_indicator_value(
                    session,
                    IndicatorValue(timestamp=minute, value=value, indicator_id=indicator.id))

        session.commit()

    return len(result)
def update_dashboard(self, dashboard: dict, **kwargs) -> dict:
    """
    Create or update dashboard.

    A dashboard with a truthy ``id`` is posted to the endpoint of that ID (update);
    any other dashboard is posted to the dashboards endpoint (create).

    :param dashboard: ZMON dashboard dict.
    :type dashboard: dict

    :return: Dashboard dict.
    :rtype: dict
    """
    span = extract_span_from_kwargs(**kwargs)

    dashboard_id = dashboard.get('id')

    if dashboard_id:
        logger.debug('Updating dashboard with ID: {} ...'.format(dashboard_id))
        span.set_tag('dashboard_id', dashboard_id)
        url = self.endpoint(DASHBOARD, dashboard_id)
    else:
        # No usable ID: treat it as a new dashboard.
        logger.debug('Adding new dashboard ...')
        url = self.endpoint(DASHBOARD)

    resp = self.session.post(url, json=dashboard, timeout=self._timeout)
    resp.raise_for_status()

    return self.json(resp)
def create_alert_definition(self, alert_definition: dict, **kwargs) -> dict:
    """
    Create new alert definition.

    Attributes ``last_modified_by`` and ``check_definition_id`` are required.
    If ``status`` is not set, then it will be set to ``ACTIVE`` (the passed dict
    is modified in place).

    :param alert_definition: ZMON alert definition dict.
    :type alert_definition: dict

    :return: Alert definition dict.
    :rtype: dict

    :raises ZmonArgumentError: If a required attribute is missing.
    """
    current_span = extract_span_from_kwargs(**kwargs)

    if 'last_modified_by' not in alert_definition:
        message = 'Alert definition must have "last_modified_by"'
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': message})
        raise ZmonArgumentError(message)

    if 'status' not in alert_definition:
        alert_definition['status'] = 'ACTIVE'

    if 'check_definition_id' not in alert_definition:
        # Use one message for both the span log and the exception so the trace
        # names the attribute that is actually missing.
        message = 'Alert definition must have "check_definition_id"'
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': message})
        raise ZmonArgumentError(message)

    current_span.set_tag('check_id', alert_definition['check_definition_id'])

    resp = self.session.post(self.endpoint(ALERT_DEF), json=alert_definition, timeout=self._timeout)

    return self.json(resp)
def get_all_stack_names(cf, **kwargs):
    """
    Collect the names of the CloudFormation stacks listed with ``STACK_STATUS_FILTER``.

    Errors are not raised to the caller: they are recorded on the span and logged,
    and the names gathered up to that point are returned.

    :param cf: Client object providing ``get_paginator('list_stacks')``.
    :return: List of stack names.
    """
    stack_names = []
    span = extract_span_from_kwargs(**kwargs)
    paginator = cf.get_paginator('list_stacks')

    def _list_pages():
        return paginator.paginate(StackStatusFilter=STACK_STATUS_FILTER)

    try:
        for page in call_and_retry(_list_pages):
            stack_names.extend(summary['StackName'] for summary in page['StackSummaries'])

        span.log_kv({"num_stacks": len(stack_names)})
    except Exception as e:
        access_denied = isinstance(e, ClientError) and e.response['Error']['Code'] == 'AccessDenied'

        if access_denied:
            # Missing permission is reported as a warning, not as an error on the span.
            msg = 'Access to AWS CloudFormation denied. You may need the cloudformation:ListStacks permission'
            logger.warning(msg)
            span.log_kv({'message': msg})
            span.set_tag('access_denied', True)
        else:
            span.set_tag('error', True)
            span.log_kv({'exception': traceback.format_exc()})
            logger.exception('Failed to retrieve stack names')

    return stack_names
def get_alert_data(self, alert_id: int, **kwargs) -> dict:
    """
    Retrieve alert data.

    The response is a ``dict`` keyed by entity ID, with the check return value
    as the value.

    :param alert_id: ZMON alert ID.
    :type alert_id: int

    :return: Alert data dict.
    :rtype: dict

    Example:

    .. code-block:: json

        {
            "entity-id-1": 122,
            "entity-id-2": 0,
            "entity-id-3": 100
        }
    """
    span = extract_span_from_kwargs(**kwargs)
    span.set_tag('alert_id', str(alert_id))

    url = self.endpoint(ALERT_DATA, alert_id, 'all-entities')
    resp = self.session.get(url, timeout=self._timeout)

    return self.json(resp)
def get_cluster_ingresses(kube_client, cluster_id, alias, environment, region, infrastructure_account,
                          namespace='default', **kwargs) -> list:
    """
    Build one entity dict per ingress returned by ``get_all`` for the namespace.

    :param kube_client: Client whose ``get_ingresses`` is handed to ``get_all``.
    :param namespace: Namespace passed to ``get_all``. Default is ``'default'``.
    :return: List of ingress entity dicts.
    """
    span = extract_span_from_kwargs(**kwargs)

    def _to_entity(ingress):
        manifest = ingress.obj

        entity = {
            'id': 'ingress-{}-{}[{}]'.format(ingress.name, ingress.namespace, cluster_id),
            'type': INGRESS_TYPE,
            'kube_cluster': cluster_id,
            'alias': alias,
            'environment': environment,
            'created_by': AGENT_TYPE,
            'infrastructure_account': infrastructure_account,
            'region': region,
            'ingress_name': ingress.name,
            'ingress_namespace': ingress.namespace,
            'ingress_rules': manifest['spec'].get('rules', []),
        }

        # Values derived from the labels are merged last, so they win on key clashes.
        entity.update(entity_labels(manifest, 'labels'))
        return entity

    ingresses = get_all(kube_client, kube_client.get_ingresses, namespace, span=span)

    return [_to_entity(ingress) for ingress in ingresses]
def get_cluster_namespaces(kube_client, cluster_id, alias, environment, region, infrastructure_account,
                           namespace=None, **kwargs) -> list:
    """
    Build one entity dict per namespace returned by ``kube_client.get_namespaces()``.

    :param namespace: When truthy, only the namespace with this name is reported.
    :return: List of namespace entity dicts.
    """
    current_span = extract_span_from_kwargs(**kwargs)  # noqa

    entities = []

    for ns in kube_client.get_namespaces():
        manifest = ns.obj

        if not namespace or namespace == ns.name:
            entity = {
                'id': 'namespace-{}[{}]'.format(ns.name, cluster_id),
                'type': NAMESPACE_TYPE,
                'kube_cluster': cluster_id,
                'alias': alias,
                'environment': environment,
                'created_by': AGENT_TYPE,
                'infrastructure_account': infrastructure_account,
                'region': region,
                'namespace_name': ns.name,
            }

            # Values derived from labels/annotations are merged last, so they win on key clashes.
            entity.update(entity_labels(manifest, 'labels', 'annotations'))
            entities.append(entity)

    return entities
def update_grafana_dashboard(self, grafana_dashboard: dict, **kwargs) -> dict:
    """
    Update existing Grafana dashboard.

    Attributes ``uid`` and ``title`` of the nested ``dashboard`` dict are required.

    :param grafana_dashboard: Grafana dashboard dict.
    :type grafana_dashboard: dict

    :return: Grafana dashboard dict.
    :rtype: dict

    :raises ZmonArgumentError: If ``uid`` or ``title`` is missing.
    """
    span = extract_span_from_kwargs(**kwargs)

    dashboard = grafana_dashboard['dashboard']

    if 'uid' not in dashboard:
        span.set_tag('error', True)
        span.log_kv({'exception': 'Grafana dashboard must have "uid". Use Grafana6 dashboard format.'})
        raise ZmonArgumentError('Grafana dashboard must have "uid". Hint: Use Grafana6 dashboard format.')

    if 'title' not in dashboard:
        span.set_tag('error', True)
        span.log_kv({'exception': 'Grafana dashboard must have "title"'})
        raise ZmonArgumentError('Grafana dashboard must have "title"')

    span.set_tag('grafana_dashboard_uid', dashboard['uid'])

    if dashboard.get('id') is not None:
        span.set_tag('grafana_dashboard_id', dashboard['id'])

    # NOTE(review): the payload is serialized with json.dumps() and then passed as ``json=``,
    # so the request body is a JSON-encoded string rather than a JSON object. Confirm the
    # server expects this before changing it.
    payload = json.dumps(grafana_dashboard)
    resp = self.session.post(self.endpoint(GRAFANA), json=payload, timeout=self._timeout)

    return self.json(resp)
def remove_missing_entities(existing_ids, current_ids, zmon_client, dry_run=False, **kwargs):
    """
    Delete from ZMON every entity ID that exists there but is no longer current.

    :param existing_ids: Iterable of entity IDs currently known to ZMON.
    :param current_ids: Iterable of entity IDs that should be kept.
    :param zmon_client: Client providing ``delete_entity(entity_id)``.
    :param dry_run: When true, nothing is deleted; only the candidate IDs are computed.
    :return: Tuple ``(to_be_removed_ids, error_count)`` where ``error_count`` is the number
        of deletions that failed, either by a falsy result or by an exception.
    """
    current_span = extract_span_from_kwargs(**kwargs)

    to_be_removed_ids = list(set(existing_ids) - set(current_ids))

    error_count = 0

    if not dry_run:
        logger.info('Removing {} entities from ZMON'.format(len(to_be_removed_ids)))

        for entity_id in to_be_removed_ids:
            logger.info('Removing entity with id: {}'.format(entity_id))
            try:
                deleted = zmon_client.delete_entity(entity_id)
                if not deleted:
                    current_span.set_tag('error', True)
                    logger.info('Failed to delete entity!')
                    error_count += 1
            except Exception:
                # A raised deletion is a failed deletion too: count it and log it,
                # then keep going with the remaining entities.
                current_span.set_tag('error', True)
                current_span.log_kv({'exception': traceback.format_exc()})
                logger.exception('Exception while deleting entity: {}'.format(entity_id))
                error_count += 1

    return to_be_removed_ids, error_count
def add_entity(self, entity: dict, **kwargs) -> requests.Response:
    """
    Create or update an entity on ZMON.

    .. note::

        ZMON PUT entity API doesn't return JSON response.

    :param entity: Entity dict; must contain ``id`` and ``type``.
    :type entity: dict

    :return: Response object.
    :rtype: :class:`requests.Response`

    :raises ZmonArgumentError: If ``id`` or ``type`` is missing, or the ID is not valid.
    """
    if not ('id' in entity and 'type' in entity):
        raise ZmonArgumentError('Entity "id" and "type" are required.')

    entity_id = entity['id']

    if not self.is_valid_entity_id(entity_id):
        raise ZmonArgumentError('Invalid entity ID.')

    logger.debug('Adding new entity: {} ...'.format(entity_id))

    span = extract_span_from_kwargs(**kwargs)
    span.set_tag('entity_id', entity_id)

    payload = json.dumps(entity, cls=JSONDateEncoder)
    url = self.endpoint(ENTITIES, trailing_slash=False)

    resp = self.session.put(url, data=payload, timeout=self._timeout)
    resp.raise_for_status()

    return resp
def update_check_definition(self, check_definition, skip_validation=False, **kwargs) -> dict:
    """
    Update existing check definition.

    Attribute ``owning_team`` is required. If ``status`` is not set, then it will be set
    to ``ACTIVE`` (the passed dict is modified in place).

    :param check_definition: ZMON check definition dict.
    :type check_definition: dict

    :param skip_validation: Skip validation of the check command syntax.
    :type skip_validation: bool

    :return: Check definition dict.
    :rtype: dict

    :raises ZmonArgumentError: If ``owning_team`` is missing.
    """
    span = extract_span_from_kwargs(**kwargs)

    if 'owning_team' not in check_definition:
        span.set_tag('error', True)
        span.log_kv({'exception': 'Check definition must have "owning_team"'})
        raise ZmonArgumentError('Check definition must have "owning_team"')

    check_definition.setdefault('status', 'ACTIVE')

    if not skip_validation:
        try:
            self.validate_check_command(check_definition['command'])
        except Exception:
            # Record the validation failure on the span, then let it propagate unchanged.
            span.set_tag('error', True)
            span.log_kv({'exception': traceback.format_exc()})
            raise

    url = self.endpoint(CHECK_DEF)
    resp = self.session.post(url, json=check_definition, timeout=self._timeout)

    return self.json(resp)
def get_entities(self, query=None, **kwargs) -> list:
    """
    Get ZMON entities, with optional filtering.

    :param query: Entity filtering query. Default is ``None``. Example query ``{'type': 'instance'}``
        to return all entities of type: ``instance``.
    :type query: dict

    :return: List of entities.
    :rtype: list
    """
    query_str = json.dumps(query) if query else ''
    logger.debug('Retrieving entities with query: {} ...'.format(query_str))

    current_span = extract_span_from_kwargs(**kwargs)
    # log_kv takes a mapping of key/value pairs; the previous ``{'query', query_str}``
    # was a set literal.
    current_span.log_kv({'query': query_str})

    # No query means no ``query`` request parameter at all.
    params = {'query': query_str} if query else None

    resp = self.session.get(self.endpoint(ENTITIES), params=params, timeout=self._timeout)

    return self.json(resp)
def add_new_entities(all_current_entities, existing_entities, zmon_client, dry_run=False, **kwargs):
    """
    Add to ZMON every current entity that ``new_or_updated_entity`` reports as new or updated.

    :param all_current_entities: Iterable of entity dicts describing the current state.
    :param existing_entities: Iterable of entity dicts already in ZMON; indexed by ``id``.
    :param zmon_client: Client providing ``add_entity(entity)``.
    :param dry_run: When true, nothing is sent; only the candidates are computed.
    :return: Tuple ``(new_entities, error_count)``.
    """
    span = extract_span_from_kwargs(**kwargs)

    existing_by_id = {existing['id']: existing for existing in existing_entities}

    new_entities = [
        candidate for candidate in all_current_entities
        if new_or_updated_entity(candidate, existing_by_id)
    ]

    error_count = 0

    if dry_run:
        return new_entities, error_count

    logger.info('Found {} new or updated entities to be added in ZMON'.format(len(new_entities)))

    for entity in new_entities:
        logger.info('Adding new or updated {} entity with ID: {}'.format(entity['type'], entity['id']))

        try:
            response = zmon_client.add_entity(entity)
            response.raise_for_status()
        except Exception:
            # One failing entity must not stop the rest: record it and continue.
            span.set_tag('error', True)
            logger.exception('Failed to add entity!')
            span.log_kv({'exception': traceback.format_exc(), "entity": entity})
            error_count += 1

    return new_entities, error_count
def list_postgres_databases(*args, **kwargs):
    """
    List the database names of a PostgreSQL server, excluding ``postgres``,
    ``template0`` and ``template1``.

    Positional and keyword arguments (after ``clean_opentracing_span`` has processed the
    keyword arguments) are passed to ``psycopg2.connect``; ``connect_timeout`` is always
    set to ``POSTGRESQL_CONNECT_TIMEOUT``.

    :return: List of database names, or an empty list if anything fails.
    """
    query = """
        SELECT datname
        FROM pg_database
        WHERE datname NOT IN('postgres', 'template0', 'template1')
    """

    # Resolved before the ``try`` so the error handler can always rely on it.
    current_span = extract_span_from_kwargs(**kwargs)

    conn = None
    try:
        kwargs = clean_opentracing_span(**kwargs)

        current_span.set_tag(ot_tags.PEER_ADDRESS,
                             'psql://{}:{}'.format(kwargs.get('host'), kwargs.get('port')))
        current_span.set_tag(ot_tags.DATABASE_INSTANCE, kwargs.get('dbname'))
        current_span.set_tag(ot_tags.DATABASE_STATEMENT, query)

        kwargs.update({'connect_timeout': POSTGRESQL_CONNECT_TIMEOUT})

        conn = psycopg2.connect(*args, **kwargs)

        cur = conn.cursor()
        cur.execute(query)

        return [row[0] for row in cur.fetchall()]
    except Exception:
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Failed to list DBs!')
        return []
    finally:
        # Always release the connection; a failing close must not change the result.
        if conn is not None:
            try:
                conn.close()
            except Exception:
                logger.exception('Failed to close DB connection!')
def list_postgres_databases(*args, **kwargs):
    """
    List the database names of a PostgreSQL server, excluding ``postgres``,
    ``template0`` and ``template1``.

    Positional and keyword arguments (after ``clean_opentracing_span`` has processed the
    keyword arguments) are passed to ``psycopg2.connect``; ``connect_timeout`` is always
    set to ``POSTGRESQL_CONNECT_TIMEOUT``.

    :return: List of database names, or an empty list if anything fails.
    """
    query = """
        SELECT datname
        FROM pg_database
        WHERE datname NOT IN('postgres', 'template0', 'template1')
    """

    # Resolved before the ``try`` so the error handler can always rely on it.
    current_span = extract_span_from_kwargs(**kwargs)

    conn = None
    try:
        kwargs = clean_opentracing_span(**kwargs)

        current_span.set_tag(
            ot_tags.PEER_ADDRESS,
            'psql://{}:{}'.format(kwargs.get('host'), kwargs.get('port')))
        current_span.set_tag(ot_tags.DATABASE_INSTANCE, kwargs.get('dbname'))
        current_span.set_tag(ot_tags.DATABASE_STATEMENT, query)

        kwargs.update({'connect_timeout': POSTGRESQL_CONNECT_TIMEOUT})

        conn = psycopg2.connect(*args, **kwargs)

        cur = conn.cursor()
        cur.execute(query)

        return [row[0] for row in cur.fetchall()]
    except Exception:
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Failed to list DBs!')
        return []
    finally:
        # Always release the connection; a failing close must not change the result.
        if conn is not None:
            try:
                conn.close()
            except Exception:
                logger.exception('Failed to close DB connection!')
def get_auto_scaling_groups(region, acc, **kwargs):
    """
    Build one ``asg`` entity dict per Auto Scaling group in the region.

    Each entity lists, under ``instances``, the ``InService`` instances of the group
    that have a private IP address. A failure while describing the instances of one
    group is recorded on the span and logged; the group is still reported.

    :param region: AWS region name used for the ``autoscaling`` and ``ec2`` clients.
    :param acc: Account identifier placed in the entity ID and ``infrastructure_account``.
    :return: List of ASG entity dicts.
    """
    groups = []

    as_client = boto3.client('autoscaling', region_name=region)
    ec2_client = boto3.client('ec2', region_name=region)

    asg_paginator = as_client.get_paginator('describe_auto_scaling_groups')

    def _fetch_asgs():
        pages = asg_paginator.paginate(PaginationConfig={'MaxItems': MAX_PAGE})
        return pages.build_full_result()['AutoScalingGroups']

    for asg in call_and_retry(_fetch_asgs):
        sg = {
            'id': entity_id('asg-{}[{}:{}]'.format(asg['AutoScalingGroupName'], acc, region)),
            'type': 'asg',
            'infrastructure_account': acc,
            'region': region,
            'created_by': 'agent',
            'name': asg['AutoScalingGroupName'],
            'availability_zones': asg['AvailabilityZones'],
            'desired_capacity': asg['DesiredCapacity'],
            'max_size': asg['MaxSize'],
            'min_size': asg['MinSize'],
            'created_time': asg['CreatedTime'].strftime('%Y-%m-%d %H:%M:%S.%f'),
        }

        assign_properties_from_tags(sg, asg.get('Tags', []))
        add_traffic_tags_to_entity(sg)

        sg['instances'] = []

        instance_ids = [
            member['InstanceId'] for member in asg['Instances']
            if member['LifecycleState'] == 'InService'
        ]

        # Avoid describing instances when there's nothing to filter
        # for: that would claim *every* instance in the account.
        if instance_ids:
            ec2_paginator = ec2_client.get_paginator('describe_instances')

            try:
                reservations = call_and_retry(
                    lambda: ec2_paginator.paginate(InstanceIds=instance_ids).build_full_result()['Reservations'])

                for reservation in reservations:
                    for inst in reservation['Instances']:
                        if 'PrivateIpAddress' not in inst:
                            continue

                        sg['instances'].append({
                            'aws_id': inst['InstanceId'],
                            'ip': inst['PrivateIpAddress'],
                        })
            except Exception:
                span = extract_span_from_kwargs(**kwargs)
                span.set_tag('error', True)
                span.log_kv({'exception': traceback.format_exc()})
                logger.exception('Failed in retrieving instances for ASG: {}'.format(sg['name']))

        groups.append(sg)

    return groups
def parent(**kwargs):
    """
    Assert that a span is present in ``kwargs`` exactly when ``pass_span`` is set,
    then call ``nested``.

    ``pass_span`` and ``nested`` are defined outside this function.
    """
    span_passed = is_span_in_kwargs(**kwargs)
    assert span_passed is pass_span

    if pass_span:
        span = extract_span_from_kwargs(**kwargs)
        assert span.operation_name == 'parent'

    nested()
def notify(cls, alert, *args, **kwargs):
    """
    Send a HipChat room notification for an alert.

    Recognised keyword arguments: ``room`` (required for sending), ``token``, ``repeat``,
    ``notify``, ``message_format`` (default ``'html'``), ``color``, ``message``, ``link``,
    ``link_text`` and ``zmon_host``.

    Failures while sending are recorded on the span and logged, not raised.

    :param alert: Alert dict; ``alert_def`` and ``entity`` are read from it.
    :return: The ``repeat`` keyword argument (default ``0``).
    """
    current_span = extract_span_from_kwargs(**kwargs)

    url = cls._config.get('notifications.hipchat.url')
    token = kwargs.get('token', cls._config.get('notifications.hipchat.token'))
    repeat = kwargs.get('repeat', 0)
    notify = kwargs.get('notify', False)
    alert_def = alert['alert_def']
    message_format = kwargs.get('message_format', 'html')

    current_span.set_tag('alert_id', alert_def['id'])

    entity = alert.get('entity')
    is_changed = alert.get('alert_changed', False)
    is_alert = alert.get('is_alert', False)

    current_span.set_tag('entity', entity['id'])
    current_span.set_tag('alert_changed', bool(is_changed))
    current_span.set_tag('is_alert', is_alert)

    current_span.log_kv({'room': kwargs.get('room')})

    # An ended alert is always green; otherwise the caller's colour (default red) is used.
    color = 'green' if alert and not alert.get('is_alert') else kwargs.get('color', 'red')

    message_text = cls._get_subject(alert, custom_message=kwargs.get('message'))

    if kwargs.get('link', False):
        zmon_host = kwargs.get('zmon_host', cls._config.get('zmon.host'))
        alert_id = alert['alert_def']['id']
        alert_url = urlparse.urljoin(zmon_host, '/#/alert-details/{}'.format(alert_id)) if zmon_host else ''
        link_text = kwargs.get('link_text', 'go to alert')
        if message_format == 'html':
            message_text += ' -- <a href="{}" target="_blank">{}</a>'.format(alert_url, link_text)
        else:
            message_text += ' -- {} - {}'.format(link_text, alert_url)

    message = {
        'message': message_text,
        'color': color,
        'notify': notify,
        'message_format': message_format
    }

    try:
        room_url = '{}/v2/room/{}/notification'.format(url, urllib.quote(kwargs['room']))

        # The auth token is a credential, so it is masked in the log line.
        logger.info('Sending to: ' + '{}?auth_token=***'.format(room_url) + ' ' + json.dumps(message))

        r = requests.post(
            room_url,
            json=message,
            params={'auth_token': token},
            headers={'Content-type': 'application/json'})
        r.raise_for_status()
    except Exception:
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Hipchat write failed!')

    return repeat
def notify(cls, alert, *args, **kwargs):
    """
    Send a HipChat room notification for an alert.

    Recognised keyword arguments: ``room`` (required for sending), ``token``, ``repeat``,
    ``notify``, ``color``, ``message``, ``link``, ``link_text`` and ``zmon_host``.

    Failures while sending are recorded on the span and logged, not raised.

    :param alert: Alert dict; ``alert_def`` and ``entity`` are read from it.
    :return: The ``repeat`` keyword argument (default ``0``).
    """
    current_span = extract_span_from_kwargs(**kwargs)

    url = cls._config.get('notifications.hipchat.url')
    token = kwargs.get('token', cls._config.get('notifications.hipchat.token'))
    repeat = kwargs.get('repeat', 0)
    notify = kwargs.get('notify', False)
    alert_def = alert['alert_def']

    current_span.set_tag('alert_id', alert_def['id'])

    entity = alert.get('entity')
    is_changed = alert.get('alert_changed', False)
    is_alert = alert.get('is_alert', False)

    current_span.set_tag('entity', entity['id'])
    current_span.set_tag('alert_changed', bool(is_changed))
    current_span.set_tag('is_alert', is_alert)

    current_span.log_kv({'room': kwargs.get('room')})

    # An ended alert is always green; otherwise the caller's colour (default red) is used.
    color = 'green' if alert and not alert.get('is_alert') else kwargs.get('color', 'red')

    message_text = cls._get_subject(alert, custom_message=kwargs.get('message'))

    if kwargs.get('link', False):
        zmon_host = kwargs.get('zmon_host', cls._config.get('zmon.host'))
        alert_id = alert['alert_def']['id']
        alert_url = urlparse.urljoin(zmon_host, '/#/alert-details/{}'.format(alert_id)) if zmon_host else ''
        link_text = kwargs.get('link_text', 'go to alert')
        message_text += ' -- <a href="{}" target="_blank">{}</a>'.format(alert_url, link_text)

    message = {'message': message_text, 'color': color, 'notify': notify}

    try:
        room_url = '{}/v2/room/{}/notification'.format(url, urllib.quote(kwargs['room']))

        # The auth token is a credential, so it is masked in the log line.
        logger.info('Sending to: ' + '{}?auth_token=***'.format(room_url) + ' ' + json.dumps(message))

        r = requests.post(
            room_url,
            json=message,
            params={'auth_token': token},
            headers={'Content-type': 'application/json'})
        r.raise_for_status()
    except Exception as e:
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': str(e)})
        logger.exception('Hipchat write failed!')

    return repeat
def notify(cls, alert, *args, **kwargs):
    """
    Send an alert to the notification service's Twilio endpoint.

    Recognised keyword arguments: ``repeat``, ``oauth2`` (default ``True``), ``key``,
    ``message``, ``team``, ``numbers`` and ``voice``.

    Nothing is sent when ``notifications.service.url`` is not configured. Failures while
    sending are recorded on the span and logged, not raised.

    :param alert: Alert dict; ``alert_def`` and ``entity`` are read from it.
    :return: The ``repeat`` keyword argument (default ``0``).
    """
    current_span = extract_span_from_kwargs(**kwargs)

    repeat = kwargs.get('repeat', 0)
    oauth2 = kwargs.get('oauth2', True)
    headers = {'Content-type': 'application/json'}
    timeout = 5

    alert_def = alert['alert_def']
    current_span.set_tag('alert_id', alert_def['id'])

    entity = alert.get('entity')
    is_changed = alert.get('alert_changed', False)
    is_alert = alert.get('is_alert', False)

    current_span.set_tag('entity', entity['id'])
    current_span.set_tag('alert_changed', bool(is_changed))
    current_span.set_tag('is_alert', is_alert)

    url = cls._config.get('notifications.service.url', None)

    if not url:
        current_span.set_tag('notification_invalid', True)
        current_span.log_kv({'reason': 'No notification service url set!'})
        logger.error('No notification service url set')
        return repeat

    url = url + '/api/v1/twilio'

    # Bearer token comes from ``tokens`` with OAuth2, otherwise from the ``key``
    # keyword argument or the configured service key.
    if oauth2:
        headers.update({'Authorization': 'Bearer {}'.format(tokens.get('uid'))})
    else:
        key = kwargs.get('key', cls._config.get('notifications.service.key'))
        headers.update({'Authorization': 'Bearer {}'.format(key)})

    headers['User-Agent'] = get_user_agent()

    data = {
        'message': kwargs.get('message', cls._get_subject(alert)),
        'escalation_team': kwargs.get('team', alert['alert_def'].get('team', '')),
        'numbers': kwargs.get('numbers', []),
        'voice': kwargs.get('voice', 'woman'),
        'alert_id': alert['alert_def']['id'],
        'entity_id': alert['entity']['id'],
        'event_type': 'ALERT_ENDED' if alert and not alert.get('is_alert') else 'ALERT_START',
        'alert_changed': alert.get('alert_changed', False),
    }

    try:
        logger.info('Sending HTTP POST request to {}'.format(url))
        r = requests.post(url, data=json.dumps(data, cls=JsonDataEncoder), headers=headers, timeout=timeout)

        r.raise_for_status()
    except Exception as e:
        # Mark the failure on the span as well, so it is visible in the trace
        # and not only in the log.
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': str(e)})
        logger.exception('Twilio Request failed!')

    return repeat
def save_object(self, obj: Target, **kwargs) -> Target:
    """
    Add ``obj`` to the DB session and commit.

    The object's ``objective_id`` and ``indicator_id`` are logged on the span first.

    :param obj: Target to store.
    :return: The same ``obj``.
    """
    span = extract_span_from_kwargs(**kwargs)

    for field in ('objective_id', 'indicator_id'):
        span.log_kv({field: getattr(obj, field)})

    session = db.session
    session.add(obj)
    db.session.commit()

    return obj
def notify(cls, alert, *args, **kwargs):
    """
    Send the alert subject as an SMS to every phone number resolved from ``args``.

    Recognised keyword arguments: ``repeat`` and ``message``. Sending is skipped when the
    ``notifications.sms.on`` setting is falsy. Failures while sending are recorded on the
    span and logged, not raised.

    :param alert: Alert dict; ``alert_def`` and ``entity`` are read from it.
    :return: The ``repeat`` keyword argument (default ``0``).
    """
    current_span = extract_span_from_kwargs(**kwargs)

    alert_def = alert['alert_def']
    current_span.set_tag('alert_id', alert_def['id'])

    entity = alert.get('entity')
    is_changed = alert.get('alert_changed', False)
    is_alert = alert.get('is_alert', False)

    current_span.set_tag('entity', entity['id'])
    current_span.set_tag('alert_changed', bool(is_changed))
    current_span.set_tag('is_alert', is_alert)

    provider_url = cls._config.get('notifications.sms.provider_url', SMS_PROVIDER_URL)
    phone_numbers = BaseNotification.resolve_group(args, phone=True)
    repeat = kwargs.get('repeat', 0)
    maxlen = cls._config.get('notifications.sms.maxlength', SMS_MAXLENGTH)
    message = cls._get_subject(alert, custom_message=kwargs.get('message'))[:maxlen]

    request_params = {
        'to': '',
        'key': cls._config['notifications.sms.apikey'],
        'from': cls._config.get('notifications.sms.sender', SMS_SENDER),
        'route': cls._config.get('notifications.sms.route', SMS_ROUTE),
        'message': message,
        'cost': 1,
        'message_id': 1,
    }

    try:
        if cls._config.get('notifications.sms.on', True):
            for phone in phone_numbers:
                request_params['to'] = phone

                # NOTE(review): verify=False disables TLS certificate verification for the
                # provider request; confirm this is still required.
                r = requests.get(provider_url, params=request_params, verify=False)

                # The API key is part of the request URL; mask it before logging.
                url_secured = r.url.replace(request_params['key'], '*' * len(request_params['key']))
                logger.info('SMS sent: request to %s --> status: %s, response headers: %s, response body: %s',
                            url_secured, r.status_code, r.headers, r.text)
                r.raise_for_status()
    except Exception as e:
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': str(e)})
        logger.exception('Failed to send sms for alert %s with id %s to: %s', alert_def['name'], alert_def['id'],
                         list(phone_numbers))

    # Returned after the handler rather than from a ``finally`` block: a ``return`` in
    # ``finally`` would also swallow KeyboardInterrupt/SystemExit.
    return repeat
def get_account_id(region, **kwargs):
    """
    Return the fifth ``:``-separated field of the ARN of the first IAM role listed.

    :param region: AWS region name used for the ``iam`` client.
    :return: The extracted ARN field, or ``None`` if anything fails (the failure is
        recorded on the span).
    """
    try:
        roles = boto3.client('iam', region_name=region).list_roles()['Roles']
        arn_parts = roles[0]['Arn'].split(':')
        return arn_parts[4]
    except Exception:
        span = extract_span_from_kwargs(**kwargs)
        span.set_tag('error', True)
        span.log_kv({'exception': traceback.format_exc()})

    return None
def get_account_id(region, **kwargs):
    """
    Return the fifth ``:``-separated field of the ARN of the first IAM role listed.

    :param region: AWS region name used for the ``iam`` client.
    :return: The extracted ARN field, or ``None`` if anything fails (the failure is
        recorded on the span).
    """
    account_id = None

    try:
        iam = boto3.client('iam', region_name=region)
        first_role = iam.list_roles()['Roles'][0]
        account_id = first_role['Arn'].split(':')[4]
    except Exception:
        span = extract_span_from_kwargs(**kwargs)
        span.set_tag('error', True)
        span.log_kv({'exception': traceback.format_exc()})
        return None

    return account_id
def update_local_entity(zmon_client, entity, **kwargs):
    """
    Send the local entity to ZMON via ``zmon_client.add_entity``.

    Failures are recorded on the span and logged, not raised.

    :param zmon_client: Client providing ``add_entity(entity)``.
    :param entity: Entity dict; its ``id`` is tagged on the span.
    """
    span = extract_span_from_kwargs(**kwargs)

    for tag, value in (('entity_type', 'local'), ('entity_id', entity['id'])):
        span.set_tag(tag, value)

    try:
        zmon_client.add_entity(entity)
    except Exception:
        span.set_tag('error', True)
        span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Failed to add Local entity: {}'.format(entity))
def get_account_alias(region, **kwargs):
    """
    Return the first account alias reported by IAM.

    :param region: AWS region name used for the ``iam`` client.
    :return: The first alias, or ``None`` if anything fails (the failure is recorded
        on the span).
    """
    try:
        aliases = boto3.client('iam', region_name=region).list_account_aliases()['AccountAliases']
        return aliases[0]
    except Exception:
        span = extract_span_from_kwargs(**kwargs)
        span.set_tag('error', True)
        span.log_kv({'exception': traceback.format_exc()})

    return None
def update_local_entity(zmon_client, entity, **kwargs):
    """
    Send the local entity to ZMON via ``zmon_client.add_entity``.

    Failures are recorded on the span and logged, not raised.

    :param zmon_client: Client providing ``add_entity(entity)``.
    :param entity: Entity dict; its ``id`` is tagged on the span.
    """
    span = extract_span_from_kwargs(**kwargs)
    span.set_tag('entity_type', 'local')

    entity_id = entity['id']
    span.set_tag('entity_id', entity_id)

    try:
        zmon_client.add_entity(entity)
    except Exception:
        trace = traceback.format_exc()
        span.set_tag('error', True)
        span.log_kv({'exception': trace})
        logger.exception('Failed to add Local entity: {}'.format(entity))
def get_account_alias(region, **kwargs):
    """
    Return the first account alias reported by IAM.

    :param region: AWS region name used for the ``iam`` client.
    :return: The first alias, or ``None`` if anything fails (the failure is recorded
        on the span).
    """
    alias = None

    try:
        iam = boto3.client('iam', region_name=region)
        response = iam.list_account_aliases()
        alias = response['AccountAliases'][0]
    except Exception:
        span = extract_span_from_kwargs(**kwargs)
        span.set_tag('error', True)
        span.log_kv({'exception': traceback.format_exc()})
        return None

    return alias
def get_certificates(region, acc, **kwargs):
    """
    Build ``certificate`` entity dicts for the IAM server certificates and ACM certificates
    of the region.

    A failure is recorded on the span and logged; the entities collected up to that point
    are returned.

    :param region: AWS region name used for the ``iam`` and ``acm`` clients.
    :param acc: Account identifier placed in the entity ID and ``infrastructure_account``.
    :return: List of certificate entity dicts.
    """
    iam_client = boto3.client('iam', region_name=region)
    acm_client = boto3.client('acm', region_name=region)

    entities = []

    try:
        server_certs = iam_client.list_server_certificates()['ServerCertificateMetadataList']
        acm_certs = acm_client.list_certificates()['CertificateSummaryList']

        for server_cert in server_certs:
            cert_name = server_cert['ServerCertificateName']

            entities.append({
                'id': entity_id('cert-iam-{}[{}:{}]'.format(cert_name, acc, region)),
                'type': 'certificate',
                'infrastructure_account': acc,
                'region': region,
                'created_by': 'agent',
                'certificate_type': 'iam',
                'name': cert_name,
                'arn': server_cert['Arn'],
                'status': 'ISSUED',
                'expiration': server_cert['Expiration'].isoformat(),
                # The results do not contain reference to existing use of certs
                'in_use': True,
            })

        for summary in acm_certs:
            arn = summary['CertificateArn']
            details = acm_client.describe_certificate(CertificateArn=arn)['Certificate']

            # Last path segment of the ARN.
            cert_id = arn.split('/')[-1]

            if 'NotAfter' in details:
                expiration = details['NotAfter'].isoformat()
            else:
                expiration = ''

            entities.append({
                'id': entity_id('cert-acm-{}-{}[{}:{}]'.format(cert_id, details['DomainName'], acc, region)),
                'type': 'certificate',
                'infrastructure_account': acc,
                'region': region,
                'created_by': 'agent',
                'certificate_type': 'acm',
                'name': details['DomainName'],
                'arn': details['CertificateArn'],
                'status': details['Status'],
                'expiration': expiration,
                'in_use': len(details['InUseBy']) > 0,
            })
    except Exception:
        span = extract_span_from_kwargs(**kwargs)
        span.set_tag('error', True)
        span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Failed while retrieving IAM/ACM certificates, IAM role has no access?')

    return entities
def get_rds_instances(region, acc, existing_entities, **kwargs):
    """
    Build ``database`` entity dicts for the RDS instances of the region.

    RDS is only queried when the current minute is a multiple of 15; at any other time
    the RDS entities already present in ``existing_entities`` are returned instead.
    A failure while querying is recorded on the span and logged; the entities collected
    up to that point are returned.

    :param region: AWS region name used for the ``rds`` client.
    :param acc: Account identifier placed in the entity ID and ``infrastructure_account``.
    :param existing_entities: Iterable of entity dicts with ``type`` and ``id``.
    :return: List of database entity dicts.
    """
    entities = []

    now = datetime.now()

    known_rds = [
        existing for existing in existing_entities
        if existing['type'] == 'database' and existing['id'].startswith('rds-')
    ]

    if now.minute % 15 != 0:
        return known_rds

    try:
        rds_client = boto3.client('rds', region_name=region)
        paginator = rds_client.get_paginator('describe_db_instances')

        def _describe_all():
            return paginator.paginate(PaginationConfig={'MaxItems': MAX_PAGE}).build_full_result()

        instances = call_and_retry(_describe_all)

        for instance in instances['DBInstances']:
            db = {
                'id': entity_id('rds-{}[{}]'.format(instance['DBInstanceIdentifier'], acc)),
                'created_by': 'agent',
                'infrastructure_account': '{}'.format(acc),
                'region': region,
                'type': 'database',
                'engine': instance['Engine'],
                'port': instance['Endpoint']['Port'],
                'host': instance['Endpoint']['Address'],
                'name': instance['DBInstanceIdentifier'],
                'instance_type': instance.get('DBInstanceClass', ''),
                'storage_type': instance.get('StorageType', ''),
                'storage_size': instance.get('AllocatedStorage', ''),
            }

            if 'EngineVersion' in instance:
                db['version'] = instance['EngineVersion']

            # The shard is named after DBName when one is set, else after the instance.
            cluster_name = instance['DBName'] if instance.get('DBName') else db['name']

            db['shards'] = {cluster_name: '{}:{}/{}'.format(db['host'], db['port'], cluster_name)}

            entities.append(db)
    except Exception:
        span = extract_span_from_kwargs(**kwargs)
        span.set_tag('error', True)
        span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Failed to get RDS instance')

    return entities
def add_entity(zmon_client, entity, **kwargs):
    """
    Send one entity to ZMON via ``zmon_client.add_entity``.

    Failures are recorded on the span and logged, not raised.

    :param zmon_client: Client providing ``add_entity(entity)``.
    :param entity: Entity dict with ``type`` and ``id``.
    :return: ``0`` on success, ``1`` on failure.
    """
    span = extract_span_from_kwargs(**kwargs)

    span.set_tag('entity_type', entity['type'])
    span.set_tag('entity_id', entity['id'])

    failed = 0

    try:
        logger.info('Adding new {} entity with ID: {}'.format(entity['type'], entity['id']))
        zmon_client.add_entity(entity)
    except Exception:
        span.set_tag('error', True)
        span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Failed to add entity: {}'.format(entity))
        failed = 1

    return failed
def notify(cls, alert, *args, **kwargs):
    """
    Publish a push notification for an alert.

    Recognised keyword arguments: ``url``, ``key``, ``repeat``, ``message``, ``body``,
    ``click_action``, ``collapse_key`` and ``team``.

    Failures while sending are recorded on the span, not raised.

    :param alert: Alert dict; ``alert_def`` and ``entity`` are read from it.
    :return: ``0`` when no URL is available, otherwise the ``repeat`` keyword argument
        (default ``0``).
    """
    span = extract_span_from_kwargs(**kwargs)

    alert_def = alert['alert_def']
    span.set_tag('alert_id', alert_def['id'])

    entity = alert.get('entity')
    is_changed = alert.get('alert_changed', False)
    is_alert = alert.get('is_alert', False)

    span.set_tag('entity', entity['id'])
    span.set_tag('alert_changed', bool(is_changed))
    span.set_tag('is_alert', is_alert)

    url = kwargs.get('url', cls._config.get('notifications.push.url'))
    key = kwargs.get('key', cls._config.get('notifications.push.key'))

    if not url:
        return 0

    repeat = kwargs.get('repeat', 0)

    # An ended alert gets the "clean" icon, an active one the warning icon.
    if alert and not alert.get('is_alert'):
        icon = 'clean.png'
    else:
        icon = 'warning.png'

    notification = {
        "icon": icon,
        "title": kwargs.get("message", cls._get_expanded_alert_name(alert)),
        "body": kwargs.get("body", formatEntity(alert["entity"]["id"])),
        "alert_changed": alert.get('alert_changed', False),
        "click_action": kwargs.get("click_action", "/#/alert-details/{}".format(alert["alert_def"]["id"])),
        "collapse_key": kwargs.get("collapse_key", "{}:{}".format(alert['alert_def']['id'], alert['entity']['id']))
    }

    message = {
        "notification": notification,
        "alert_id": alert['alert_def']['id'],
        "entity_id": alert['entity']['id'],
        "team": kwargs.get('team', alert['alert_def'].get('team', '')),
        "priority": alert["alert_def"]["priority"]
    }

    url = url + '/api/v1/publish'

    try:
        # logger.info("Sending push notification to %s %s", url, message)
        request_headers = {"Authorization": "PreShared " + key, 'Content-Type': 'application/json'}
        r = requests.post(url, headers=request_headers, data=json.dumps(message))
        r.raise_for_status()
    except Exception:
        span.set_tag('error', True)
        span.log_kv({'exception': traceback.format_exc()})

    return repeat
def get_dynamodb_tables(region, acc, **kwargs):
    """
    Build one ``dynamodb`` entity dict per DynamoDB table whose status is ``ACTIVE``
    or ``UPDATING``.

    A failure is logged (on the span and in the log); the tables collected up to that
    point are returned.

    :param region: AWS region name used for the ``dynamodb`` client.
    :param acc: Account identifier placed in the entity ID and ``infrastructure_account``.
    :return: List of table entity dicts.
    """
    tables = []

    # catch exception here, original agent policy does not allow scanning dynamodb
    try:
        ddb = boto3.client('dynamodb', region_name=region)
        paginator = ddb.get_paginator('list_tables')

        def _list_table_names():
            pages = paginator.paginate(PaginationConfig={'MaxItems': MAX_PAGE})
            return pages.build_full_result()['TableNames']

        for table_name in call_and_retry(_list_table_names):
            description = call_and_retry(ddb.describe_table, TableName=table_name)['Table']

            if description['TableStatus'] in ['ACTIVE', 'UPDATING']:
                tables.append({
                    'id': entity_id('dynamodb-{}[{}:{}]'.format(description['TableName'], acc, region)),
                    'region': region,
                    'created_by': 'agent',
                    'infrastructure_account': '{}'.format(acc),
                    'type': 'dynamodb',
                    'name': '{}'.format(description['TableName']),
                    'arn': '{}'.format(description['TableArn'])
                })
    except Exception:
        span = extract_span_from_kwargs(**kwargs)
        span.log_kv({'exception': 'Got exception while listing dynamodb tables, IAM role has no access?'})
        logger.exception('Got exception while listing dynamodb tables, IAM role has no access?')

    return tables
def get_elastigroup_resources(cf, stack_name, **kwargs):
    """
    Extracts the Elastigroups from existing stacks, including the respective API access tokens
    and cloud account IDs.

    It returns those parameters from the resources of type ``ELASTIGROUP_RESOURCE_TYPE`` found
    in the stack with the name provided as argument. Failures are recorded on the span and
    logged, not raised; the groups collected up to that point are returned.

    :param cf: Client object providing ``get_paginator('list_stack_resources')`` and ``get_template``.
    :param stack_name: Name of the stack to inspect.
    :return: List of ``Elastigroup`` objects.
    """
    groups = []

    current_span = extract_span_from_kwargs(**kwargs)
    current_span.set_tag('stack_name', stack_name)

    paginator = cf.get_paginator('list_stack_resources')

    try:
        resources = call_and_retry(
            lambda: paginator.paginate(PaginationConfig={'MaxItems': MAX_PAGE},
                                       StackName=stack_name).build_full_result()['StackResourceSummaries'])

        # Collect every Elastigroup resource of the stack in one pass; the accumulator
        # must not be reset per resource.
        elastigroups = [
            resource for resource in resources
            if resource['ResourceType'] == ELASTIGROUP_RESOURCE_TYPE
        ]

        if elastigroups:
            # One template lookup per stack is enough; kept under its own name so the
            # stack resource list is not rebound.
            template_resources = cf.get_template(StackName=stack_name)['TemplateBody']['Resources']

            for elastigroup in elastigroups:
                group_id = elastigroup["PhysicalResourceId"]
                group_name = elastigroup["LogicalResourceId"]

                properties = template_resources[group_name]['Properties']
                spotinst_token = properties['accessToken']
                spotinst_account_id = properties['accountId']

                groups.append(Elastigroup(group_id, group_name, spotinst_account_id, spotinst_token))
    except Exception as e:
        if isinstance(e, ClientError) and e.response['Error']['Code'] == 'AccessDenied':
            # Missing permission is reported as a warning, not as an error on the span.
            msg = 'Access to AWS API denied. You may need the cloudformation:ListStackResources and ' \
                  'cloudformation:GetTemplate permissions'
            logger.warning(msg)
            current_span.log_kv({'message': msg})
            current_span.set_tag('access_denied', True)
        else:
            current_span.set_tag('error', True)
            current_span.log_kv({'exception': traceback.format_exc()})
            logger.exception('Failed to retrieve Elastigroup resources from Stack "{}"'.format(stack_name))

    return groups
def get_instance_events(aws_client, instance, **kwargs):
    """
    Return the scheduled events reported by EC2 for a single instance.

    :param aws_client: EC2 client providing ``describe_instance_status``.
    :param instance: Instance description dict; its ``InstanceId`` is queried.
    :return: The ``Events`` list of the first returned instance status, or ``[]`` when no
             status/events are returned or the lookup fails.
    :rtype: list
    """
    try:
        instance_status_resp = call_and_retry(aws_client.describe_instance_status,
                                              InstanceIds=[instance['InstanceId']])

        statuses = instance_status_resp['InstanceStatuses']

        # An empty status list is a regular answer (no status reported for the instance),
        # not a failure: do not let it surface as an IndexError tagged on the span.
        if statuses and 'Events' in statuses[0]:
            return statuses[0]['Events']
    except Exception:
        current_span = extract_span_from_kwargs(**kwargs)
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Failed to retrieve instance events for instance: {}'.format(instance['InstanceId']))

    return []
def create_downtime(self, downtime: dict, **kwargs) -> dict:
    """
    Create a downtime for specific entities.

    Attributes ``entities`` list, ``start_time`` and ``end_time`` timestamps are required.

    :param downtime: Downtime dict.
    :type downtime: dict

    :return: Downtime dict.
    :rtype: dict

    :raises ZmonArgumentError: If ``entities`` is empty/missing, or if ``start_time`` or
                               ``end_time`` is missing. The error is also recorded on the span.

    Example downtime:

    .. code-block:: json

        {
            "entities": ["entity-id-1", "entity-id-2"],
            "comment": "Planned maintenance",
            "start_time": 1473337437.312921,
            "end_time": 1473341037.312921
        }
    """
    span = extract_span_from_kwargs(**kwargs)

    # Validate first; both failure paths report the same way.
    problem = None
    if not downtime.get('entities'):
        problem = 'At least one entity ID should be specified'
    elif not (downtime.get('start_time') and downtime.get('end_time')):
        problem = 'Downtime must specify "start_time" and "end_time"'

    if problem is not None:
        span.set_tag('error', True)
        span.log_kv({'exception': problem})
        raise ZmonArgumentError(problem)

    span.set_tag('entity_ids', str(downtime.get('entities')))
    # FIXME - those also?
    # current_span.set_tag('start_time', str(downtime.get('start_time')))
    # current_span.set_tag('end_time', str(downtime.get('end_time')))

    response = self.session.post(self.endpoint(DOWNTIME), json=downtime, timeout=self._timeout)

    return self.json(response)
def add_entity(zmon_client, entity, **kwargs):
    """
    Push a new entity to ZMON.

    :param zmon_client: Client object providing ``add_entity``.
    :param entity: Entity dict; ``type`` and ``id`` are required.
    :return: ``0`` when the entity was added, ``1`` when adding it raised an exception
             (the exception is logged and recorded on the span, not propagated).
    :rtype: int
    """
    span = extract_span_from_kwargs(**kwargs)
    span.set_tag('entity_type', entity['type'])
    span.set_tag('entity_id', entity['id'])

    failed = 0
    try:
        logger.info('Adding new {} entity with ID: {}'.format(entity['type'], entity['id']))
        zmon_client.add_entity(entity)
    except Exception:
        span.set_tag('error', True)
        span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Failed to add entity: {}'.format(entity))
        failed = 1

    return failed
def get(cls, **kwargs) -> dict:
    """
    Build the SLO report of a product.

    Reads ``report_type`` and ``product_id`` from ``kwargs``. A ``weekly`` report covers the
    last 7 days with unit ``day``; ``monthly`` covers the last month and any other supported
    type the last 3 months, both with unit ``week``.

    :return: Report dict with product/product group details and the ``slo`` summary.
    :rtype: dict

    :raises ProblemException: With status 404 when ``report_type`` is not in ``REPORT_TYPES``.
    """
    span = extract_span_from_kwargs(**kwargs)

    report_type = kwargs.get('report_type')
    if report_type not in REPORT_TYPES:
        raise ProblemException(
            status=404,
            title='Resource not found',
            detail='Report type ({}) is invalid. Supported types are: {}'.format(report_type, REPORT_TYPES))

    product_id = kwargs.get('product_id')
    product = Product.query.get_or_404(product_id)
    objectives = product.objectives.all()

    # Report window and aggregation unit depend on the report type.
    now = datetime.utcnow()
    if report_type == 'weekly':
        unit, window = 'day', relativedelta(days=7)
    elif report_type == 'monthly':
        unit, window = 'week', relativedelta(months=1)
    else:
        unit, window = 'week', relativedelta(months=3)
    start = now - window

    span.set_tag('report_type', report_type)
    span.set_tag('product_id', product_id)
    span.set_tag('product', product.name)
    span.set_tag('product_slug', product.slug)
    group = product.product_group
    span.set_tag('product_group', group.name)
    span.log_kv({'report_duration_start': start, 'report_duration_end': now})

    slo = get_report_summary(objectives, unit, start, now, span)

    span.log_kv({'report_objective_count': len(slo), 'objective_count': len(objectives)})

    return {
        'product_name': product.name,
        'product_slug': product.slug,
        'product_group_name': group.name,
        'product_group_slug': group.slug,
        'department': group.department,
        'slo': slo,
    }
def delete_alert_definition(self, alert_definition_id: int, **kwargs) -> dict:
    """
    Delete existing alert definition.

    :param alert_definition_id: ZMON alert definition ID.
    :type alert_definition_id: int

    :return: Alert definition dict.
    :rtype: dict
    """
    current_span = extract_span_from_kwargs(**kwargs)
    current_span.set_tag('alert_id', str(alert_definition_id))

    # Pass the client timeout like the other client calls do; without it a hanging
    # server would block this call indefinitely.
    resp = self.session.delete(self.endpoint(ALERT_DEF, alert_definition_id), timeout=self._timeout)

    return self.json(resp)
def notify(cls, alert, *args, **kwargs):
    """
    Send the alert as a message to a Slack incoming webhook.

    Recognised keyword arguments: ``webhook`` (defaults to the
    ``notifications.slack.webhook`` config value), ``channel`` (default ``#general``),
    ``message`` (defaults to the alert subject) and ``repeat``.

    :param alert: Alert dict; ``alert_def`` is required.
    :return: The ``repeat`` value (``0`` if not given). Delivery failures are logged and
             recorded on the span, not raised.

    :raises NotificationError: If no webhook URL is available.
    """
    current_span = extract_span_from_kwargs(**kwargs)

    alert_def = alert['alert_def']
    current_span.set_tag('alert_id', alert_def['id'])

    # The entity is only used for tracing: a missing entity must not abort the notification.
    entity = alert.get('entity') or {}
    is_changed = alert.get('alert_changed', False)
    is_alert = alert.get('is_alert', False)

    current_span.set_tag('entity', entity.get('id'))
    current_span.set_tag('alert_changed', bool(is_changed))
    current_span.set_tag('is_alert', is_alert)

    url = kwargs.get('webhook', cls._config.get('notifications.slack.webhook'))
    repeat = kwargs.get('repeat', 0)

    current_span.log_kv({'channel': kwargs.get('channel')})

    if not url:
        current_span.set_tag('notification_invalid', True)
        current_span.log_kv({'reason': 'Missing webhook!'})
        raise NotificationError('Webhook is required!')

    message = {
        'username': '******',
        'channel': kwargs.get('channel', '#general'),
        'text': kwargs.get('message', cls._get_subject(alert)),
        'icon_emoji': ':bar_chart:',
    }

    headers = {
        'User-agent': get_user_agent(),
        'Content-type': 'application/json',
    }

    try:
        logger.info('Sending to %s %s', url, message)
        r = requests.post(url, json=message, headers=headers, timeout=5)
        r.raise_for_status()
    except Exception as e:
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': str(e)})
        logger.exception('Slack notification failed!')

    return repeat
def notify(cls, alert, *args, **kwargs):
    """
    Send the alert subject as SMS to every phone number resolved from ``args``.

    Provider URL, API key, sender, route and maximum message length are read from the
    ``notifications.sms.*`` configuration. Nothing is sent when ``notifications.sms.on``
    is false. Recognised keyword arguments: ``message`` and ``repeat``.

    :param alert: Alert dict; ``alert_def`` is required.
    :return: The ``repeat`` value (``0`` if not given). Delivery failures are logged and
             recorded on the span, not raised.
    """
    current_span = extract_span_from_kwargs(**kwargs)

    alert_def = alert['alert_def']
    current_span.set_tag('alert_id', alert_def['id'])

    # The entity is only used for tracing: a missing entity must not abort the notification.
    entity = alert.get('entity') or {}
    is_changed = alert.get('alert_changed', False)
    is_alert = alert.get('is_alert', False)

    current_span.set_tag('entity', entity.get('id'))
    current_span.set_tag('alert_changed', bool(is_changed))
    current_span.set_tag('is_alert', is_alert)

    provider_url = cls._config.get('notifications.sms.provider_url', SMS_PROVIDER_URL)
    phone_numbers = BaseNotification.resolve_group(args, phone=True)
    repeat = kwargs.get('repeat', 0)
    maxlen = cls._config.get('notifications.sms.maxlength', SMS_MAXLENGTH)
    message = cls._get_subject(alert, custom_message=kwargs.get('message'))[:maxlen]

    request_params = {
        'to': '',
        'key': cls._config['notifications.sms.apikey'],
        'from': cls._config.get('notifications.sms.sender', SMS_SENDER),
        'route': cls._config.get('notifications.sms.route', SMS_ROUTE),
        'message': message,
        'cost': 1,
        'message_id': 1,
    }

    try:
        if cls._config.get('notifications.sms.on', True):
            for phone in phone_numbers:
                request_params['to'] = phone

                # NOTE(review): ``verify=False`` disables TLS certificate verification while
                # the API key is sent as a query parameter - confirm this is still required.
                r = requests.get(provider_url, params=request_params, verify=False)

                # Never log the API key: mask it in the logged URL.
                url_secured = r.url.replace(request_params['key'], '*' * len(request_params['key']))
                logger.info('SMS sent: request to %s --> status: %s, response headers: %s, response body: %s',
                            url_secured, r.status_code, r.headers, r.text)
                r.raise_for_status()
    except Exception:
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Failed to send sms for alert %s with id %s to: %s', alert_def.get('name'),
                         alert_def['id'], list(phone_numbers))

    # Plain return instead of ``finally: return``: the latter would also swallow
    # KeyboardInterrupt/SystemExit raised while sending.
    return repeat
def get_cluster_statefulsets(kube_client, cluster_id, alias, environment, region, infrastructure_account,
                             namespace='default', **kwargs) -> list:
    """
    Describe the StatefulSets of a cluster namespace as ZMON entities.

    StatefulSets whose ``spec.replicas`` is ``0`` (or missing) are skipped. Labels and
    annotations are merged into each entity via ``entity_labels``.

    :return: List of StatefulSet entity dicts.
    :rtype: list
    """
    span = extract_span_from_kwargs(**kwargs)

    result = []

    for sset in get_all(kube_client, kube_client.get_statefulsets, namespace, span=span):
        obj = sset.obj
        spec = obj['spec']

        # Scaled down to zero: nothing to monitor.
        if spec.get('replicas', 0) == 0:
            continue

        containers = spec.get('template', {}).get('spec', {}).get('containers', [])
        metadata = obj['metadata']
        status = obj['status']

        volume_claims = {}
        for claim in spec.get('volumeClaimTemplates', []):
            volume_claims[claim['metadata']['name']] = claim['status'].get('phase', 'UNKNOWN')

        images = {}
        for container in containers:
            if 'name' in container:
                images[container['name']] = container.get('image', '')

        entity = {
            'id': 'statefulset-{}-{}[{}]'.format(sset.name, sset.namespace, cluster_id),
            'type': STATEFULSET_TYPE,
            'kube_cluster': cluster_id,
            'alias': alias,
            'environment': environment,
            'created_by': AGENT_TYPE,
            'infrastructure_account': infrastructure_account,
            'region': region,

            'statefulset_name': sset.name,
            'statefulset_namespace': metadata['namespace'],
            'statefulset_service_name': spec['serviceName'],

            'volume_claims': volume_claims,
            'containers': images,

            'replicas': spec.get('replicas'),
            'replicas_status': status.get('replicas'),
            'actual_replicas': status.get('readyReplicas'),
            'version': metadata.get('labels', {}).get('version', ''),
        }
        entity.update(entity_labels(obj, 'labels', 'annotations'))

        result.append(entity)

    return result
def remove_entity(zmon_client, entity_id, **kwargs):
    """
    Delete an entity from ZMON.

    :param zmon_client: Client object providing ``delete_entity``.
    :param entity_id: ID of the entity to delete.
    :return: ``0`` when the entity was deleted, ``1`` when the client reported a failed
             deletion or raised (exceptions are logged and recorded on the span).
    :rtype: int
    """
    span = extract_span_from_kwargs(**kwargs)
    span.set_tag('entity_id', entity_id)

    status = 0
    try:
        logger.info('Removing entity with id: {}'.format(entity_id))

        if not zmon_client.delete_entity(entity_id):
            logger.error('Failed to delete entity!')
            status = 1
    except Exception:
        span.set_tag('error', True)
        span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Exception while deleting entity: {}'.format(entity_id))
        status = 1

    return status
def notify(cls, alert, queue, hubot_url, message=None, repeat=0, **kwargs):
    """
    Post the alert subject as an event to a Hubot endpoint.

    :param alert: Alert dict; ``alert_def`` is required.
    :param queue: Value sent as the ``event`` form field.
    :param hubot_url: Target URL; must not contain a query string.
    :param message: Optional custom message used to build the subject.
    :param repeat: Value returned to the caller unchanged.
    :return: ``repeat``. Delivery failures are logged and recorded on the span, not raised.

    :raises ValueError: If ``hubot_url`` contains ``?``.
    """
    current_span = extract_span_from_kwargs(**kwargs)

    message = cls._get_subject(alert, custom_message=message)

    alert_def = alert['alert_def']
    current_span.set_tag('alert_id', alert_def['id'])

    # The entity is only used for tracing: a missing entity must not abort the notification.
    entity = alert.get('entity') or {}
    is_changed = alert.get('alert_changed', False)
    is_alert = alert.get('is_alert', False)

    current_span.set_tag('entity', entity.get('id'))
    current_span.set_tag('alert_changed', bool(is_changed))
    current_span.set_tag('is_alert', is_alert)

    if '?' in hubot_url:
        current_span.set_tag('notification_invalid', True)
        current_span.log_kv({'reason': 'Invalid URL!'})
        raise ValueError('Invalid URL!')

    post_params = {
        'event': queue,
        'data': message,
    }

    try:
        r = requests.post(hubot_url, data=post_params)
        r.raise_for_status()
        logger.info('Notification sent: request to %s --> status: %s, response headers: %s, response body: %s',
                    hubot_url, r.status_code, r.headers, r.text)
    except Exception:
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Failed to send notification for alert %s with id %s to: %s', alert_def.get('name'),
                         alert_def['id'], hubot_url)

    # Plain return instead of ``finally: return``: the latter would also swallow
    # KeyboardInterrupt/SystemExit raised while sending.
    return repeat
def get_dynamodb_tables(region, acc, **kwargs):
    """
    List the DynamoDB tables of ``region`` as agent entities.

    Tables in a status other than ``ACTIVE``/``UPDATING`` are left out. Exceptions are
    logged and recorded on the current span; whatever was collected so far is returned.

    :param region: AWS region name used for the boto3 client.
    :param acc: Infrastructure account identifier used in entity IDs.
    :return: List of DynamoDB entity dicts (possibly empty).
    :rtype: list
    """
    reportable_states = ('ACTIVE', 'UPDATING')
    entities = []

    # catch exception here, original agent policy does not allow scanning dynamodb
    try:
        dynamodb = boto3.client('dynamodb', region_name=region)
        pager = dynamodb.get_paginator('list_tables')

        names = call_and_retry(
            lambda: pager.paginate(PaginationConfig={'MaxItems': MAX_PAGE}).build_full_result()['TableNames'])

        for name in names:
            info = call_and_retry(dynamodb.describe_table, TableName=name)['Table']
            if info['TableStatus'] not in reportable_states:
                continue

            entity = {}
            entity['id'] = entity_id('dynamodb-{}[{}:{}]'.format(info['TableName'], acc, region))
            entity['region'] = region
            entity['created_by'] = 'agent'
            entity['infrastructure_account'] = '{}'.format(acc)
            entity['type'] = 'dynamodb'
            entity['name'] = '{}'.format(info['TableName'])
            entity['arn'] = '{}'.format(info['TableArn'])
            entities.append(entity)
    except Exception:
        problem = 'Got exception while listing dynamodb tables, IAM role has no access?'
        extract_span_from_kwargs(**kwargs).log_kv({'exception': problem})
        logger.exception(problem)

    return entities
def notify(cls, alert, url=None, body=None, params=None, headers=None, timeout=5, oauth2=False, include_alert=True,
           repeat=0, **kwargs):
    """
    Send the alert as an HTTP POST request.

    :param alert: Alert dict; ``alert_def`` is required.
    :param url: Target URL. Falls back to ``notifications.http.default.url``. Unless
                ``notifications.http.allow.all`` is set, an explicit URL must be listed in
                ``notifications.http.whitelist.urls``.
    :param body: Custom payload; sent as-is when ``include_alert`` is false.
    :param params: Query parameters passed to the request.
    :param headers: Extra HTTP headers; they override the configured
                    ``notifications.http.headers`` defaults.
    :param timeout: Request timeout in seconds.
    :param oauth2: Add a bearer ``Authorization`` header using the ``uid`` token.
    :param include_alert: Wrap payload as ``{'alert': alert, 'body': body}``.
    :param repeat: Value returned to the caller unchanged.
    :return: ``repeat``. Delivery failures are logged and recorded on the span, not raised.

    :raises NotificationError: If no URL is available, the URL is not whitelisted, or it
                               is not an absolute HTTP URL.
    """
    current_span = extract_span_from_kwargs(**kwargs)

    urls = cls._config.get('notifications.http.whitelist.urls', [])
    allow_any = cls._config.get('notifications.http.allow.all', False)
    default_url = cls._config.get('notifications.http.default.url', None)

    alert_def = alert['alert_def']
    current_span.set_tag('alert_id', alert_def['id'])

    entity = alert.get('entity', {})
    is_changed = alert.get('alert_changed', False)
    is_alert = alert.get('is_alert', False)

    current_span.set_tag('entity', entity.get('id'))
    current_span.set_tag('alert_changed', bool(is_changed))
    current_span.set_tag('is_alert', is_alert)

    # The whitelist may be configured as a comma separated string.
    if isinstance(urls, basestring):
        urls = urls.replace(' ', '').split(',')

    if not url and not default_url:
        current_span.set_tag('notification_invalid', True)
        current_span.log_kv({'reason': 'Missing URL!'})
        raise NotificationError('URL is required!')

    if not url:
        url = default_url
    elif not allow_any and url not in urls:
        current_span.set_tag('notification_invalid', True)
        current_span.log_kv({'reason': 'URL is not in whitelist'})
        raise NotificationError('URL "{}" is not allowed. Please check worker white list URLs.'.format(url))

    if not is_absolute_http_url(url):
        current_span.set_tag('notification_invalid', True)
        current_span.log_kv({'reason': 'Absolute URL required!'})
        raise NotificationError('Absolute URL is required!')

    # HTTP headers.
    # Start from a *copy* of the configured defaults and layer the caller's headers on
    # top. The merged dict is what gets sent, so configured defaults actually reach the
    # request, and neither the shared config dict nor the caller's dict is mutated.
    request_headers = dict(cls._config.get('notifications.http.headers', {}) or {})
    request_headers.update(headers or {})

    if oauth2:
        request_headers['Authorization'] = 'Bearer {}'.format(tokens.get('uid'))

    request_headers['User-Agent'] = get_user_agent()

    if include_alert:
        data = {
            'alert': alert,
            'body': body,
        }
    else:
        data = body

    try:
        logger.info('Sending HTTP POST request to {}'.format(url))
        r = requests.post(url, data=json.dumps(data, cls=JsonDataEncoder), params=params, headers=request_headers,
                          timeout=timeout)

        r.raise_for_status()
    except Exception as e:
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': str(e)})
        logger.exception('Request failed!')

    return repeat
def notify(cls, alert, *args, **kwargs):
    """
    Send the alert as an email to the recipients given in ``args``.

    Recognised keyword arguments: ``repeat``, ``per_entity``, ``subject``, ``html``, ``cc``,
    ``hide_recipients``, ``include_value``, ``include_definition``, ``include_captures``,
    ``include_entity`` and ``zmon_host``. SMTP settings are read from the
    ``notifications.mail.*`` configuration.

    Nothing is sent when ``notifications.mail.on`` is false, or when the alert did not
    change and ``per_entity`` is false.

    :param alert: Alert dict; ``alert_def`` is required.
    :return: The ``repeat`` value (``0`` if not given).

    NOTE(review): the outer ``finally: return repeat`` discards any exception still in
    flight, including ones raised while building the message in the ``else`` branch.
    """
    current_span = extract_span_from_kwargs(**kwargs)

    repeat = kwargs.get('repeat', 0)
    alert_def = alert['alert_def']
    per_entity = kwargs.get('per_entity', True)

    current_span.set_tag('alert_id', alert_def['id'])

    entity = alert.get('entity', {})
    is_changed = alert.get('alert_changed', False)
    is_alert = alert.get('is_alert', False)

    current_span.set_tag('entity', entity.get('id'))
    current_span.set_tag('alert_changed', bool(is_changed))
    current_span.set_tag('is_alert', is_alert)

    if not cls._config.get('notifications.mail.on', True):
        current_span.set_tag('mail_enabled', False)
        logger.info('Not sending email for alert: {}. Mail notification is not enabled.'.format(alert_def['id']))
        return repeat

    if not is_changed and not per_entity:
        return repeat

    sender = cls._config.get('notifications.mail.sender')
    subject = cls._get_subject(alert, custom_message=kwargs.get('subject'))
    html = kwargs.get('html', False)
    cc = kwargs.get('cc', [])
    # A single CC address may be passed as a scalar: normalise to a list.
    if type(cc) is not list:
        cc = [cc]
    hide_recipients = kwargs.get('hide_recipients', True)
    include_value = kwargs.get('include_value', True)
    include_definition = kwargs.get('include_definition', True)
    include_captures = kwargs.get('include_captures', True)
    include_entity = kwargs.get('include_entity', True)
    expanded_alert_name = cls._get_expanded_alert_name(alert)

    zmon_host = kwargs.get('zmon_host', cls._config.get('zmon.host'))
    alert_url = urlparse.urljoin(zmon_host, '/#/alert-details/{}'.format(alert_def['id'])) if zmon_host else ''

    try:
        # The plain text body is always rendered; it is the only part when ``html`` is false.
        tmpl = jinja_env.get_template('alert.txt')
        body_plain = tmpl.render(expanded_alert_name=expanded_alert_name,
                                 include_value=include_value,
                                 include_definition=include_definition,
                                 include_captures=include_captures,
                                 include_entity=include_entity,
                                 alert_url=alert_url,
                                 **alert)
    except Exception:
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Error parsing email template for alert %s with id %s', alert_def['name'],
                         alert_def['id'])
    else:
        if html:
            current_span.set_tag('html', True)
            # multipart/alternative message carrying both the plain and the HTML rendering
            msg = MIMEMultipart('alternative')
            tmpl = jinja_env.get_template('alert.html')
            body_html = tmpl.render(expanded_alert_name=expanded_alert_name,
                                    include_value=include_value,
                                    include_definition=include_definition,
                                    include_captures=include_captures,
                                    include_entity=include_entity,
                                    alert_url=alert_url,
                                    **alert)
            part1 = MIMEText(body_plain.encode('utf-8'), 'plain', 'utf-8')
            part2 = MIMEText(body_html.encode('utf-8'), 'html', 'utf-8')
            msg.attach(part1)
            msg.attach(part2)
        else:
            msg = MIMEText(body_plain.encode('utf-8'), 'plain', 'utf-8')

        msg['Subject'] = subject
        msg['From'] = 'ZMON 2 <{}>'.format(sender)

        # Recipients are the positional arguments, resolved by BaseNotification.resolve_group.
        args = BaseNotification.resolve_group(args)

        if hide_recipients:
            # Recipients go to Bcc; the visible "To" is the sender itself.
            msg['To'] = 'Undisclosed Recipients <{}>'.format(sender)
            msg['Bcc'] = ', '.join(args)
        else:
            msg['To'] = ', '.join(args)
        msg['Cc'] = ', '.join(cc)

        mail_host = cls._config.get('notifications.mail.host', 'localhost')
        mail_port = cls._config.get('notifications.mail.port', '25')

        try:
            if mail_host != 'localhost':
                if cls._config.get('notifications.mail.tls', False):
                    # Plain SMTP connection upgraded via STARTTLS
                    logger.info('Mail notification using TLS!')
                    current_span.set_tag('tls', True)
                    s = smtplib.SMTP(mail_host, mail_port)
                    s.ehlo()
                    if not s.has_extn('STARTTLS'):
                        raise NotificationError('Mail server ({}) does not support TLS!'.format(mail_host))
                    s.starttls()
                    s.ehlo()
                else:
                    # Without the TLS option a remote host is contacted through SMTP_SSL
                    current_span.set_tag('tls', False)
                    s = smtplib.SMTP_SSL(mail_host, mail_port)
            else:
                s = smtplib.SMTP(mail_host, mail_port)
        except Exception:
            current_span.set_tag('error', True)
            logger.exception('Error connecting to SMTP server %s for alert %s with id %s', mail_host,
                             alert_def['name'], alert_def['id'])
        else:
            try:
                # Authenticate only when a mail user is configured
                mail_user = cls._config.get('notifications.mail.user', None)
                if mail_user is not None:
                    s.login(mail_user, cls._config.get('notifications.mail.password'))

                s.sendmail(sender, list(args) + cc, msg.as_string())
            except SMTPAuthenticationError:
                # NOTE(review): unlike the generic handler below, this branch does not tag the span as error
                logger.exception(
                    'Error sending email for alert %s with id %s: authentication failed for %s',
                    alert_def['name'], alert_def['id'], mail_user)
            except Exception:
                current_span.set_tag('error', True)
                current_span.log_kv({'exception': traceback.format_exc()})
                logger.exception(
                    'Error sending email for alert %s with id %s', alert_def['name'], alert_def['id'])
            finally:
                # Always close the SMTP session once a connection was established
                s.quit()
    finally:
        return repeat
def get_running_apps(region, existing_entities=None, **kwargs):
    """
    Describe the running EC2 instances of a region as ``instance`` entities.

    To limit API calls, a known instance (matched by ``aws_id`` in ``existing_entities``)
    is reused as-is unless the current minute is divisible by 7; instance events and block
    devices of application instances are refreshed only when the current minute is
    divisible by 10. Image name and creation date are added from ``describe_images``.

    :param region: AWS region name used for the boto3 EC2 client.
    :param existing_entities: Previously discovered entities used as cache (optional).
    :return: List of instance entity dicts.
    :rtype: list
    """
    aws_client = boto3.client('ec2', region_name=region)

    paginator = aws_client.get_paginator('describe_instances')
    rs = call_and_retry(
        lambda: paginator.paginate(PaginationConfig={'MaxItems': MAX_PAGE}).build_full_result()['Reservations'])

    now = datetime.now()

    # Known instance entities, indexed by their AWS instance ID
    existing_instances = (
        {e['aws_id']: e for e in existing_entities if e['type'] == 'instance'} if existing_entities else {}
    )

    result = []
    images = set()

    for r in rs:

        owner = r['OwnerId']

        instances = r['Instances']

        for i in instances:

            if str(i['State']['Name']) != 'running':
                continue

            # Reuse the known entity, except in minutes divisible by 7 where it is rebuilt
            if (now.minute % 7) and i['InstanceId'] in existing_instances:
                ins = existing_instances[i['InstanceId']]
                if 'image' in ins:
                    images.add(ins['image']['id'])
            else:

                # User data is optional: any failure reading/decoding/parsing it is ignored
                # and the instance is handled as one without valid user data.
                user_data = None
                try:
                    user_data_response = call_and_retry(aws_client.describe_instance_attribute,
                                                        InstanceId=i['InstanceId'],
                                                        Attribute='userData')

                    user_data = base64.b64decode(user_data_response['UserData']['Value'])
                    user_data = yaml.safe_load(user_data)
                except Exception:
                    pass

                tags = get_tags_dict(i.get('Tags', []))

                is_spot_instance = True if i.get('InstanceLifecycle', '') == 'spot' else False

                ins = {
                    'type': 'instance',
                    'created_by': 'agent',
                    'region': region,
                    'ip': i['PrivateIpAddress'],
                    'host': i['PrivateIpAddress'],
                    'instance_type': i['InstanceType'],
                    'spot_instance': is_spot_instance,
                    'aws_id': i['InstanceId'],
                    'infrastructure_account': 'aws:{}'.format(owner),
                }

                ins['image'] = {}
                if 'ImageId' in i:
                    images.add(i['ImageId'])
                    ins['image'] = {'id': i['ImageId']}

                ins['block_devices'] = get_instance_devices(aws_client, i)

                if 'PublicIpAddress' in i:
                    public_ip = i.get('PublicIpAddress')
                    if public_ip != '' and public_ip is not None:
                        ins.update({'public_ip': public_ip})

                # for now limit us to instances with valid user data ( senza/taupage )
                if isinstance(user_data, dict) and 'application_id' in user_data:
                    ins['state_reason'] = i['StateTransitionReason']

                    ins['events'] = []

                    # The StackVersion tag, when present, overrides the version from user data
                    stack_version = user_data.get('application_version', 'NOT_SET')
                    if 'StackVersion' in tags:
                        ins['stack'] = tags['Name']
                        stack_version = tags['StackVersion']
                        # NOTE(review): nesting under the StackVersion check is reconstructed - confirm
                        if 'aws:cloudformation:logical-id' in tags:
                            ins['resource_id'] = tags['aws:cloudformation:logical-id']

                    ins['id'] = entity_id('{}-{}-{}[aws:{}:{}]'.format(user_data['application_id'],
                                                                       stack_version,
                                                                       get_hash(i['PrivateIpAddress'] + ''),
                                                                       owner,
                                                                       region))

                    ins['application_id'] = user_data['application_id']

                    if 'application_version' in user_data:
                        ins['application_version'] = user_data['application_version']

                    ins['source'] = user_data['source']
                    ins['source_base'] = ins['source'].split(":")[0]

                    if 'ports' in user_data:
                        ins['ports'] = user_data['ports']

                    ins['runtime'] = user_data['runtime']

                    # `tags` is already a dict, but we need the raw list
                    assign_properties_from_tags(ins, i.get('Tags', []))

                    add_traffic_tags_to_entity(ins)

                    zlogging = user_data.get('logging', {})

                    # Stored as the strings 'true'/'false', not as booleans
                    ins['fluentd_enabled'] = 'false'

                    if zlogging.get('fluentd_enabled') is True:
                        ins['fluentd_enabled'] = 'true'
                else:
                    # No usable user data: identify the instance by its Name tag or instance ID
                    ins['id'] = entity_id('{}-{}[aws:{}:{}]'.format(tags.get('Name') or i['InstanceId'],
                                                                    get_hash(i['PrivateIpAddress'] + ''),
                                                                    owner,
                                                                    region))

                    # `tags` is already a dict, but we need the raw list
                    assign_properties_from_tags(ins, i.get('Tags', []))

                    if 'Name' in tags:
                        ins['name'] = tags['Name'].replace(' ', '-')

            if 'application_id' in ins:
                # Refresh events and block devices only in minutes divisible by 10,
                # otherwise carry over the events of the known entity.
                if not (now.minute % 10):
                    ins['events'] = get_instance_events(aws_client, i)
                    ins['block_devices'] = get_instance_devices(aws_client, i)
                else:
                    e = existing_instances.get(ins.get('aws_id', None), None)
                    if e and 'events' in e:
                        ins['events'] = e['events']

            result.append(ins)

    imgs = []
    # prevent fetching all images (in case the images is empty, it will do so):
    if list(images):
        try:
            imgs = aws_client.describe_images(ImageIds=list(images))['Images']
            for i in result:
                if 'image' not in i or 'id' not in i['image']:
                    continue
                for img in imgs:
                    if img['ImageId'] == i['image']['id']:
                        i['image']['name'] = img.get('Name', 'UNKNOWN')
                        # Replace a trailing 'Z' designator by an explicit UTC offset
                        date = img.get('CreationDate', '1970-01-01T00:00:00.000+00:00').replace('Z', '+00:00')
                        i['image']['date'] = date
                        break
        except Exception:
            current_span = extract_span_from_kwargs(**kwargs)
            current_span.set_tag('error', True)
            current_span.log_kv({'exception': traceback.format_exc()})
            logger.exception('Failed to retrieve image descriptions')

    return result
def notify(cls, alert, teams=None, per_entity=False, include_alert=True, include_captures=False, priority=None,
           message='', description='', custom_fields=None, **kwargs):
    """
    Create (alert raised) or close (alert cleared) an Opsgenie alert.

    :param alert: Alert dict; ``alert_def``, ``entity`` and ``worker`` are required.
    :param teams: Team name or list of team names to notify.
    :param per_entity: Use one Opsgenie alert per entity (alias includes the entity ID).
                       When false, unchanged alerts are not reported.
    :param include_alert: Add alert details (worker, team, entity, ...) to the details.
    :param include_captures: Add the alert captures to the details.
    :param priority: Opsgenie priority; derived from the alert definition priority
                     (``P1`` for priority 1, else ``P3``) when not given.
    :param message: Custom message; defaults to the alert subject.
    :param description: Alert description.
    :param custom_fields: Dict of extra fields merged into the details.
    :return: The ``repeat`` keyword value (``0`` if not given). Delivery failures are
             logged and recorded on the span, not raised.

    :raises NotificationError: If the API key or teams are missing, or the priority is invalid.
    """
    current_span = extract_span_from_kwargs(**kwargs)

    url = 'https://api.opsgenie.com/v2/alerts'

    repeat = kwargs.get('repeat', 0)

    # Auth key!
    api_key = kwargs.get('api_key', cls._config.get('notifications.opsgenie.apikey'))
    zmon_host = kwargs.get('zmon_host', cls._config.get('zmon.host'))

    entity = alert.get('entity')
    is_changed = alert.get('alert_changed', False)
    is_alert = alert.get('is_alert', False)

    current_span.set_tag('entity', entity['id'])
    current_span.set_tag('alert_changed', bool(is_changed))
    current_span.set_tag('is_alert', is_alert)

    alert_def = alert['alert_def']
    current_span.set_tag('alert_id', alert_def['id'])

    if not api_key:
        current_span.set_tag('notification_invalid', True)
        current_span.log_kv({'reason': 'API key is required!'})
        raise NotificationError('API key is required!')

    if not isinstance(teams, (list, basestring)):
        current_span.set_tag('notification_invalid', True)
        current_span.log_kv({'reason': 'Missing team!'})
        raise NotificationError('Missing "teams" parameter. Either a team name or list of team names is required.')

    current_span.log_kv({'teams': teams})

    if priority and priority not in PRIORITIES:
        current_span.set_tag('notification_invalid', True)
        current_span.log_kv({'reason': 'Invalid priorities'})
        raise NotificationError('Invalid priority. Valid values are: {}'.format(PRIORITIES))

    if teams and isinstance(teams, basestring):
        teams = [{'name': teams}]
    else:
        teams = [{'name': t} for t in teams]

    if not is_changed and not per_entity:
        return repeat

    alert_id = alert_def['id']
    alias = 'ZMON-{}'.format(alert_id) if not per_entity else 'ZMON-{}-{}'.format(alert_id, entity['id'])

    note = alert_url = urlparse.urljoin(zmon_host, '/#/alert-details/{}'.format(alert_id)) if zmon_host else ''

    if not priority:
        priority = 'P1' if int(alert_def['priority']) == 1 else 'P3'

    # The fallback is computed defensively: an empty team list must not raise IndexError
    # when the alert definition names its responsible team itself.
    default_team = teams[0]['name'] if teams else ''
    responsible_team = alert_def.get('responsible_team', default_team)
    msg = message if message else cls._get_subject(alert, include_event=False)

    details = {
        'alert_evaluation_ts': alert.get('alert_evaluation_ts', time.time())
    }

    alert_details = {
        'worker': alert['worker'],
        'zmon_team': alert_def['team'],
        'entity': entity['id'],
        'infrastructure_account': entity.get('infrastructure_account', 'UNKNOWN'),
        'alert_url': alert_url,
    }

    params = {}

    if is_alert:
        # Work on a copy: appending to the alert definition's own list would add the
        # alert ID once more on every notification.
        tags = list(alert_def.get('tags') or [])
        tags.append(alert_def['id'])

        data = {
            'alias': alias,
            'teams': teams,
            'message': '[{}] - {}'.format(responsible_team, msg),  # TODO: remove when it is no longer needed!
            'source': alert.get('worker', ''),
            'description': description,
            'entity': entity['id'],
            'note': note,
            'priority': priority,
            'tags': tags,
            'details': details,
        }

        if isinstance(custom_fields, dict):
            data['details'].update(custom_fields)

        if include_alert:
            data['details'].update(alert_details)

        if include_captures:
            # Captures may be absent: ``dict.update(None)`` would raise TypeError.
            data['details'].update(alert.get('captures') or {})
    else:
        logger.info('Closing Opsgenie alert {}'.format(alias))

        url = 'https://api.opsgenie.com/v2/alerts/{}/close'.format(alias)
        data = {
            'user': '******',
            'source': alert.get('worker', 'ZMON Worker'),
            'note': note,
        }

        # The alert to close is addressed by its alias, not by its Opsgenie ID.
        params = {'identifierType': 'alias'}

    try:
        logger.info('Notifying Opsgenie %s %s', url, message)
        headers = {
            'User-Agent': get_user_agent(),
            'Content-type': 'application/json',
            'Authorization': 'GenieKey {}'.format(api_key),
        }

        r = requests.post(url, data=json.dumps(data, cls=JsonDataEncoder, sort_keys=True), headers=headers,
                          timeout=5, params=params)

        r.raise_for_status()
    except requests.HTTPError as e:
        current_span.set_tag('error', True)
        logger.error('HTTP Error ({}) {}'.format(e.response.status_code, e.response.text))
    except Exception:
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Notifying Opsgenie failed')

    return repeat
def notify(cls, alert, *args, **kwargs):
    """
    Post the alert as a card to a Google Hangouts Chat webhook.

    Recognised keyword arguments: ``webhook_link``, ``multiline``, ``room``, ``color``,
    ``logo``, ``message``, ``zmon_host`` and ``repeat``. The alert ID is added to the
    webhook link as ``threadKey`` so messages of one alert share a thread. A cleared
    alert always uses the "OK" colour and logo.

    :param alert: Alert dict; ``alert_def`` is required.
    :return: The ``repeat`` value (``0`` if not given). Delivery failures are logged and
             recorded on the span, not raised.
    """
    current_span = extract_span_from_kwargs(**kwargs)

    webhook_link = kwargs.get('webhook_link', 'http://no.webhook.link?wrong')
    multiline = kwargs.get('multiline', True)
    repeat = kwargs.get('repeat', 0)

    alert_def = alert['alert_def']
    alert_id = alert_def['id']

    # Insert the thread key as first query parameter. ``partition`` also copes with a
    # link that has no query string (``split('?')[1]`` raised IndexError for it).
    link_base, _, link_query = webhook_link.partition('?')
    webhook_link = link_base + '?threadKey={}&'.format(alert_id) + link_query

    current_span.set_tag('alert_id', alert_def['id'])

    # The entity is only used for tracing: a missing entity must not abort the notification.
    entity = alert.get('entity') or {}
    is_changed = alert.get('alert_changed', False)
    is_alert = alert.get('is_alert', False)

    current_span.set_tag('entity', entity.get('id'))
    current_span.set_tag('alert_changed', bool(is_changed))
    current_span.set_tag('is_alert', is_alert)

    current_span.log_kv({'room': kwargs.get('room')})

    color = '#0CB307' if alert and not alert.get('is_alert') else kwargs.get('color', '#FF0000')
    logo = 'FLIGHT_ARRIVAL' if alert and not alert.get('is_alert') else kwargs.get('logo', 'FLIGHT_DEPARTURE')
    message_text = cls._get_subject(alert, custom_message=kwargs.get('message'))

    zmon_host = kwargs.get('zmon_host', cls._config.get('zmon.host'))
    alert_url = urlparse.urljoin(zmon_host, '/#/alert-details/{}'.format(alert_id)) if zmon_host else ''

    message = {
        "cards": [
            {
                "sections": [
                    {
                        "widgets": [
                            {
                                "keyValue": {
                                    "content": '<font color="{}">{}!</font>'.format(color, message_text),
                                    "contentMultiline": multiline,
                                    "onClick": {
                                        "openLink": {
                                            "url": "{}".format(alert_url)
                                        }
                                    },
                                    "icon": "{}".format(logo)
                                }
                            }
                        ]
                    }
                ]
            }
        ]
    }

    try:
        logger.info('Sending to: ' + '{}'.format(webhook_link) + ' ' + json.dumps(message))
        r = requests.post('{}'.format(webhook_link), json=message,
                          headers={'Content-type': 'application/json'}, timeout=5)
        r.raise_for_status()
    except Exception:
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Google Hangouts Chat write failed!')

    return repeat
def get_sqs_queues(region, acc, all_entities=None, **kwargs):
    """
    Describe the SQS queues of a region as ``aws_sqs`` entities.

    A queue already present in ``all_entities`` (matched by URL) is reused as-is unless the
    current minute is divisible by 15; otherwise its attributes are queried. A failure on
    one queue is logged and does not stop the discovery of the remaining queues.

    :param region: AWS region name used for the boto3 SQS client.
    :param acc: Infrastructure account identifier used in entity IDs.
    :param all_entities: Previously discovered entities used as cache (optional).
    :return: List of SQS entity dicts (possibly empty).
    :rtype: list
    """
    current_span = extract_span_from_kwargs(**kwargs)

    if all_entities is None:
        all_entities = []
    sqs_queues = []

    try:
        sqs_client = boto3.client('sqs', region_name=region)
        # ``or {}`` guards against a falsy result from call_and_retry
        list_queues_response = call_and_retry(sqs_client.list_queues) or {}
        # Known SQS entities, indexed by queue URL
        existing_entities = {e['url']: e for e in all_entities if e['type'] == 'aws_sqs'}
        for queue_url in list_queues_response.get('QueueUrls', []):
            try:
                existing_entity = existing_entities.get(queue_url, None)
                # Reuse the known entity, except in minutes divisible by 15 where it is rebuilt
                if existing_entity and (datetime.now().minute % 15):
                    sqs_queues.append(existing_entity)
                else:
                    attributes_response = call_and_retry(sqs_client.get_queue_attributes, QueueUrl=queue_url,
                                                         AttributeNames=['All'])
                    attributes = attributes_response['Attributes']
                    queue_arn = attributes['QueueArn']
                    # The queue name is the last of the six colon separated ARN tokens
                    arn_tokens = queue_arn.split(':')
                    if len(arn_tokens) == 6:
                        queue_name = arn_tokens[-1]
                    else:
                        logger.error('Illegal SQS queue ARN: "%s" while processing url %s', queue_arn, queue_url)
                        continue

                    sqs_entity = {
                        'id': entity_id('sqs-{}[{}:{}]'.format(queue_name, acc, region)),
                        'created_by': 'agent',
                        'infrastructure_account': acc,
                        'region': region,
                        'type': 'aws_sqs',
                        'name': queue_name,
                        'url': queue_url,
                        'arn': queue_arn,
                        'message_retention_period_seconds': int(attributes.get('MessageRetentionPeriod', 345600)),
                        'maximum_message_size_bytes': int(attributes.get('MaximumMessageSize', 262144)),
                        'receive_messages_wait_time_seconds': int(attributes.get('ReceiveMessageWaitTimeSeconds', 0)),
                        'delay_seconds': int(attributes.get('DelaySeconds', 0)),
                        'visibility_timeout_seconds': int(attributes.get('VisibilityTimeout', 30))}

                    # The redrive policy attribute is a JSON document; it is optional
                    redrive_policy = json.loads(attributes.get('RedrivePolicy', '{}'))
                    dead_letter_target_arn = redrive_policy.get('deadLetterTargetArn', None)
                    if dead_letter_target_arn:
                        sqs_entity['redrive_policy_dead_letter_target_arn'] = dead_letter_target_arn
                    # NOTE(review): placement outside the target ARN check is reconstructed - confirm
                    max_receive_count = redrive_policy.get('maxReceiveCount', None)
                    if max_receive_count:
                        sqs_entity['redrive_policy_max_receive_count'] = max_receive_count

                    # Queues that use this queue as their dead letter queue
                    dl_sources_response = call_and_retry(sqs_client.list_dead_letter_source_queues,
                                                         QueueUrl=queue_url)
                    dead_letter_source_urls = dl_sources_response.get('queueUrls', None)
                    if dead_letter_source_urls:
                        sqs_entity['redrive_policy_dead_letter_source_urls'] = dead_letter_source_urls

                    sqs_queues.append(sqs_entity)
            except Exception:
                current_span.set_tag('error', True)
                current_span.log_kv({'exception': traceback.format_exc()})
                logger.exception('Failed to obtain details about queue with url="%s"', queue_url)
    except Exception as e:
        # Missing permissions only produce a warning; queue discovery is skipped
        if isinstance(e, ClientError) and e.response['Error']['Code'] == 'AccessDenied':
            logger.warning('Access to AWS SQS denied. Skip queue discovery.')
        else:
            current_span.set_tag('error', True)
            current_span.log_kv({'exception': traceback.format_exc()})
            logger.exception('Failed to list SQS queues.')

    return sqs_queues
def get_limits(region, acc, apps, elbs, entities, **kwargs):
    """
    Assemble the ``aws_limits`` entity for an account and region.

    Values are layered in this order, later ones overriding earlier ones:
    built-in defaults, the first existing ``aws_limits`` entity found in
    ``entities``, usage counted from ``apps`` and ``elbs``, and finally what
    the EC2, RDS, Auto Scaling and IAM clients report. A failing query is
    logged and tagged on the span; values gathered up to that point are kept.

    :param region: AWS region name used for every boto3 client.
    :param acc: Infrastructure account, stored on the entity and embedded in
        its id.
    :param apps: Iterable of dicts; those with ``type`` equal to ``instance``
        are counted, split by their ``spot_instance`` flag.
    :param elbs: Sized collection whose length becomes ``elb-used-count``.
    :param entities: Iterable of known entity dicts, searched for a previous
        ``aws_limits`` entity.
    :param kwargs: Passed to ``extract_span_from_kwargs`` to obtain the span
        that failures are tagged on.
    :return: Entity dict of type ``aws_limits``.
    :rtype: dict
    """
    current_span = extract_span_from_kwargs(**kwargs)

    def tag_span_failure():
        # Call only from inside an ``except`` block so that
        # ``traceback.format_exc()`` sees the active exception.
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': traceback.format_exc()})

    limits = {
        'ec2-max-instances': 20,
        'ec2-max-spot-instances': 20,  # Assume default max-spot-instances
        'elb-max-count': 20,
    }

    # Carry over whatever the first previously stored limits entity holds.
    previous = next((item for item in entities if item.get('type') == 'aws_limits'), None)
    if previous is not None:
        limits.update(previous)

    # Two separate passes over ``apps``, on-demand first, then spot.
    on_demand_count = sum(
        1 for app in apps
        if app['type'] == 'instance' and not app.get('spot_instance', False)
    )
    spot_count = sum(
        1 for app in apps
        if app['type'] == 'instance' and app.get('spot_instance', False)
    )
    limits['ec2-used-instances'] = on_demand_count
    limits['ec2-used-spot-instances'] = spot_count
    limits['elb-used-count'] = len(elbs)

    # Clients are created up front, outside the guarded sections below.
    ec2 = boto3.client('ec2', region_name=region)
    rds = boto3.client('rds', region_name=region)
    asg = boto3.client('autoscaling', region_name=region)
    iam = boto3.client('iam', region_name=region)

    try:
        for attribute in ec2.describe_account_attributes()['AccountAttributes']:
            if attribute['AttributeName'] != 'max-instances':
                continue
            limits['ec2-max-instances'] = int(attribute['AttributeValues'][0]['AttributeValue'])
    except Exception:
        tag_span_failure()
        logger.exception('Failed to query EC2 account attributes!')

    try:
        wanted_quotas = ('ReservedDBInstances', 'AllocatedStorage')
        quotas_by_name = {}
        for quota in rds.describe_account_attributes()['AccountQuotas']:
            quota_name = quota['AccountQuotaName']
            if quota_name in wanted_quotas:
                quotas_by_name[quota_name] = quota

        reserved = quotas_by_name['ReservedDBInstances']
        limits['rds-max-reserved'] = reserved['Max']
        limits['rds-used-reserved'] = reserved['Used']

        allocated = quotas_by_name['AllocatedStorage']
        limits['rds-max-allocated'] = allocated['Max']
        limits['rds-used-allocated'] = allocated['Used']
    except Exception:
        tag_span_failure()
        logger.exception('Failed to query RDS account attributes!')

    try:
        asg_limits = asg.describe_account_limits()
        asg_fields = (
            ('asg-max-groups', 'MaxNumberOfAutoScalingGroups'),
            ('asg-max-launch-configurations', 'MaxNumberOfLaunchConfigurations'),
            ('asg-used-groups', 'NumberOfAutoScalingGroups'),
            ('asg-used-launch-configurations', 'NumberOfLaunchConfigurations'),
        )
        for limit_key, response_key in asg_fields:
            limits[limit_key] = asg_limits[response_key]
    except Exception:
        tag_span_failure()
        logger.exception('Failed to query ASG limits!')

    try:
        iam_limits = iam.get_account_summary()['SummaryMap']
        iam_fields = (
            ('iam-used-server-certificates', 'ServerCertificates'),
            ('iam-max-server-certificates', 'ServerCertificatesQuota'),
            ('iam-used-instance-profiles', 'InstanceProfiles'),
            ('iam-max-instance-profiles', 'InstanceProfilesQuota'),
            ('iam-used-policies', 'Policies'),
            ('iam-max-policies', 'PoliciesQuota'),
        )
        for limit_key, summary_key in iam_fields:
            limits[limit_key] = iam_limits[summary_key]
    except Exception:
        tag_span_failure()
        logger.exception('Failed to query IAM account summary!')

    # Identity fields come first; the collected limits are merged on top.
    entity = {
        'id': entity_id('aws-limits[{}:{}]'.format(acc, region)),
        'type': 'aws_limits',
        'created_by': 'agent',
        'region': region,
        'infrastructure_account': acc,
    }
    entity.update(limits)

    return entity