def monitor_by_email(self, users):
    """Deliver the current monitor event to ``users`` by e-mail.

    The body lists the alarm title, target, event time and description;
    when ``self.event == '2'`` the fault duration is appended as well.

    Delivery channel, tried in this order:

    1. the ``mail_service`` application setting, if it names a ``server``;
    2. the Spug mail endpoint, if ``self.spug_key`` is set;
    3. otherwise a monitor notification asking for credentials is recorded.

    :param users: recipients; handed to ``Mail.send_text_mail`` as-is and
        sent to the Spug endpoint as a list.
    """
    mail_service = AppSetting.get_default('mail_service', {})

    lines = [
        f'告警名称:{self.title}',
        f'告警对象:{self.target}',
    ]
    stage = '告警' if self.event == '1' else '恢复'
    lines.append(f'{stage}时间:{human_datetime()}')
    lines.append(f'告警描述:{self.message}')
    if self.event == '2':
        # Plain concatenation: ``self.duration`` must already be a string.
        lines.append('故障持续:' + self.duration)

    if mail_service.get('server'):
        prefixes = {'1': '监控告警通知', '2': '告警恢复通知'}
        mail_subject = f'{prefixes[self.event]}-{self.title}'
        mailer = Mail(**mail_service)
        mailer.send_text_mail(
            users,
            mail_subject,
            '\r\n'.join(lines) + '\r\n\r\n自动发送,请勿回复。',
        )
        return

    if self.spug_key:
        payload = {
            'token': self.spug_key,
            'event': self.event,
            'subject': self.title,
            'body': '\r\n'.join(lines),
            'users': list(users),
        }
        self.handle_request(f'{spug_server}/apis/notify/mail/', payload, 'spug')
        return

    Notify.make_monitor_notify(
        '发送报警信息失败',
        '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。',
    )
def notify_by_dd(event, subject, grp):
    """Post an alarm / recovery markdown message to the group's DingTalk hooks.

    The recipients are the distinct non-null ``ding`` values of the contacts
    resolved from ``grp`` via ``_parse_args``.  When there are none, a
    failure notification is recorded instead.

    :param event: ``'1'`` for an alarm; any other value is rendered as a
        recovery.
    :param subject: alarm name shown in the message.
    :param grp: contact-group ids, passed to ``_parse_args``.
    """
    _, u_ids = _parse_args(grp)
    users = {
        x.ding
        for x in Contact.objects.filter(id__in=u_ids, ding__isnull=False)
    }
    if not users:
        Notify.make_notify(
            notify_source, '1', '发送报警信息失败',
            '未找到可用的通知对象,请确保设置了相关报警联系人的钉钉。')
        return

    is_alarm = event == '1'
    # Resolve the event-dependent fragments BEFORE formatting.  Written
    # inline as `'fmt' % a if cond else b`, `%` binds tighter than the
    # conditional expression, so the recovery branch used to yield the bare
    # text without the markdown heading / label around it.
    heading = '监控告警通知' if is_alarm else '告警恢复通知'
    color = 'f90202' if is_alarm else '8ece60'
    detail = '请在运维平台监控中心查看详情' if is_alarm else '告警已恢复'

    texts = [
        '## %s ## ' % heading,
        f'**告警名称:** <font color="#{color}">{subject}</font> ',
        f'**告警时间:** {human_datetime()} ',
        '**告警描述:** %s ' % detail,
        '> ###### 来自 Spug运维平台',
    ]
    data = {
        'msgtype': 'markdown',
        'markdown': {
            'title': '监控告警通知',
            'text': '\n\n'.join(texts),
        },
    }
    for url in users:
        requests.post(url, json=data)
def notify_by_qy_wx(event, obj):
    """Post an alarm / recovery markdown message to WeCom (企业微信) hooks.

    Recipients are the distinct non-null ``qy_wx`` values of the contacts
    resolved from ``obj.grp``.  Without any recipient a failure notification
    is recorded and nothing is sent.

    :param event: ``'1'`` for an alarm, otherwise treated as recovery;
        ``'2'`` additionally appends ``obj.duration``.
    :param obj: object providing ``grp``, ``name``, ``out`` and ``duration``.
    """
    _, u_ids = _parse_args(obj.grp)
    hooks = {
        contact.qy_wx
        for contact in Contact.objects.filter(id__in=u_ids, qy_wx__isnull=False)
    }
    if not hooks:
        Notify.make_notify(
            notify_source, '1', '发送报警信息失败',
            '未找到可用的通知对象,请确保设置了相关报警联系人的企业微信。')
        return

    if event == '1':
        color, title = 'warning', '监控告警通知'
    else:
        color, title = 'info', '告警恢复通知'

    lines = [
        f'## {title}',
        f'**告警名称:** <font color="{color}">{obj.name}</font> ',
        f'**告警时间:** {human_datetime()} ',
        f'**告警描述:** {obj.out} ',
    ]
    if event == '2':
        lines.append(f'**持续时间:** {obj.duration} ')

    payload = {
        'msgtype': 'markdown',
        'markdown': {'content': '\n'.join(lines) + '\n> 来自 Spug运维平台'},
    }
    for hook in hooks:
        requests.post(hook, json=payload)
def notify_by_email(event, subject, grp):
    """E-mail an alarm / recovery subject line to the contacts of ``grp``.

    Recipients are the distinct non-null ``email`` values of the resolved
    contacts.  Delivery prefers the configured ``mail_service`` (when it has
    a ``server``), then the Spug mail endpoint (when a Spug key exists);
    each missing prerequisite is reported through ``Notify.make_notify``.

    :param event: ``'1'`` or ``'2'``; used as the key of the subject prefix
        map and forwarded to the Spug endpoint.
    :param subject: alarm subject text.
    :param grp: contact-group ids, passed to ``_parse_args``.
    """
    spug_key, u_ids = _parse_args(grp)
    recipients = {
        contact.email
        for contact in Contact.objects.filter(id__in=u_ids, email__isnull=False)
    }
    if not recipients:
        Notify.make_notify(
            notify_source, '1', '发送报警信息失败',
            '未找到可用的通知对象,请确保设置了相关报警联系人的邮件地址。')
        return

    mail_service = json.loads(AppSetting.get_default('mail_service', '{}'))
    if mail_service.get('server'):
        prefixes = {'1': '告警', '2': '恢复'}
        # The prefixed subject doubles as the first line of the mail body.
        mail_subject = f'{prefixes[event]}-{subject}'
        mailer = Mail(**mail_service)
        mailer.send_text_mail(
            recipients, mail_subject, f'{mail_subject}\r\n\r\n自动发送,请勿回复。')
        return

    if spug_key:
        payload = {
            'token': spug_key,
            'event': event,
            'subject': subject,
            'users': list(recipients),
        }
        requests.post(f'{spug_server}/apis/notify/mail/', json=payload)
        return

    Notify.make_notify(
        notify_source, '1', '发送报警信息失败',
        '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。')
def notify_by_email(event, obj):
    """E-mail the alarm / recovery details of ``obj`` to its contact groups.

    The body carries the alarm name, the current time and ``obj.out``;
    for ``event == '2'`` the fault duration is appended.  Delivery prefers
    the configured ``mail_service`` (when it has a ``server``), then the
    Spug mail endpoint (when a Spug key exists); each missing prerequisite
    is reported through ``Notify.make_notify``.

    :param event: ``'1'`` or ``'2'``; key of the subject prefix map.
    :param obj: object providing ``grp``, ``name``, ``out`` and ``duration``.
    """
    spug_key, u_ids = _parse_args(obj.grp)
    recipients = {
        contact.email
        for contact in Contact.objects.filter(id__in=u_ids, email__isnull=False)
    }
    if not recipients:
        Notify.make_notify(
            notify_source, '1', '发送报警信息失败',
            '未找到可用的通知对象,请确保设置了相关报警联系人的邮件地址。')
        return

    mail_service = json.loads(AppSetting.get_default('mail_service', '{}'))

    # Plain concatenation on purpose: the fields must already be strings.
    lines = ['告警名称:' + obj.name]
    lines.append('告警时间:' + human_datetime())
    lines.append('告警描述:' + obj.out)
    if event == '2':
        lines.append('故障持续:' + obj.duration)

    if mail_service.get('server'):
        prefixes = {'1': '告警发生', '2': '告警恢复'}
        mail_subject = f'{prefixes[event]}-{obj.name}'
        mailer = Mail(**mail_service)
        mailer.send_text_mail(
            recipients, mail_subject,
            '\r\n'.join(lines) + '\r\n\r\n自动发送,请勿回复。')
    elif spug_key:
        payload = {
            'token': spug_key,
            'event': event,
            'subject': obj.name,
            'body': '\r\n'.join(lines),
            'users': list(recipients),
        }
        requests.post(f'{spug_server}/apis/notify/mail/', json=payload)
    else:
        Notify.make_notify(
            notify_source, '1', '发送报警信息失败',
            '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。')
def notify_by_dd(event, obj):
    """Post the alarm / recovery details of ``obj`` to DingTalk hooks.

    Recipients are the distinct non-null ``ding`` values of the contacts
    resolved from ``obj.grp``; with none, a failure notification is recorded.

    :param event: ``'1'`` for an alarm, otherwise rendered as recovery;
        ``'2'`` additionally appends ``obj.duration``.
    :param obj: object providing ``grp``, ``name``, ``out`` and ``duration``.
    """
    _, u_ids = _parse_args(obj.grp)
    hooks = {
        contact.ding
        for contact in Contact.objects.filter(id__in=u_ids, ding__isnull=False)
    }
    if not hooks:
        Notify.make_notify(
            notify_source, '1', '发送报警信息失败',
            '未找到可用的通知对象,请确保设置了相关报警联系人的钉钉。')
        return

    if event == '1':
        heading, color = '监控告警通知', 'f90202'
    else:
        heading, color = '告警恢复通知', '8ece60'

    lines = [
        '## %s ## ' % heading,
        f'**告警名称:** <font color="#{color}">{obj.name}</font> ',
        f'**告警时间:** {human_datetime()} ',
        f'**告警描述:** {obj.out} ',
    ]
    if event == '2':
        lines.append(f'**持续时间:** {obj.duration} ')

    markdown = {
        'title': '监控告警通知',
        'text': '\n\n'.join(lines) + '\n\n> ###### 来自 Spug运维平台',
    }
    payload = {'msgtype': 'markdown', 'markdown': markdown}
    for hook in hooks:
        requests.post(hook, json=payload)
def _handle_event(self, event):
    """React to a scheduler event for a scheduled ``Task``.

    * scheduler shutdown: log it and record a ``schedule`` notification;
    * max-instances / job error: log it and call ``send_fail_notify``;
    * job executed with a truthy ``retval``: store a ``History`` row, point
      the task's ``latest`` at it and call ``send_fail_notify`` when at
      least one result item has a truthy second element.

    :param event: scheduler event exposing ``code`` and, depending on the
        code, ``job_id``, ``exception``, ``retval``, ``scheduled_run_time``.
    """
    close_old_connections()
    # Lazy: the task row is only fetched if a branch actually touches it.
    obj = SimpleLazyObject(lambda: Task.objects.filter(pk=event.job_id).first())

    if event.code == EVENT_SCHEDULER_SHUTDOWN:
        logger.info(f'EVENT_SCHEDULER_SHUTDOWN: {event}')
        Notify.make_notify('schedule', '1', '调度器已关闭', '调度器意外关闭,你可以在github上提交issue')
        return

    if event.code == EVENT_JOB_MAX_INSTANCES:
        logger.info(f'EVENT_JOB_MAX_INSTANCES: {event}')
        send_fail_notify(obj, '达到调度实例上限,一般为上个周期的执行任务还未结束,请增加调度间隔或减少任务执行耗时')
        return

    if event.code == EVENT_JOB_ERROR:
        logger.info(f'EVENT_JOB_ERROR: job_id {event.job_id} exception: {event.exception}')
        send_fail_notify(obj, f'执行异常:{event.exception}')
        return

    if event.code != EVENT_JOB_EXECUTED or not event.retval:
        return

    # Number of result items whose second element is truthy.
    failed = sum(1 for item in event.retval if item[1])
    if failed == len(event.retval):
        status = 2
    elif failed:
        status = 1
    else:
        status = 0

    record = History.objects.create(
        task_id=event.job_id,
        status=status,
        run_time=human_datetime(event.scheduled_run_time),
        output=json.dumps(event.retval),
    )
    Task.objects.filter(pk=event.job_id).update(latest=record)
    if failed != 0:
        send_fail_notify(obj)
def _parse_args(grp):
    """Return ``(spug_key, contact_ids)`` for the given contact groups.

    ``contact_ids`` is the concatenation of the JSON-decoded ``contacts``
    list of every ``Group`` whose id is in ``grp``.  When no ``spug_key``
    setting exists, a failure notification is recorded and
    ``(None, None)`` is returned instead.

    :param grp: iterable of ``Group`` ids.
    """
    spug_key = AppSetting.get_default('spug_key')
    if spug_key:
        contact_ids = []
        for group in Group.objects.filter(id__in=grp):
            # List concatenation (not ``extend``) so a non-list ``contacts``
            # value still fails loudly.
            contact_ids = contact_ids + json.loads(group.contacts)
        return spug_key, contact_ids

    Notify.make_notify(
        notify_source, '1', '发送报警信息失败',
        '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。')
    return None, None
def monitor_by_wx(self, users):
    """Send the current monitor event to ``users`` via the Spug wx endpoint.

    Requires ``self.spug_key``; without it a monitor notification asking
    for credentials is recorded and nothing is sent.

    :param users: recipients, sent to the endpoint as a list.
    """
    if self.spug_key:
        payload = {
            'token': self.spug_key,
            'event': self.event,
            'subject': f'{self.title} >> {self.target}',
            'desc': self.message,
        }
        # Only recovery events (``'2'``) carry a duration remark.
        if self.event == '2':
            payload['remark'] = f'故障持续{self.duration}'
        else:
            payload['remark'] = None
        payload['users'] = list(users)
        self.handle_request(f'{spug_server}/apis/notify/wx/', payload, 'spug')
    else:
        Notify.make_monitor_notify(
            '发送报警信息失败',
            '未配置报警服务调用凭据,请在系统管理/系统设置/基本设置/调用凭据中配置。')
def send_deploy_notify(cls, req, action=None):
    """Send the deploy-result notification configured on ``req.deploy``.

    ``req.deploy.rst_notify`` is a JSON object with ``mode`` and ``value``
    (the target URL).  Nothing is sent when ``mode`` is ``'0'`` or ``value``
    is empty.  Payload by mode:

    * ``'1'`` - built by ``cls._make_dd_notify``;
    * ``'2'`` - a plain JSON dict describing the request;
    * ``'3'`` - built by ``cls._make_wx_notify``.

    A non-200 response, or (modes ``'1'``/``'3'``) a response whose
    ``errcode`` is not ``0``, is recorded through ``Notify.make_notify``.

    :param req: deploy request; ``host_ids`` and ``extra`` are JSON strings.
    :param action: forwarded into the payload unchanged.
    :raises NotImplementedError: for any other non-``'0'`` mode.
    """
    rst_notify = json.loads(req.deploy.rst_notify)
    host_ids = json.loads(req.host_ids)
    if rst_notify['mode'] == '0' or not rst_notify.get('value'):
        return

    extra = json.loads(req.extra)
    if req.deploy.extend == '1':
        mode, extra1, extra2 = extra
        if mode == 'branch':
            # Branch name plus the first six characters of the third field.
            version = f'{extra1}#{extra2[:6]}'
        else:
            version = extra1
    else:
        version = extra[0] or ''

    hosts = [
        {'id': x.id, 'name': x.name}
        for x in Host.objects.filter(id__in=host_ids)
    ]
    # Show at most two host names, then summarise the total.
    host_str = ', '.join(x['name'] for x in hosts[:2])
    if len(hosts) > 2:
        host_str += f'等{len(hosts)}台主机'

    if rst_notify['mode'] == '1':
        data = cls._make_dd_notify(action, req, version, host_str)
    elif rst_notify['mode'] == '2':
        data = {
            'action': action,
            'req_id': req.id,
            'req_name': req.name,
            'app_id': req.deploy.app_id,
            'app_name': req.deploy.app.name,
            'env_id': req.deploy.env_id,
            'env_name': req.deploy.env.name,
            'status': req.status,
            'reason': req.reason,
            'version': version,
            'targets': hosts,
            'is_success': req.status == '3',
            'created_at': human_datetime(),
        }
    elif rst_notify['mode'] == '3':
        data = cls._make_wx_notify(action, req, version, host_str)
    else:
        raise NotImplementedError

    res = requests.post(rst_notify['value'], json=data)
    if res.status_code != 200:
        Notify.make_notify(
            'flag', '1', '发布通知发送失败',
            f'返回状态码:{res.status_code}, 请求URL:{res.url}')
        # The failure is already recorded.  An error response often has no
        # JSON body, so do not go on to decode it (that could raise, or
        # record a second notification for the same failure).
        return

    if rst_notify['mode'] in ['1', '3']:
        res = res.json()
        if res.get('errcode') != 0:
            Notify.make_notify('flag', '1', '发布通知发送失败', f'返回数据:{res}')
def dispatch_monitor(self, modes):
    """Fan the current monitor event out to every channel in ``modes``.

    ``self.u_ids`` is first set to the concatenated, JSON-decoded
    ``contacts`` of the groups in ``self.grp``.  Then, per mode, the distinct
    non-null values of the matching ``Contact`` field are collected and
    passed to the channel's sender method:

    ========  =============  ======================
    mode      Contact field  sender
    ========  =============  ======================
    ``'1'``   ``wx_token``   ``monitor_by_wx``
    ``'3'``   ``ding``       ``monitor_by_dd``
    ``'4'``   ``email``      ``monitor_by_email``
    ``'5'``   ``qy_wx``      ``monitor_by_qy_wx``
    ========  =============  ======================

    A channel without recipients records a monitor notification instead;
    modes not listed above are ignored.

    :param modes: iterable of mode strings.
    """
    channels = {
        '1': ('wx_token', 'monitor_by_wx',
              '未找到可用的通知对象,请确保设置了相关报警联系人的微信Token。'),
        '3': ('ding', 'monitor_by_dd',
              '未找到可用的通知对象,请确保设置了相关报警联系人的钉钉。'),
        '4': ('email', 'monitor_by_email',
              '未找到可用的通知对象,请确保设置了相关报警联系人的邮件地址。'),
        '5': ('qy_wx', 'monitor_by_qy_wx',
              '未找到可用的通知对象,请确保设置了相关报警联系人的企业微信。'),
    }

    contact_ids = []
    for group in Group.objects.filter(id__in=self.grp):
        contact_ids = contact_ids + json.loads(group.contacts)
    self.u_ids = contact_ids

    for mode in modes:
        channel = channels.get(mode)
        if channel is None:
            continue
        field, sender_name, missing_hint = channel
        rows = Contact.objects.filter(
            id__in=self.u_ids, **{f'{field}__isnull': False})
        users = {getattr(row, field) for row in rows}
        if users:
            # Looked up per use, so only the sender actually needed is touched.
            getattr(self, sender_name)(users)
        else:
            Notify.make_monitor_notify('发送报警信息失败', missing_hint)
def _handle_event(self, event):
    """React to a scheduler event for a scheduled ``Task``.

    * scheduler shutdown, max-instances and job-error events are logged and
      recorded as ``schedule`` notifications;
    * an executed job with a truthy ``retval`` updates the task's
      ``latest_*`` columns and, when at least one result item has a truthy
      second element, records a failure notification.  That notification is
      sent only if more than 3600 seconds have passed since the timestamp
      stored for the job in the module-level ``counter`` mapping.

    :param event: scheduler event exposing ``code`` and, depending on the
        code, ``job_id``, ``exception``, ``retval``, ``scheduled_run_time``.
    """
    # Lazy: the task row is only fetched if a branch reads ``obj.name``.
    obj = SimpleLazyObject(
        lambda: Task.objects.filter(pk=event.job_id).first())

    if event.code == events.EVENT_SCHEDULER_SHUTDOWN:
        logger.info(f'EVENT_SCHEDULER_SHUTDOWN: {event}')
        Notify.make_notify('schedule', '1', '调度器已关闭', '调度器意外关闭,你可以在github上提交issue')
        return

    if event.code == events.EVENT_JOB_MAX_INSTANCES:
        logger.info(f'EVENT_JOB_MAX_INSTANCES: {event}')
        Notify.make_notify(
            'schedule', '1', f'{obj.name} - 达到调度实例上限',
            '一般为上个周期的执行任务还未结束,请增加调度间隔或减少任务执行耗时')
        return

    if event.code == events.EVENT_JOB_ERROR:
        logger.info(f'EVENT_JOB_ERROR: job_id {event.job_id} exception: {event.exception}')
        Notify.make_notify('schedule', '1', f'{obj.name} - 执行异常', f'{event.exception}')
        return

    if event.code != events.EVENT_JOB_EXECUTED or not event.retval:
        return

    # Number of result items whose second element is truthy.
    failed = sum(1 for item in event.retval if item[1])
    if failed == len(event.retval):
        status = 2
    elif failed:
        status = 1
    else:
        status = 0

    Task.objects.filter(pk=event.job_id).update(
        latest_status=status,
        latest_run_time=human_datetime(event.scheduled_run_time),
        latest_output=json.dumps(event.retval),
    )

    if failed == 0:
        return
    if time.time() - counter.get(event.job_id, 0) > 3600:
        counter[event.job_id] = time.time()
        Notify.make_notify('schedule', '1', f'{obj.name} - 执行失败', '请在任务计划中查看失败详情')
def _do_notify(task, mode, url, msg):
    """POST a "task execution failed" notification to ``url``.

    Payload by ``mode``:

    * ``'1'`` - DingTalk-style markdown (``title`` / ``text``);
    * ``'2'`` - plain JSON webhook describing the task;
    * ``'3'`` - WeCom-style markdown (``content``).

    A non-200 response, or (modes ``'1'``/``'3'``) a response whose
    ``errcode`` is not ``0``, is recorded as a ``schedule`` notification.

    NOTE(review): the webhook and WeCom branches were previously labelled
    ``'3'`` and ``'2'``, which contradicted this function's own ``errcode``
    check on ``['1', '3']`` (a generic webhook has no ``errcode`` contract)
    and the ``'2'`` = webhook / ``'3'`` = WeCom mapping used by
    ``send_deploy_notify``.  Confirm against the stored ``rst_notify`` values.

    :param task: object providing ``id``, ``name`` and ``type``.
    :param mode: notification mode, see above.
    :param url: target URL.
    :param msg: failure description; a default hint is used when falsy.
    :raises NotImplementedError: for an unknown ``mode`` (previously this
        surfaced as an ``UnboundLocalError`` on ``res``).
    """
    detail = msg or '请在任务计划执行历史中查看详情'

    if mode == '1':
        texts = [
            '## <font color="#f90202">任务执行失败通知</font> ## ',
            f'**任务名称:** {task.name} ',
            # Was ``task.types``; the other branches read ``task.type``.
            f'**任务类型:** {task.type} ',
            f'**描述信息:** {detail} ',
            f'**发生时间:** {human_datetime()} ',
        ]
        data = {
            'msgtype': 'markdown',
            'markdown': {
                'title': '任务执行失败通知',
                'text': '\n\n'.join(texts),
            },
        }
    elif mode == '2':
        data = {
            'task_id': task.id,
            'task_name': task.name,
            'task_type': task.type,
            'message': detail,
            'created_at': human_datetime(),
        }
    elif mode == '3':
        texts = [
            '## <font color="warning">任务执行失败通知</font>',
            f'任务名称: {task.name}',
            f'任务类型: {task.type}',
            f'描述信息: {detail}',
            f'发生时间: {human_datetime()}',
        ]
        data = {
            'msgtype': 'markdown',
            'markdown': {'content': '\n'.join(texts)},
        }
    else:
        raise NotImplementedError(f'unsupported notify mode: {mode!r}')

    res = requests.post(url, json=data)
    if res.status_code != 200:
        Notify.make_notify(
            'schedule', '1', '任务执行通知发送失败',
            f'返回状态码:{res.status_code}, 请求URL:{url}')
        # Already recorded; an error response may not carry a JSON body.
        return

    if mode in ['1', '3']:
        res = res.json()
        if res.get('errcode') != 0:
            Notify.make_notify('schedule', '1', '任务执行通知发送失败', f'返回数据:{res}')
def notify_by_wx(event, subject, n_grp):
    """Send an alarm / recovery subject through the Spug wx endpoint.

    Requires a Spug key (from ``_parse_args``) and at least one contact with
    a non-null ``wx_token``; each missing prerequisite is reported through
    ``Notify.make_notify`` and nothing is sent.

    :param event: event code forwarded to the endpoint.
    :param subject: alarm subject text.
    :param n_grp: contact-group ids, passed to ``_parse_args``.
    """
    spug_key, u_ids = _parse_args(n_grp)
    if not spug_key:
        Notify.make_notify(
            notify_source, '1', '发送报警信息失败',
            '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。')
        return

    tokens = {
        contact.wx_token
        for contact in Contact.objects.filter(id__in=u_ids, wx_token__isnull=False)
    }
    if not tokens:
        Notify.make_notify(
            notify_source, '1', '发送报警信息失败',
            '未找到可用的通知对象,请确保设置了相关报警联系人的微信Token。')
        return

    payload = {
        'token': spug_key,
        'event': event,
        'subject': subject,
        'users': list(tokens),
    }
    requests.post(f'{spug_server}/apis/notify/wx/', json=payload)
def queue_monitor(self):
    """Poll the executor's work queue forever and warn when it backs up.

    Each round sleeps ``(n or 1) ** 3 * 10`` seconds, where ``n`` is the
    number of consecutive rounds that found a non-empty queue (so 10 s while
    idle, then 10 s, 80 s, 270 s, ...).  Every non-empty round logs a
    warning; from the second consecutive one on, a system notification is
    recorded as well.  An empty queue resets the count.  Never returns.
    """
    busy_rounds = 0
    while True:
        time.sleep((busy_rounds or 1) ** 3 * 10)
        backlog = self._executor._work_queue.qsize()
        if backlog <= 0:
            busy_rounds = 0
            continue

        if busy_rounds > 0:
            hint = '请检查监控、任务计划或批量执行等避免长耗时任务,必要时可重启服务清空队列。'
            try:
                Notify.make_system_notify(f'执行队列堆积({backlog})', hint)
            except Exception as e:
                # Best effort: a failed notification must not stop the loop.
                logging.warning(e)
            finally:
                connections.close_all()

        logging.warning(f'!!! 执行队列堆积({backlog})')
        busy_rounds += 1
def _handle_event(self, event):
    """React to a scheduler event for a monitoring ``Detection``.

    * scheduler shutdown, max-instances and job-error events are logged and
      recorded as ``monitor`` notifications;
    * an executed job refreshes the detection's status bookkeeping, saves it
      and hands it to ``self._handle_notify`` with the previous status.

    Status bookkeeping on execution: ``latest_status`` becomes ``0`` for a
    truthy ``retval`` and ``1`` otherwise.  Status ``0`` resets
    ``latest_notify_time`` and ``fault_times``; status ``1`` increments
    ``fault_times``.  ``latest_fault_time`` is stamped only when the previous
    status was ``0``/``None`` and ``retval`` is exactly ``False``.

    :param event: scheduler event exposing ``code`` and, depending on the
        code, ``job_id``, ``exception``, ``retval``, ``scheduled_run_time``.
    """
    close_old_connections()
    # Lazy: only fetched if one of the notification titles reads ``name``.
    obj = SimpleLazyObject(lambda: Detection.objects.filter(pk=event.job_id).first())

    if event.code == EVENT_SCHEDULER_SHUTDOWN:
        logger.info(f'EVENT_SCHEDULER_SHUTDOWN: {event}')
        Notify.make_notify('monitor', '1', '调度器已关闭', '调度器意外关闭,你可以在github上提交issue', False)
        return

    if event.code == EVENT_JOB_MAX_INSTANCES:
        logger.info(f'EVENT_JOB_MAX_INSTANCES: {event}')
        Notify.make_notify(
            'monitor', '1', f'{obj.name} - 达到调度实例上限',
            '一般为上个周期的执行任务还未结束,请增加调度间隔或减少任务执行耗时')
        return

    if event.code == EVENT_JOB_ERROR:
        logger.info(f'EVENT_JOB_ERROR: job_id {event.job_id} exception: {event.exception}')
        Notify.make_notify('monitor', '1', f'{obj.name} - 执行异常', f'{event.exception}')
        return

    if event.code != EVENT_JOB_EXECUTED:
        return

    detection = Detection.objects.filter(pk=event.job_id).first()
    previous = detection.latest_status
    current = 0 if event.retval else 1

    detection.latest_status = current
    detection.latest_run_time = human_datetime(event.scheduled_run_time)
    # Strict ``is False``: a falsy but non-False retval does not restamp.
    if previous in [0, None] and event.retval is False:
        detection.latest_fault_time = int(time.time())

    if current == 0:
        detection.latest_notify_time = 0
        detection.fault_times = 0
    else:
        detection.fault_times += 1

    detection.save()
    self._handle_notify(detection, previous)
def notify_by_wx(event, obj):
    """Send the alarm / recovery details of ``obj`` via the Spug wx endpoint.

    Requires a Spug key (from ``_parse_args``) and at least one contact with
    a non-null ``wx_token``; each missing prerequisite is reported through
    ``Notify.make_notify``.  The endpoint's response is checked by
    ``_handle_response`` in ``'spug'`` mode.

    :param event: event code; ``'2'`` adds a fault-duration remark.
    :param obj: object providing ``grp``, ``name``, ``out`` and ``duration``.
    """
    spug_key, u_ids = _parse_args(obj.grp)
    if not spug_key:
        Notify.make_notify(
            notify_source, '1', '发送报警信息失败',
            '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。')
        return

    tokens = {
        contact.wx_token
        for contact in Contact.objects.filter(id__in=u_ids, wx_token__isnull=False)
    }
    if not tokens:
        Notify.make_notify(
            notify_source, '1', '发送报警信息失败',
            '未找到可用的通知对象,请确保设置了相关报警联系人的微信Token。')
        return

    payload = {
        'token': spug_key,
        'event': event,
        'subject': obj.name,
        'desc': obj.out,
    }
    if event == '2':
        payload['remark'] = f'故障持续{obj.duration}'
    else:
        payload['remark'] = None
    payload['users'] = list(tokens)

    response = requests.post(f'{spug_server}/apis/notify/wx/', json=payload)
    _handle_response(response, 'spug')
def handle_request(url, data, mode=None):
    """POST ``data`` as JSON to ``url`` and record a notification on failure.

    Failure means: the request raised, the status code is not 200, or the
    decoded body does not carry the success marker of ``mode``:

    * ``'dd'`` / ``'wx'`` - ``errcode == 0``;
    * ``'spug'`` - a falsy ``error``;
    * ``'fs'`` - ``StatusCode == 0``.

    :param url: target URL.
    :param data: JSON-serialisable payload.
    :param mode: response dialect, see above.
    :returns: whatever ``Notify.make_system_notify`` returns for a request
        exception or a non-200 status; ``None`` otherwise.
    :raises NotImplementedError: for any other ``mode`` (including the
        default ``None``) once a 200 response has been received.
    """
    try:
        response = requests.post(url, json=data, timeout=30)
    except Exception as e:
        return Notify.make_system_notify('通知发送失败', f'接口调用异常: {e}')

    if response.status_code != 200:
        return Notify.make_system_notify(
            '通知发送失败',
            f'返回状态码:{response.status_code}, 请求URL:{response.url}')

    if mode in ('dd', 'wx'):
        body = response.json()
        succeeded = body.get('errcode') == 0
    elif mode == 'spug':
        body = response.json()
        succeeded = not body.get('error')
    elif mode == 'fs':
        body = response.json()
        succeeded = body.get('StatusCode') == 0
    else:
        raise NotImplementedError

    if not succeeded:
        Notify.make_system_notify('通知发送失败', f'返回数据:{body}')
def _handle_response(res, mode):
    """Record a notification when an alarm delivery response signals failure.

    Failure means a non-200 status code, or a decoded body that reports an
    error in the dialect of ``mode``:

    * ``'dd'`` / ``'wx'`` - ``errcode`` other than ``0``;
    * ``'spug'`` - a truthy ``error``.

    For any other ``mode`` only the status code is checked.

    :param res: response object exposing ``status_code``, ``url``, ``json()``.
    :param mode: response dialect, see above.
    """
    if res.status_code != 200:
        Notify.make_notify(
            notify_source, '1', '告警通知发送失败',
            f'返回状态码:{res.status_code}, 请求URL:{res.url}')
        # The failure is already recorded.  An error response often has no
        # JSON body, so decoding it could raise or record the same failure
        # a second time.
        return

    if mode in ['dd', 'wx']:
        body = res.json()
        if body.get('errcode') != 0:
            Notify.make_notify(notify_source, '1', '告警通知发送失败', f'返回数据:{body}')
    elif mode == 'spug':
        body = res.json()
        if body.get('error'):
            Notify.make_notify(notify_source, '1', '告警通知发送失败', f'错误信息:{body}')