def server_up(req):
    """
    Health-check view that can be hooked into monitoring tools like pingdom.

    Hit serverup.txt to run every check that is enabled by default.
    Hit serverup.txt?heartbeat to additionally run a check that is not
    enabled by default (the check name is passed as a GET parameter).

    Returns ``HttpResponse("success")`` when no executed check failed or when
    a deploy is in progress. Otherwise a datadog error event is created and a
    500 response listing the problems is returned.
    """
    # (check name, run by default?, check function); the order of this table
    # is the order in which problems are reported.
    service_checks = (
        ("heartbeat", False, checks.check_heartbeat),
        ("celery", True, checks.check_celery),
        ("postgres", True, checks.check_postgres),
        ("couch", True, checks.check_couch),
        ("redis", True, checks.check_redis),
        ("formplayer", True, checks.check_formplayer),
        ("elasticsearch", True, checks.check_elasticsearch),
    )

    any_failed = False
    problems = ['Problems with HQ (%s):' % os.uname()[1]]

    for name, run_by_default, run_check in service_checks:
        if not (run_by_default or req.GET.get(name) is not None):
            continue

        try:
            status = run_check()
        except Exception:
            # Don't display the exception message
            status = checks.ServiceStatus(False, "{} has issues".format(name))

        if status.success:
            continue
        any_failed = True
        problems.append(status.msg)

    # Failures seen while a deploy is in progress are not reported.
    if not any_failed or is_deploy_in_progress():
        return HttpResponse("success")

    create_datadog_event(
        'Serverup check failed',
        '\n'.join(problems),
        alert_type='error',
        aggregation_key='serverup',
    )
    return HttpResponse('<br>'.join(problems), status=500)
def _all_zeros(data):
    """
    Tell whether every KPI in ``data['records']`` has both its ``'value'``
    and its ``'all'`` entry equal to 0.

    ``data['records']`` is iterated as rows of KPI dicts. When the answer is
    True, a datadog event is created before returning.
    """
    zero_flags = []
    for row in data['records']:
        for kpi in row:
            zero_flags.append(kpi['value'] == 0 and kpi['all'] == 0)

    everything_zero = all(zero_flags)
    if everything_zero:
        create_datadog_event(
            'ICDS 0s',
            'All indicators in program summary equals 0',
            aggregation_key='icds_0',
        )
    return everything_zero
def _all_zeros(data, agg_level):
    """
    Decide whether the KPIs in ``data['records']`` look empty.

    A KPI counts as empty when both its ``'value'`` and ``'all'`` entries are
    falsy. For ``agg_level <= 1`` a single empty KPI is enough; for higher
    levels every KPI must be empty. When the result is True a datadog event
    is created before returning.
    """
    empty_flags = []
    for row in data['records']:
        for kpi in row:
            empty_flags.append(not kpi['value'] and not kpi['all'])

    retry = any(empty_flags) if agg_level <= 1 else all(empty_flags)
    if retry:
        create_datadog_event(
            'ICDS 0s',
            'All indicators in program summary equals 0',
            aggregation_key='icds_0',
        )
    return retry
def server_up(req):
    """
    Health-check view that can be hooked into monitoring tools like pingdom.

    Hit serverup.txt to run every check that is enabled by default.
    Hit serverup.txt?heartbeat to additionally run a check that is not
    enabled by default (the check name is passed as a GET parameter).

    Each check function returns a ``(passed, custom_message)`` pair. Returns
    ``HttpResponse("success")`` when nothing failed; otherwise a datadog
    error event is created and a 500 response listing the problems is
    returned.
    """
    # (check name, run by default?, fallback message, check function); the
    # order of this table is the order in which problems are reported.
    service_checks = (
        ("heartbeat", False, "* celery heartbeat is down", hb_check),
        ("celery", True, "* celery is down", celery_check),
        ("postgres", True, "* postgres has issues", pg_check),
        ("couch", True, "* couch has issues", couch_check),
        ("redis", True, "* redis has issues", redis_check),
    )

    any_failed = False
    problems = ['Problems with HQ (%s):' % os.uname()[1]]

    for name, run_by_default, fallback_message, run_check in service_checks:
        if not (run_by_default or req.GET.get(name) is not None):
            continue

        passed, custom_message = run_check()
        if passed:
            continue
        any_failed = True
        # Prefer the message supplied by the check itself, if any.
        problems.append(custom_message or fallback_message)

    if not any_failed:
        return HttpResponse("success")

    create_datadog_event(
        'Serverup check failed',
        '\n'.join(problems),
        alert_type='error',
        aggregation_key='serverup',
    )
    return HttpResponse('<br>'.join(problems), status=500)
def server_up(req):
    """
    Health-check view that can be hooked into monitoring tools like pingdom.

    Hit serverup.txt to run every check that is enabled by default.
    Hit serverup.txt?heartbeat to additionally run the opt-in heartbeat
    check (the check name is passed as a GET parameter).

    Returns ``HttpResponse("success")`` when no executed check failed or when
    a deploy is in progress. Otherwise a datadog error event is created and a
    500 response listing the problems is returned.
    """
    # Insertion order is the order in which problems are reported.
    check_funcs = {
        "heartbeat": checks.check_heartbeat,
        "celery": checks.check_celery,
        "postgres": checks.check_postgres,
        "couch": checks.check_couch,
        "redis": checks.check_redis,
        "formplayer": checks.check_formplayer,
    }
    # Checks that only run when named in the query string.
    opt_in_only = {"heartbeat"}

    problems = ['Problems with HQ (%s):' % os.uname()[1]]
    any_failed = False

    for name, check_func in check_funcs.items():
        if name in opt_in_only and req.GET.get(name) is None:
            continue

        try:
            status = check_func()
        except Exception:
            # Don't display the exception message
            status = checks.ServiceStatus(False, "{} has issues".format(name))

        if not status.success:
            any_failed = True
            problems.append(status.msg)

    # Failures seen while a deploy is in progress are not reported.
    if not any_failed or is_deploy_in_progress():
        return HttpResponse("success")

    create_datadog_event(
        'Serverup check failed',
        '\n'.join(problems),
        alert_type='error',
        aggregation_key='serverup',
    )
    return HttpResponse('<br>'.join(problems), status=500)
def server_up(req):
    """
    Health check view which can be hooked into server monitoring tools like 'pingdom'

    Returns:
        HttpResponse("success", status_code=200)
        HttpResponse(error_message, status_code=500)

    Hit serverup.txt to check all the default enabled services (always_check=True)
    Hit serverup.txt?only={check_name} to only check a specific service
    Hit serverup.txt?{check_name} to include a non-default check

    The duration of every executed check is sent to datadog as a gauge.
    Failures seen while a deploy is in progress still answer "success".
    """
    requested_only = req.GET.get('only')
    if requested_only and requested_only in CHECKS:
        selected = [requested_only]
    else:
        selected = []
        for name, info in CHECKS.items():
            if info['always_check'] or req.GET.get(name) is not None:
                selected.append(name)

    statuses = run_checks(selected)
    failures = [(name, status) for name, status in statuses if not status.success]

    for name, status in statuses:
        outcome = 'ok' if status.success else 'failed'
        datadog_gauge(
            'commcare.serverup.check',
            status.duration,
            tags=['status:{}'.format(outcome), 'check:{}'.format(name)],
        )

    if not failures or is_deploy_in_progress():
        return HttpResponse("success")

    rendered = []
    for name, status in failures:
        line = '<strong>{}</strong>: {}'.format(name, html.escape(status.msg))
        rendered.append(html.linebreaks(line.strip()))

    create_datadog_event(
        'Serverup check failed',
        '\n'.join(rendered),
        alert_type='error',
        aggregation_key='serverup',
    )

    # The host-name header appears only in the HTTP response, not in the event.
    header = 'Failed Checks (%s):' % os.uname()[1]
    return HttpResponse(''.join([header] + rendered), status=500)
def _all_zeros_graph(step, data, agg_level):
    """
    Decide whether a graph payload contains nothing but empty entries.

    For ``step == 'map'`` the per-location mapping is read from
    ``data['data']`` when ``agg_level <= 3`` and from
    ``data['tooltips_data']`` otherwise. The ``'original_name'`` and
    ``'fillKey'`` keys are skipped, and a location counts as empty when any
    of its values is falsy. For every other step, each entry of
    ``data['all_locations']`` counts as empty when its ``'value'`` is falsy.

    Returns True when all inspected entries are empty; in that case a
    datadog event is created first.
    """
    if step == 'map':
        source_key = 'data' if agg_level <= 3 else 'tooltips_data'
        by_location = data[source_key]
        skipped_keys = ('original_name', 'fillKey')
        empty_flags = []
        for key in by_location:
            if key in skipped_keys:
                continue
            empty_flags.append(not all(by_location[key].values()))
    else:
        empty_flags = [not location['value'] for location in data['all_locations']]

    retry = all(empty_flags)
    if retry:
        create_datadog_event(
            'ICDS 0s',
            'All indicators in awc_covered equals 0',
            aggregation_key='icds_0',
        )
    return retry
def server_up(req):
    """
    Health check view which can be hooked into server monitoring tools like 'pingdom'

    Returns:
        HttpResponse("success", status_code=200)
        HttpResponse(error_message, status_code=500)

    Hit serverup.txt to check all the default enabled services (always_check=True)
    Hit serverup.txt?only={check_name} to only check a specific service
    Hit serverup.txt?{check_name} to include a non-default check

    Every executed check reports its duration to datadog as a gauge. When a
    deploy is in progress, failures are not reported and "success" is returned.
    """
    query = req.GET
    single_check = query.get('only')

    if single_check and single_check in CHECKS:
        names_to_run = [single_check]
    else:
        names_to_run = []
        for check_name, check_info in CHECKS.items():
            if check_info['always_check'] or query.get(check_name) is not None:
                names_to_run.append(check_name)

    statuses = run_checks(names_to_run)
    failed_checks = [pair for pair in statuses if not pair[1].success]

    for check_name, status in statuses:
        state = 'ok' if status.success else 'failed'
        tags = ['status:{}'.format(state), 'check:{}'.format(check_name)]
        datadog_gauge('commcare.serverup.check', status.duration, tags=tags)

    if not failed_checks or is_deploy_in_progress():
        return HttpResponse("success")

    status_messages = []
    for check_name, status in failed_checks:
        escaped_msg = html.escape(status.msg)
        summary = '<strong>{}</strong>: {}'.format(check_name, escaped_msg).strip()
        status_messages.append(html.linebreaks(summary))

    create_datadog_event(
        'Serverup check failed',
        '\n'.join(status_messages),
        alert_type='error',
        aggregation_key='serverup',
    )

    # The host name is added after the event so it only shows in the response.
    body = 'Failed Checks (%s):' % os.uname()[1] + ''.join(status_messages)
    return HttpResponse(body, status=500)
def _bust_awc_cache():
    """
    Delete every cache key returned by ``cache.keys('*cas_reach_data*')``.

    A datadog event is created before the deletion starts and another one
    once it has finished.
    """
    event_title = 'redis: delete dashboard keys'

    create_datadog_event(event_title, 'start')
    # NOTE(review): presumably '*cas_reach_data*' is a glob pattern understood
    # by the cache backend -- confirm against the configured cache.
    for stale_key in cache.keys('*cas_reach_data*'):
        cache.delete(stale_key)
    create_datadog_event(event_title, 'finish')
def record_command_event(sender, args, kwargs, outcome, **extra):
    """
    Create a datadog event describing one invocation of ``sender``.

    The event title and aggregation key are both ``sender.__name__``. The
    event text lists the positional args, the options and the outcome. An
    outcome that is an exception is rendered as its class followed by its
    message. Additional keyword arguments are accepted and ignored.
    """
    if isinstance(outcome, BaseException):
        outcome = '{}: {}'.format(outcome.__class__, outcome)

    text = '\n'.join([
        f'args: {args}',
        f'options: {kwargs}',
        f'outcome: {outcome}',
    ])

    command_name = sender.__name__
    create_datadog_event(command_name, text, aggregation_key=command_name)