def reprocess_archive_stubs():
    """Retry unfinished form archive/unarchive actions.

    Loads every ``UnfinishedArchiveStub`` with fewer than three attempts,
    reports how many there are, and re-runs each one. The task stops as soon
    as four minutes have elapsed since it started. A failure on one stub is
    reported through ``notify_exception`` and does not stop the others.
    """
    from corehq.form_processor.interfaces.dbaccessors import FormAccessors
    from couchforms.models import UnfinishedArchiveStub

    pending_stubs = UnfinishedArchiveStub.objects.filter(attempts__lt=3)
    datadog_gauge('commcare.unfinished_archive_stubs', len(pending_stubs))

    started_at = time.time()
    deadline = started_at + timedelta(minutes=4).total_seconds()

    for stub in pending_stubs:
        # Keep the task short: give up once the four minute budget is spent.
        if time.time() > deadline:
            return
        try:
            xform = FormAccessors(stub.domain).get_form(form_id=stub.xform_id)
            if stub.history_updated:
                # History was already written the first time around, so only
                # the kafka publish needs to be repeated.
                FormAccessors.publish_archive_action_to_kafka(
                    xform, stub.user_id, stub.archive)
            else:
                # History was never written: run the whole action again.
                FormAccessors.do_archive(
                    xform, stub.archive, stub.user_id, trigger_signals=True)
        except Exception:
            # Errors should not prevent processing other stubs
            notify_exception(None, "Error processing UnfinishedArchiveStub")
def __record_change_metric_in_datadog(self, metric, change, processor=None,
                                      processing_time=None,
                                      add_case_type_tag=False):
    """Emit the datadog metrics for a single processed change.

    Nothing is recorded when ``change.metadata`` is ``None``.

    :param metric: name of the counter to increment.
    :param change: the change; its ``metadata`` and ``topic`` are read.
    :param processor: processor the metric is attributed to; when falsy the
        ``processor`` tag is ``all_processors``.
    :param processing_time: when truthy, also recorded as a histogram.
    :param add_case_type_tag: when true (and ``settings.ENTERPRISE_MODE`` is
        set and the document type is ``CommCareCase``) the counter gets an
        extra ``case_type`` tag.
    """
    metadata = change.metadata
    if metadata is None:
        return

    processor_name = processor.__class__.__name__ if processor else "all_processors"
    common_tags = [
        'datasource:{}'.format(metadata.data_source_name),
        'is_deletion:{}'.format(metadata.is_deletion),
        'pillow_name:{}'.format(self.get_name()),
        'processor:{}'.format(processor_name),
    ]

    # The case type tag is only added to the counter, never to the histogram.
    metric_tags = list(common_tags)
    wants_case_type = (
        add_case_type_tag
        and settings.ENTERPRISE_MODE
        and metadata.document_type == 'CommCareCase'
    )
    if wants_case_type:
        metric_tags.append('case_type:{}'.format(metadata.document_subtype))
    datadog_counter(metric, tags=metric_tags)

    lag_seconds = (datetime.utcnow() - metadata.publish_timestamp).total_seconds()
    lag_tags = [
        'pillow_name:{}'.format(self.get_name()),
        _topic_for_ddog(change.topic),
    ]
    datadog_gauge('commcare.change_feed.change_lag', lag_seconds, tags=lag_tags)

    if processing_time:
        datadog_histogram('commcare.change_feed.processing_time',
                          processing_time, tags=common_tags)
def __record_change_metric_in_datadog(self, metric, change, processor=None,
                                      processing_time=None):
    """Emit the datadog metrics for a single processed change.

    Nothing is recorded when ``change.metadata`` is ``None``.

    :param metric: name of the counter to increment.
    :param change: the change; its ``metadata`` and ``topic`` are read.
    :param processor: when given, the counter is incremented by one and
        tagged with the processor's class name; otherwise it is incremented
        by ``len(self.processors)`` and tagged ``all_processors``.
    :param processing_time: when truthy, also recorded as a histogram.
    """
    metadata = change.metadata
    if metadata is None:
        return

    if processor:
        processor_name = processor.__class__.__name__
    else:
        processor_name = "all_processors"
    tags = [
        'datasource:{}'.format(metadata.data_source_name),
        'is_deletion:{}'.format(metadata.is_deletion),
        'pillow_name:{}'.format(self.get_name()),
        'processor:{}'.format(processor_name),
    ]

    increment = 1 if processor else len(self.processors)
    datadog_counter(metric, value=increment, tags=tags)

    lag_seconds = (datetime.utcnow() - metadata.publish_timestamp).total_seconds()
    lag_tags = [
        'pillow_name:{}'.format(self.get_name()),
        _topic_for_ddog(change.topic),
    ]
    datadog_gauge('commcare.change_feed.change_lag', lag_seconds, tags=lag_tags)

    if processing_time:
        datadog_histogram('commcare.change_feed.processing_time',
                          processing_time, tags=tags)
def _record_datadog_metrics(self, changes_chunk, processing_time):
    """Report datadog metrics for a chunk of changes processed together.

    Emits the change counter, the min/max change-lag gauges and the total
    processing-time histogram. The per-change processing-time histogram is
    only emitted when the chunk is exactly ``self.processor_chunk_size`` long.

    :param changes_chunk: sequence of changes. The first and last elements
        are indexed for the lag gauges, so it must not be empty.
    :param processing_time: time spent processing the whole chunk.
    """
    chunk_len = len(changes_chunk)
    tags = ["pillow_name:{}".format(self.get_name()), "mode:chunked"]
    # Fix: the per-change histogram previously used "chunk_size:".format(...),
    # which has no placeholder and therefore produced a value-less tag.
    chunk_size_tag = "chunk_size:{}".format(chunk_len)

    # Since success/fail count is tracked per processor, to get sense of
    # actual operations count, multiply by number of processors
    count = chunk_len * len(self.processors)
    datadog_counter('commcare.change_feed.changes.count', count, tags=tags)

    max_change_lag = (
        datetime.utcnow() - changes_chunk[0].metadata.publish_timestamp
    ).total_seconds()
    min_change_lag = (
        datetime.utcnow() - changes_chunk[-1].metadata.publish_timestamp
    ).total_seconds()
    datadog_gauge('commcare.change_feed.chunked.min_change_lag', min_change_lag, tags=tags)
    datadog_gauge('commcare.change_feed.chunked.max_change_lag', max_change_lag, tags=tags)

    datadog_histogram('commcare.change_feed.chunked.processing_time_total',
                      processing_time, tags=tags + [chunk_size_tag])

    if chunk_len == self.processor_chunk_size:
        # don't report offset chunks to ease up datadog calculations
        datadog_histogram('commcare.change_feed.processing_time',
                          processing_time / chunk_len,
                          tags=tags + [chunk_size_tag])
def queue_async_indicators():
    """Queue a batch of async indicators, grouped by (domain, doc_type).

    Reports the age of the oldest queued indicator, then, inside the
    ``queue-async-indicators`` critical section, takes up to
    ``settings.ASYNC_INDICATORS_TO_QUEUE`` indicators, reports the age of the
    first one, and queues every group whose indicators were never queued or
    were last queued more than a day ago. Queuing stops once
    ``ASYNC_INDICATOR_QUEUE_TIME`` has elapsed since the start.
    """
    started_at = datetime.utcnow()
    cutoff = started_at + ASYNC_INDICATOR_QUEUE_TIME
    time_for_crit_section = ASYNC_INDICATOR_QUEUE_TIME.seconds - 10

    oldest_queued = AsyncIndicator.objects.order_by('date_queued').first()
    if oldest_queued and oldest_queued.date_queued:
        queued_lag = (datetime.utcnow() - oldest_queued.date_queued).total_seconds()
        datadog_gauge('commcare.async_indicator.oldest_queued_indicator', queued_lag)

    with CriticalSection(['queue-async-indicators'], timeout=time_for_crit_section):
        day_ago = datetime.utcnow() - timedelta(days=1)
        batch = AsyncIndicator.objects.all()[:settings.ASYNC_INDICATORS_TO_QUEUE]
        if batch:
            created_lag = (datetime.utcnow() - batch[0].date_created).total_seconds()
            datadog_gauge('commcare.async_indicator.oldest_created_indicator', created_lag)

        grouped = defaultdict(list)
        for indicator in batch:
            # don't requeue anything that's been queued in the past day
            recently_queued = indicator.date_queued and not indicator.date_queued < day_ago
            if recently_queued:
                continue
            grouped[(indicator.domain, indicator.doc_type)].append(indicator)

        for group in grouped.values():
            if datetime.utcnow() > cutoff:
                break
            _queue_indicators(group)
def async_indicators_metrics():
    """Report datadog gauges describing the async indicator queue.

    Gauges emitted:

    * age of the indicator with the smallest ``date_queued`` (when set),
    * age (by ``date_created``) of the first ``AsyncIndicator`` row,
    * per-config indicator count and lag from ``_indicator_metrics()``,
    * sum of ``unsuccessful_attempts`` over the first 100 rows.
    """
    oldest_indicator = AsyncIndicator.objects.order_by('date_queued').first()
    if oldest_indicator and oldest_indicator.date_queued:
        lag = (datetime.utcnow() - oldest_indicator.date_queued).total_seconds()
        datadog_gauge('commcare.async_indicator.oldest_queued_indicator', lag)

    indicator = AsyncIndicator.objects.first()
    if indicator:
        lag = (datetime.utcnow() - indicator.date_created).total_seconds()
        datadog_gauge('commcare.async_indicator.oldest_created_indicator', lag)

    # Fix: ``.iteritems()`` only exists on Python 2 dicts; ``.items()`` works
    # on both Python 2 and Python 3.
    for config_id, metrics in _indicator_metrics().items():
        tags = ["config_id:{}".format(config_id)]
        datadog_gauge('commcare.async_indicator.indicator_count', metrics['count'], tags=tags)
        datadog_gauge('commcare.async_indicator.lag', metrics['lag'], tags=tags)

    # Don't use ORM summing because it would attempt to get every value in DB
    unsuccessful_attempts = sum(
        AsyncIndicator.objects.values_list('unsuccessful_attempts', flat=True).all()[:100]
    )
    datadog_gauge('commcare.async_indicator.unsuccessful_attempts', unsuccessful_attempts)
def fetch_all(initial_response):
    """Yield every hit of a scroll, page by page.

    Generator. ``client`` and ``scroll`` are not parameters; they are taken
    from the surrounding scope. Yields nothing when ``initial_response`` has
    no ``_scroll_id``. Each scroll request's duration in milliseconds is
    gauged, tagged with the page number, and shard failures are logged as a
    warning. Iteration ends when a page has no scroll id or no hits.
    """
    scroll_id = initial_response.get('_scroll_id')
    if scroll_id is None:
        return

    iteration = 0
    while True:
        started_ms = int(time.time() * 1000)
        page = client.scroll(scroll_id, scroll=scroll)
        elapsed_ms = (time.time() * 1000) - started_ms
        datadog_gauge('commcare.es_scroll', elapsed_ms, tags=[
            u'iteration:{}'.format(iteration),
        ])

        for hit in page['hits']['hits']:
            yield hit

        # check if we have any errors
        shard_info = page["_shards"]
        if shard_info["failed"]:
            logging.getLogger('elasticsearch.helpers').warning(
                'Scroll request has failed on %d shards out of %d.',
                shard_info['failed'], shard_info['total'])

        scroll_id = page.get('_scroll_id')
        # end of scroll
        if scroll_id is None or not page['hits']['hits']:
            break
        iteration += 1
def record_pillow_error_queue_size():
    """Gauge the number of ``PillowError`` rows for each pillow."""
    errors_per_pillow = (
        PillowError.objects
        .values('pillow')
        .annotate(num_errors=Count('id'))
    )
    for row in errors_per_pillow:
        error_count = row['num_errors']
        pillow_tag = 'pillow_name:%s' % row['pillow']
        datadog_gauge('commcare.pillowtop.errors', error_count, tags=[pillow_tag])
def _record_datadog_metrics(self, changes_chunk, processing_time):
    """Report datadog metrics for a chunk of changes processed together.

    With ``settings.ENTERPRISE_MODE`` the change counter is split per case
    type for ``CommCareCase`` changes, and any remaining changes are counted
    without a case type tag; otherwise one counter covers the whole chunk.
    Also gauges the min/max change lag and records the per-change
    processing time. The total processing time is only recorded for chunks
    of exactly ``self.processor_chunk_size`` changes.

    :param changes_chunk: sequence of changes; indexed at ``[0]`` and
        ``[-1]`` and used as a divisor, so it must not be empty.
    :param processing_time: time spent processing the whole chunk.
    """
    counter_name = 'commcare.change_feed.changes.count'
    tags = ["pillow_name:{}".format(self.get_name()), "mode:chunked"]
    change_count = len(changes_chunk)

    if not settings.ENTERPRISE_MODE:
        datadog_counter(counter_name, change_count, tags=tags)
    else:
        per_case_type = Counter([
            change.metadata.document_subtype
            for change in changes_chunk
            if change.metadata.document_type == 'CommCareCase'
        ])
        for case_type, type_count in per_case_type.items():
            datadog_counter(counter_name, type_count,
                            tags=tags + ['case_type:{}'.format(case_type)])

        # Whatever is not a case is counted without a case_type tag.
        remainder = change_count - sum(per_case_type.values())
        if remainder:
            datadog_counter(counter_name, remainder, tags=tags)

    first_published = changes_chunk[0].metadata.publish_timestamp
    max_change_lag = (datetime.utcnow() - first_published).total_seconds()
    last_published = changes_chunk[-1].metadata.publish_timestamp
    min_change_lag = (datetime.utcnow() - last_published).total_seconds()
    datadog_gauge('commcare.change_feed.chunked.min_change_lag', min_change_lag, tags=tags)
    datadog_gauge('commcare.change_feed.chunked.max_change_lag', max_change_lag, tags=tags)

    # processing_time per change
    datadog_histogram('commcare.change_feed.processing_time',
                      processing_time / change_count, tags=tags)

    if change_count == self.processor_chunk_size:
        # don't report offset chunks to ease up datadog calculations
        total_tags = tags + ["chunk_size:{}".format(str(change_count))]
        datadog_histogram('commcare.change_feed.chunked.processing_time_total',
                          processing_time, tags=total_tags)
def datadog_report_user_stats(metric_name, commcare_users_by_domain):
    """Gauge the user count of each domain under ``metric_name``.

    The counts are first passed through ``summarize_user_counts(..., n=50)``.
    An entry keyed by the empty tuple is reported under the ``_other`` domain
    tag (presumably the bucket for everything outside the top ``n`` - verify
    against ``summarize_user_counts``).

    :param metric_name: name of the gauge to emit.
    :param commcare_users_by_domain: mapping of domain to user count.
    """
    commcare_users_by_domain = summarize_user_counts(commcare_users_by_domain, n=50)
    for domain, user_count in commcare_users_by_domain.items():
        # Fix: compare with ``==``. ``is ()`` relies on the interpreter
        # sharing a single empty-tuple object and raises a SyntaxWarning on
        # Python 3.8+.
        domain_tag = '_other' if domain == () else domain
        datadog_gauge(
            metric_name,
            user_count,
            tags=['domain:{}'.format(domain_tag)],
        )
def heartbeat():
    """Gauge the current blockage duration, then mark the heartbeat as seen.

    ``self`` is not a parameter; it is resolved from the surrounding scope.
    The gauge is skipped when ``HeartbeatNeverRecorded`` is raised, and
    ``self.mark_seen()`` is called either way.
    """
    try:
        blockage = self.get_blockage_duration()
        queue_tag = 'celery_queue:{}'.format(self.queue)
        datadog_gauge('commcare.celery.heartbeat.blockage_duration',
                      blockage, tags=[queue_tag])
    except HeartbeatNeverRecorded:
        # No heartbeat recorded yet, so there is nothing to report.
        pass
    self.mark_seen()
def _report_current_global_submission_thresholds():
    """Gauge the threshold and usage of every global submission rate window."""
    rates = global_submission_rate_limiter.iter_rates()
    for window, value, threshold in rates:
        window_tag = f'window:{window}'
        datadog_gauge('commcare.xform_submissions.global_threshold',
                      threshold, tags=[window_tag])
        datadog_gauge('commcare.xform_submissions.global_usage',
                      value, tags=[window_tag])
def _report_current_global_two_factor_setup_rate_limiter():
    """Gauge the threshold and usage of every global two-factor setup window."""
    rates = global_two_factor_setup_rate_limiter.iter_rates()
    for window, value, threshold in rates:
        window_tag = 'window:{}'.format(window)
        datadog_gauge('commcare.two_factor.global_two_factor_setup_threshold',
                      threshold, tags=[window_tag])
        datadog_gauge('commcare.two_factor.global_two_factor_setup_usage',
                      value, tags=[window_tag])
def _record_checkpoint_in_datadog(self):
    """Count a checkpoint and gauge the checkpoint offset of every topic."""
    checkpoint_tags = ['pillow_name:{}'.format(self.get_name())]
    datadog_counter('commcare.change_feed.change_feed.checkpoint', tags=checkpoint_tags)

    offsets_by_topic = self._normalize_checkpoint_sequence()
    for topic, offset in six.iteritems(offsets_by_topic):
        offset_tags = [
            'pillow_name:{}'.format(self.get_name()),
            _topic_for_ddog(topic),
        ]
        datadog_gauge('commcare.change_feed.checkpoint_offsets', offset, tags=offset_tags)
def _record_checkpoint_in_datadog(self):
    """Record that a checkpoint happened and where it currently stands.

    Increments the checkpoint counter for this pillow, then gauges the
    offset of each topic returned by ``_normalize_checkpoint_sequence()``.
    """
    datadog_counter(
        'commcare.change_feed.change_feed.checkpoint',
        tags=['pillow_name:{}'.format(self.get_name())],
    )

    sequence = self._normalize_checkpoint_sequence()
    for topic, offset in six.iteritems(sequence):
        pillow_tag = 'pillow_name:{}'.format(self.get_name())
        topic_tag = _topic_for_ddog(topic)
        datadog_gauge(
            'commcare.change_feed.checkpoint_offsets',
            offset,
            tags=[pillow_tag, topic_tag],
        )
def celery_record_time_to_start(task_id=None, task=None, **kwargs):
    """Report how long a celery task waited before it started.

    Gauges the timing popped from ``TimeToStartTimer(task_id)``; when that
    timing is falsy an "unavailable" counter is incremented instead. Both
    metrics are tagged with the task's name and queue.
    """
    from corehq.util.datadog.gauges import datadog_gauge, datadog_counter

    time_to_start = TimeToStartTimer(task_id).stop_and_pop_timing()
    tags = [
        'celery_task_name:{}'.format(task.name),
        'celery_queue:{}'.format(task.queue),
    ]

    if not time_to_start:
        datadog_counter('commcare.celery.task.time_to_start_unavailable', tags=tags)
        return

    datadog_gauge('commcare.celery.task.time_to_start',
                  time_to_start.total_seconds(), tags=tags)
def get_and_report_blockage_duration(self):
    """Gauge the current blockage duration and return it.

    When ``self.threshold`` is truthy a second gauge reports ``1`` if the
    duration in seconds is within the threshold and ``0`` otherwise.

    :return: the value of ``self.get_blockage_duration()``.
    """
    blockage_duration = self.get_blockage_duration()
    datadog_gauge('commcare.celery.heartbeat.blockage_duration',
                  blockage_duration.total_seconds(),
                  tags=['celery_queue:{}'.format(self.queue)])

    if not self.threshold:
        return blockage_duration

    if blockage_duration.total_seconds() <= self.threshold:
        blockage_ok = 1
    else:
        blockage_ok = 0
    datadog_gauge('commcare.celery.heartbeat.blockage_ok', blockage_ok,
                  tags=['celery_queue:{}'.format(self.queue)])
    return blockage_duration
def get_and_report_blockage_duration(self):
    """Report the heartbeat blockage duration for this queue and return it.

    Always gauges the duration in seconds. If ``self.threshold`` is truthy,
    additionally gauges whether the duration is within it (``1``) or not
    (``0``).

    :return: the value of ``self.get_blockage_duration()``.
    """
    duration = self.get_blockage_duration()

    duration_tags = ['celery_queue:{}'.format(self.queue)]
    datadog_gauge(
        'commcare.celery.heartbeat.blockage_duration',
        duration.total_seconds(),
        tags=duration_tags,
    )

    if self.threshold:
        within_threshold = duration.total_seconds() <= self.threshold
        ok_tags = ['celery_queue:{}'.format(self.queue)]
        datadog_gauge(
            'commcare.celery.heartbeat.blockage_ok',
            1 if within_threshold else 0,
            tags=ok_tags,
        )

    return duration
def pillow_datadog_metrics():
    """Gauge update recency and offsets for every pillow.

    Pillows are restricted to ``settings.ACTIVE_PILLOW_NAMES`` when that
    setting is present and non-empty. For each pillow the seconds since its
    last update are gauged; then, per topic, the current offset, the
    processed offset and their difference. Topics are skipped when the
    pillow data has an unexpected shape (reported through ``_assert``), when
    a kafka pillow has no sequence yet, or when nothing has been processed.
    """
    def _is_couch(pillow):
        # text is couch, json is kafka
        return pillow['seq_format'] == 'text'

    all_pillows = get_all_pillows_json()
    active_names = getattr(settings, 'ACTIVE_PILLOW_NAMES', None)
    if active_names:
        all_pillows = [p for p in all_pillows if p['name'] in active_names]

    for pillow in all_pillows:
        feed_type = 'couch' if _is_couch(pillow) else 'kafka'
        tags = [
            'pillow_name:{}'.format(pillow['name']),
            'feed_type:{}'.format(feed_type),
        ]
        datadog_gauge('commcare.change_feed.seconds_since_last_update',
                      pillow['seconds_since_last'], tags=tags)

        for topic_name, offset in pillow['offsets'].items():
            if _is_couch(pillow):
                well_formed = isinstance(pillow['seq'], int) and len(pillow['offsets']) == 1
                if not well_formed:
                    _assert(False, "Unexpected couch pillow format {}".format(pillow['name']))
                    continue
                tags_with_topic = tags + ['topic:{}'.format(topic_name)]
                processed_offset = pillow['seq']
            else:
                if not pillow['seq']:
                    # this pillow has never been initialized.
                    # (custom pillows on most environments)
                    continue
                well_formed = (
                    isinstance(pillow['seq'], dict)
                    and len(pillow['offsets']) == len(pillow['seq'])
                )
                if not well_formed:
                    _assert(False, "Unexpected kafka pillow format {}".format(pillow['name']))
                    continue
                topic, partition = topic_name.split(',')
                tags_with_topic = tags + ['topic:{}-{}'.format(topic, partition)]
                processed_offset = pillow['seq'][topic_name]

            if processed_offset == 0:
                # assume if nothing has been processed that this pillow is not
                # supposed to be running
                continue

            datadog_gauge('commcare.change_feed.current_offsets',
                          offset, tags=tags_with_topic)
            datadog_gauge('commcare.change_feed.processed_offsets',
                          processed_offset, tags=tags_with_topic)
            datadog_gauge('commcare.change_feed.need_processing',
                          offset - processed_offset, tags=tags_with_topic)
def pillow_datadog_metrics():
    """Gauge update recency and offsets for every pillow.

    For each pillow from ``get_all_pillows_json()`` the seconds since its
    last update are gauged; then, per topic, the current offset, the
    processed offset and their difference. Topics are skipped when the
    pillow data has an unexpected shape (reported through ``_assert``), when
    a kafka pillow has no sequence yet, or when nothing has been processed.
    """
    def _is_couch(pillow):
        # text is couch, json is kafka
        return pillow['seq_format'] == 'text'

    pillow_meta = get_all_pillows_json()

    for pillow in pillow_meta:
        tags = [
            'pillow_name:{}'.format(pillow['name']),
            'feed_type:{}'.format('couch' if _is_couch(pillow) else 'kafka')
        ]
        datadog_gauge('commcare.change_feed.seconds_since_last_update',
                      pillow['seconds_since_last'], tags=tags)

        for topic_name, offset in pillow['offsets'].items():
            if _is_couch(pillow):
                if not isinstance(pillow['seq'], int) or len(pillow['offsets']) != 1:
                    _assert(False, "Unexpected couch pillow format {}".format(pillow['name']))
                    continue
                tags_with_topic = tags + ['topic:{}'.format(topic_name)]
                processed_offset = pillow['seq']
            else:
                # Fix: this check must come before the format assertion. A
                # falsy ``seq`` always fails the assertion's condition (it is
                # either not a dict or shorter than the non-empty
                # ``offsets``), so with the old ordering this branch was
                # unreachable and uninitialized pillows raised a spurious
                # "Unexpected kafka pillow format" assertion.
                if not pillow['seq']:
                    # this pillow has never been initialized.
                    # (custom pillows on most environments)
                    continue
                if not isinstance(pillow['seq'], dict) or len(pillow['offsets']) != len(pillow['seq']):
                    _assert(False, "Unexpected kafka pillow format {}".format(pillow['name']))
                    continue
                topic, partition = topic_name.split(',')
                tags_with_topic = tags + ['topic:{}-{}'.format(topic, partition)]
                processed_offset = pillow['seq'][topic_name]

            if processed_offset == 0:
                # assume if nothing has been processed that this pillow is not
                # supposed to be running
                continue

            datadog_gauge('commcare.change_feed.current_offsets',
                          offset, tags=tags_with_topic)
            datadog_gauge('commcare.change_feed.processed_offsets',
                          processed_offset, tags=tags_with_topic)
            needs_processing = offset - processed_offset
            datadog_gauge('commcare.change_feed.need_processing',
                          needs_processing, tags=tags_with_topic)
def __record_change_metric_in_datadog(self, metric, change, timer=None):
    """Count a processed change and optionally gauge its processing time.

    Nothing is recorded when ``change.metadata`` is ``None``.

    :param metric: name of the counter to increment.
    :param change: the change whose ``metadata`` supplies the tags.
    :param timer: when truthy, its ``duration`` is gauged as the
        processing time with the same tags.
    """
    metadata = change.metadata
    if metadata is None:
        return

    tags = [
        u'datasource:{}'.format(metadata.data_source_name),
        u'document_type:{}'.format(metadata.document_type),
        u'domain:{}'.format(metadata.domain),
        u'is_deletion:{}'.format(metadata.is_deletion),
        u'pillow_name:{}'.format(self.get_name()),
    ]
    datadog_counter(metric, tags=tags)

    if not timer:
        return
    datadog_gauge('commcare.change_feed.processing_time', timer.duration, tags=tags)
def pillow_datadog_metrics():
    """Gauge update recency and offsets for every pillow.

    Pillows are restricted to ``settings.ACTIVE_PILLOW_NAMES`` when that
    setting is present and non-empty. For each pillow the seconds since its
    last update are gauged; then, per topic, the current offset, the
    processed offset and their difference. Kafka pillows without a sequence
    and topics with a processed offset of zero are skipped.
    """
    def _is_couch(pillow):
        # text is couch, json is kafka
        return pillow['seq_format'] == 'text'

    all_pillows = get_all_pillows_json()
    active_names = getattr(settings, 'ACTIVE_PILLOW_NAMES', None)
    if active_names:
        all_pillows = [p for p in all_pillows if p['name'] in active_names]

    for pillow in all_pillows:
        feed_type = 'couch' if _is_couch(pillow) else 'kafka'
        # The host and group tags are added here to ensure they remain constant
        # regardless of which celery worker the task gets executed on.
        # Without this the sum of the metrics gets inflated.
        tags = [
            'pillow_name:{}'.format(pillow['name']),
            'feed_type:{}'.format(feed_type),
            'host:celery',
            'group:celery',
        ]
        datadog_gauge('commcare.change_feed.seconds_since_last_update',
                      pillow['seconds_since_last'], tags=tags)

        for topic_name, offset in pillow['offsets'].items():
            if _is_couch(pillow):
                topic_tag = 'topic:{}'.format(topic_name)
                processed_offset = pillow['seq']
            else:
                if not pillow['seq']:
                    # this pillow has never been initialized.
                    # (custom pillows on most environments)
                    continue
                topic, partition = topic_name.split(',')
                topic_tag = 'topic:{}-{}'.format(topic, partition)
                processed_offset = pillow['seq'][topic_name]
            tags_with_topic = tags + [topic_tag]

            if processed_offset == 0:
                # assume if nothing has been processed that this pillow is not
                # supposed to be running
                continue

            datadog_gauge('commcare.change_feed.current_offsets',
                          offset, tags=tags_with_topic)
            datadog_gauge('commcare.change_feed.processed_offsets',
                          processed_offset, tags=tags_with_topic)
            datadog_gauge('commcare.change_feed.need_processing',
                          offset - processed_offset, tags=tags_with_topic)
def server_up(req):
    """
    Health check view intended for server monitoring tools such as 'pingdom'.

    Returns:
        HttpResponse("success", status_code=200)
        HttpResponse(error_message, status_code=500)

    Request serverup.txt to run every check enabled by default (always_check=True)
    Request serverup.txt?only={check_name} to run a single specific check
    Request serverup.txt?{check_name} to add a non-default check (currently only ``heartbeat``)
    """
    only = req.GET.get('only', None)
    if only and only in CHECKS:
        checks_to_do = [only]
    else:
        checks_to_do = []
        for check, check_info in CHECKS.items():
            if check_info['always_check'] or req.GET.get(check, None) is not None:
                checks_to_do.append(check)

    statuses = run_checks(checks_to_do)
    failed_checks = [(check, status) for check, status in statuses if not status.success]

    # Every check reports its duration, whether it passed or not.
    for check_name, status in statuses:
        outcome = 'ok' if status.success else 'failed'
        datadog_gauge('commcare.serverup.check', status.duration, tags=[
            'status:{}'.format(outcome),
            'check:{}'.format(check_name),
        ])

    if not failed_checks or is_deploy_in_progress():
        return HttpResponse("success")

    status_messages = []
    for check, status in failed_checks:
        line = '<strong>{}</strong>: {}'.format(check, html.escape(status.msg))
        status_messages.append(html.linebreaks(line.strip()))

    create_datadog_event(
        'Serverup check failed', '\n'.join(status_messages),
        alert_type='error', aggregation_key='serverup',
    )
    status_messages.insert(0, 'Failed Checks (%s):' % os.uname()[1])
    return HttpResponse(''.join(status_messages), status=500)
def celery_record_time_to_start(task_id=None, task=None, **kwargs):
    """Report how long a celery task waited between being sent and starting.

    The send time is read from the cache under ``task.<task_id>.time_sent``.
    When it is missing (or falsy) an "unavailable" counter is incremented
    instead of the gauge. Both metrics are tagged with the task's name and
    queue.
    """
    from corehq.util.datadog.gauges import datadog_gauge, datadog_counter

    cache_key = 'task.{}.time_sent'.format(task_id)
    time_sent = cache.get(cache_key)
    tags = [
        'celery_task_name:{}'.format(task.name),
        'celery_queue:{}'.format(task.queue),
    ]

    if not time_sent:
        datadog_counter('commcare.celery.task.time_to_start_unavailable', tags=tags)
        return

    waited = datetime.datetime.utcnow() - time_sent
    datadog_gauge('commcare.celery.task.time_to_start', waited.total_seconds(), tags=tags)
def __record_change_metric_in_datadog(self, metric, change, timer=None):
    """Emit the datadog metrics for a single processed change.

    Nothing is recorded when ``change.metadata`` is ``None``.

    :param metric: name of the counter to increment.
    :param change: the change; its ``metadata`` and ``topic`` are read.
    :param timer: when truthy, its ``duration`` is recorded as a
        processing-time histogram.
    """
    if change.metadata is None:
        return

    tags = [
        'datasource:{}'.format(change.metadata.data_source_name),
        'is_deletion:{}'.format(change.metadata.is_deletion),
        'pillow_name:{}'.format(self.get_name()),
    ]
    datadog_counter(metric, tags=tags)

    # Fix: ``timedelta.seconds`` is only the seconds component (0..86399) and
    # drops whole days, so a lag over 24 hours was under-reported.
    # ``total_seconds()`` gives the full duration.
    change_lag = (datetime.utcnow() - change.metadata.publish_timestamp).total_seconds()
    datadog_gauge('commcare.change_feed.change_lag', change_lag, tags=[
        'pillow_name:{}'.format(self.get_name()),
        _topic_for_ddog(change.topic),
    ])

    if timer:
        datadog_histogram('commcare.change_feed.processing_time', timer.duration, tags=tags)
def server_up(req):
    """
    Health check view that server monitoring tools like 'pingdom' can poll.

    Returns:
        HttpResponse("success", status_code=200)
        HttpResponse(error_message, status_code=500)

    serverup.txt runs all checks that are enabled by default (always_check=True)
    serverup.txt?only={check_name} runs just the named check
    serverup.txt?{check_name} also runs a non-default check (currently only ``heartbeat``)
    """
    requested = req.GET.get('only', None)
    if requested and requested in CHECKS:
        checks_to_do = [requested]
    else:
        checks_to_do = [
            name for name, info in CHECKS.items()
            if info['always_check'] or req.GET.get(name, None) is not None
        ]

    statuses = run_checks(checks_to_do)
    failures = [(name, status) for name, status in statuses if not status.success]

    for name, status in statuses:
        status_tag = 'status:{}'.format('ok' if status.success else 'failed')
        check_tag = 'check:{}'.format(name)
        datadog_gauge('commcare.serverup.check', status.duration,
                      tags=[status_tag, check_tag])

    # Failures are not reported while a deploy is in progress.
    report_failure = failures and not is_deploy_in_progress()
    if report_failure:
        messages = [
            html.linebreaks(
                '<strong>{}</strong>: {}'.format(name, html.escape(status.msg)).strip()
            )
            for name, status in failures
        ]
        create_datadog_event(
            'Serverup check failed',
            '\n'.join(messages),
            alert_type='error',
            aggregation_key='serverup',
        )
        hostname = os.uname()[1]
        messages.insert(0, 'Failed Checks (%s):' % hostname)
        return HttpResponse(''.join(messages), status=500)

    return HttpResponse("success")
def __record_change_metric_in_datadog(self, metric, change, processor=None,
                                      processing_time=None):
    """Record the counter, lag gauge and optional timing for one change.

    Returns without recording anything when ``change.metadata`` is ``None``.

    :param metric: name of the counter to increment.
    :param change: the change; its ``metadata`` and ``topic`` are read.
    :param processor: a single processor to attribute the metric to. Without
        one, the counter is incremented once per entry in
        ``self.processors`` and tagged ``all_processors``.
    :param processing_time: when truthy, recorded as a histogram.
    """
    meta = change.metadata
    if meta is None:
        return

    if processor:
        processor_tag = 'processor:{}'.format(processor.__class__.__name__)
    else:
        processor_tag = 'processor:{}'.format("all_processors")
    tags = [
        'datasource:{}'.format(meta.data_source_name),
        'is_deletion:{}'.format(meta.is_deletion),
        'pillow_name:{}'.format(self.get_name()),
        processor_tag,
    ]

    if processor:
        count = 1
    else:
        count = len(self.processors)
    datadog_counter(metric, value=count, tags=tags)

    elapsed = datetime.utcnow() - meta.publish_timestamp
    datadog_gauge(
        'commcare.change_feed.change_lag',
        elapsed.total_seconds(),
        tags=[
            'pillow_name:{}'.format(self.get_name()),
            _topic_for_ddog(change.topic),
        ],
    )

    if processing_time:
        datadog_histogram(
            'commcare.change_feed.processing_time',
            processing_time,
            tags=tags,
        )
def _record_metrics(tags, submission_type, response, timer=None, xform=None):
    """Record the lag gauge and the counter for one form submission.

    :param tags: base list of datadog tags. It is no longer modified; the
        extra tags are added to a copy.
    :param submission_type: value for the ``submission_type`` tag.
    :param response: response whose ``status_code`` is tagged.
    :param timer: when truthy and the status is 201, its ``duration`` is
        bucketed into a ``duration`` tag.
    :param xform: when it has metadata, the difference between
        ``received_on`` and ``metadata.timeEnd`` is gauged as the lag.
    """
    if xform and xform.metadata:
        lag = xform.received_on - xform.metadata.timeEnd
        datadog_gauge('commcare.xform_submissions.lag', int(lag.total_seconds()), tags=tags)

    # Fix: build a new list instead of ``tags += [...]``, which extended the
    # caller's list in place and leaked these tags back to the caller.
    counter_tags = list(tags) + [
        'submission_type:{}'.format(submission_type),
        'status_code:{}'.format(response.status_code),
    ]

    if response.status_code == 201 and timer:
        counter_tags.append(
            'duration:%s' % bucket_value(timer.duration, (1, 5, 20, 60, 120, 300, 600), 's')
        )

    datadog_counter('commcare.xform_submissions.count', tags=counter_tags)
def celery_record_time_to_start(task_id=None, task=None, **kwargs):
    """Report a task's time-to-start and begin timing its run.

    Pops the timing from ``TimeToStartTimer(task_id)``. If that raises
    ``TimingNotAvailable`` an "unavailable" counter is incremented;
    otherwise the timing is gauged and stored through
    ``get_task_time_to_start.set_cached_value``. In both cases a
    ``TimeToRunTimer`` is started for the task afterwards.
    """
    from corehq.util.datadog.gauges import datadog_gauge, datadog_counter

    name_tag = 'celery_task_name:{}'.format(task.name)
    queue_tag = 'celery_queue:{}'.format(task.queue)
    tags = [name_tag, queue_tag]

    start_timer = TimeToStartTimer(task_id)
    try:
        waited = start_timer.stop_and_pop_timing()
    except TimingNotAvailable:
        datadog_counter('commcare.celery.task.time_to_start_unavailable', tags=tags)
    else:
        datadog_gauge('commcare.celery.task.time_to_start',
                      waited.total_seconds(), tags=tags)
        get_task_time_to_start.set_cached_value(task_id).to(waited)

    run_timer = TimeToRunTimer(task_id)
    run_timer.start_timing()
def _record_change_in_datadog(self, change, timer):
    """Gauge the feed's offsets per topic, then record the change metric.

    Per topic this reports the processed offset, the amount still to be
    processed (only for topics that also have a latest offset) and the
    latest offset. Every gauge is tagged with the pillow name and a feed
    type of ``kafka`` or ``couch``.

    :param change: the change that was just processed.
    :param timer: passed on to ``__record_change_metric_in_datadog``.
    """
    from corehq.apps.change_feed.consumer.feed import KafkaChangeFeed

    change_feed = self.get_change_feed()
    current_seq = self._normalize_sequence(change_feed.get_processed_offsets())
    current_offsets = change_feed.get_latest_offsets()

    tags = [
        'pillow_name:{}'.format(self.get_name()),
        'feed_type:{}'.format('kafka' if isinstance(change_feed, KafkaChangeFeed) else 'couch')
    ]

    # Fix: ``.iteritems()`` only exists on Python 2 dicts; ``.items()`` works
    # on both Python 2 and Python 3.
    for topic, value in current_seq.items():
        tags_with_topic = tags + [_topic_for_ddog(topic)]
        datadog_gauge('commcare.change_feed.processed_offsets', value, tags=tags_with_topic)
        if topic in current_offsets:
            needs_processing = current_offsets[topic] - value
            datadog_gauge('commcare.change_feed.need_processing',
                          needs_processing, tags=tags_with_topic)

    for topic, offset in current_offsets.items():
        tags_with_topic = tags + [_topic_for_ddog(topic)]
        datadog_gauge('commcare.change_feed.current_offsets', offset, tags=tags_with_topic)

    self.__record_change_metric_in_datadog('commcare.change_feed.changes.count', change, timer)
def _record_change_in_datadog(self, change, timer):
    """Gauge the feed's offsets per topic, then record the change metric.

    Per topic this reports the checkpointed offset, the amount still to be
    processed (only for topics that also have a current offset) and the
    current offset, each tagged with the pillow name and the topic.

    :param change: the change that was just processed.
    :param timer: passed on to ``__record_change_metric_in_datadog``.
    """
    change_feed = self.get_change_feed()
    sequence = self._normalize_checkpoint_sequence()
    current_offsets = change_feed.get_current_offsets()

    # Fix: ``.iteritems()`` only exists on Python 2 dicts; ``.items()`` works
    # on both Python 2 and Python 3.
    for topic, value in sequence.items():
        datadog_gauge('commcare.change_feed.processed_offsets', value, tags=[
            'pillow_name:{}'.format(self.get_name()),
            'topic:{}'.format(topic),
        ])
        if topic in current_offsets:
            datadog_gauge('commcare.change_feed.need_processing', current_offsets[topic] - value, tags=[
                'pillow_name:{}'.format(self.get_name()),
                'topic:{}'.format(topic),
            ])

    for topic, offset in current_offsets.items():
        datadog_gauge('commcare.change_feed.current_offsets', offset, tags=[
            'pillow_name:{}'.format(self.get_name()),
            'topic:{}'.format(topic),
        ])

    self.__record_change_metric_in_datadog('commcare.change_feed.changes.count', change, timer)
def reprocess_archive_stubs():
    """Retry unfinished form archive/unarchive actions.

    Loads every ``UnfinishedArchiveStub``, reports how many there are, and
    re-runs each one. The task stops as soon as four minutes have elapsed
    since it started.
    """
    # Check for archive stubs
    from corehq.form_processor.interfaces.dbaccessors import FormAccessors
    from couchforms.models import UnfinishedArchiveStub

    stubs = UnfinishedArchiveStub.objects.filter()
    datadog_gauge('commcare.unfinished_archive_stubs', len(stubs))

    start = time.time()
    cutoff = start + timedelta(minutes=4).total_seconds()
    for stub in stubs:
        # Exit this task after 4 minutes so that the same stub isn't ever
        # processed in multiple queues.
        # Fix: ``cutoff`` is an absolute timestamp, so it has to be compared
        # with the current time. The previous ``time.time() - start > cutoff``
        # compared elapsed seconds with a timestamp and was never true.
        if time.time() > cutoff:
            return

        xform = FormAccessors(stub.domain).get_form(form_id=stub.xform_id)
        if not stub.history_updated:
            # If the history wasn't updated the first time around, run the
            # whole thing again.
            if stub.archive:
                xform.archive(user_id=stub.user_id)
            else:
                xform.unarchive(user_id=stub.user_id)
        else:
            # If the history was updated the first time around, just send
            # the update to kafka
            xform.publish_archive_action_to_kafka(user_id=stub.user_id, archive=stub.archive)
def reprocess_archive_stubs():
    """Re-run archive/unarchive actions recorded as unfinished.

    Every ``UnfinishedArchiveStub`` is loaded and counted, then processed in
    turn. Stubs whose history was not updated get the full archive or
    unarchive again; the others only get the kafka publish repeated.
    Processing ends once four minutes have passed since the task started.
    """
    # Check for archive stubs
    from corehq.form_processor.interfaces.dbaccessors import FormAccessors
    from couchforms.models import UnfinishedArchiveStub

    stubs = UnfinishedArchiveStub.objects.filter()
    datadog_gauge('commcare.unfinished_archive_stubs', len(stubs))

    start = time.time()
    cutoff = start + timedelta(minutes=4).total_seconds()
    for stub in stubs:
        # Exit this task after 4 minutes so that the same stub isn't ever
        # processed in multiple queues.
        # Fix: compare the current time with the absolute ``cutoff``. The
        # old check subtracted ``start`` first, so elapsed seconds were
        # compared with a timestamp and the limit never triggered.
        if time.time() > cutoff:
            return

        xform = FormAccessors(stub.domain).get_form(form_id=stub.xform_id)
        if stub.history_updated:
            # History is already in place: only the kafka update is missing.
            xform.publish_archive_action_to_kafka(user_id=stub.user_id, archive=stub.archive)
        elif stub.archive:
            # History was never updated: redo the whole archive.
            xform.archive(user_id=stub.user_id)
        else:
            # History was never updated: redo the whole unarchive.
            xform.unarchive(user_id=stub.user_id)
def _record_datadog_metrics(self, changes_chunk, processing_time):
    """Report datadog metrics for a chunk of changes processed together.

    Emits the change counter, the min/max change-lag gauges and the
    per-change processing-time histogram. The total processing-time
    histogram is only emitted when the chunk is exactly
    ``self.processor_chunk_size`` long.

    :param changes_chunk: sequence of changes; indexed at ``[0]`` and
        ``[-1]`` and used as a divisor, so it must not be empty.
    :param processing_time: time spent processing the whole chunk.
    """
    chunk_len = len(changes_chunk)
    tags = ["pillow_name:{}".format(self.get_name()), "mode:chunked"]
    # Fix: the per-change histogram previously used "chunk_size:".format(...),
    # which has no placeholder and therefore produced a value-less tag.
    chunk_size_tag = "chunk_size:{}".format(chunk_len)

    # Since success/fail count is tracked per processor, to get sense of
    # actual operations count, multiply by number of processors
    count = chunk_len * len(self.processors)
    datadog_counter('commcare.change_feed.changes.count', count, tags=tags)

    max_change_lag = (
        datetime.utcnow() - changes_chunk[0].metadata.publish_timestamp
    ).total_seconds()
    min_change_lag = (
        datetime.utcnow() - changes_chunk[-1].metadata.publish_timestamp
    ).total_seconds()
    datadog_gauge('commcare.change_feed.chunked.min_change_lag', min_change_lag, tags=tags)
    datadog_gauge('commcare.change_feed.chunked.max_change_lag', max_change_lag, tags=tags)

    # processing_time per change
    datadog_histogram(
        'commcare.change_feed.processing_time',
        processing_time / chunk_len,
        tags=tags + [chunk_size_tag]
    )

    if chunk_len == self.processor_chunk_size:
        # don't report offset chunks to ease up datadog calculations
        datadog_histogram('commcare.change_feed.chunked.processing_time_total',
                          processing_time, tags=tags + [chunk_size_tag])
def restore(request, domain, app_id=None):
    """
    Restore view, overridden so that our own user model can be supplied
    (and because the domain is part of the url).

    Counts the restore by status code and, when timing information is
    returned, gauges the duration of every leaf timing segment.
    """
    if toggles.ENIKSHAY.enabled(domain):
        update_device_id(request.couch_user, request.GET.get('device_id'))

    restore_params = get_restore_params(request)
    response, timing_context = get_restore_response(
        domain, request.couch_user, app_id, **restore_params)

    tags = [u'status_code:{}'.format(response.status_code)]
    datadog_counter('commcare.restores.count', tags=tags)

    if timing_context is None:
        return response

    for timer in timing_context.to_list(exclude_root=True):
        # Only record leaf nodes so we can sum to get the total
        if not timer.is_leaf_node:
            continue
        segment_tag = u'segment:{}'.format(timer.name)
        datadog_gauge('commcare.restores.timings', timer.duration,
                      tags=tags + [segment_tag])

    return response
def couch_sql_migration_stats():
    """Gauge how much is left on domains where ``use_sql_backend`` is False.

    Reports the number of matching domains and the summed ``cp_n_forms`` and
    ``cp_n_cases`` values over them.
    """
    query = DomainES().filter(filters.term('use_sql_backend', False))
    query = query.remove_default_filters()
    query = query.aggregations([
        aggregations.SumAggregation('cases', 'cp_n_cases'),
        aggregations.SumAggregation('forms', 'cp_n_forms'),
    ])
    # size(0): only the total and the aggregations are used, never the hits.
    result = query.size(0).run()

    datadog_gauge('commcare.couch_sql_migration.domains_remaining',
                  int(result.total))
    datadog_gauge('commcare.couch_sql_migration.forms_remaining',
                  int(result.aggregations.forms.value))
    datadog_gauge('commcare.couch_sql_migration.cases_remaining',
                  int(result.aggregations.cases.value))
def couch_sql_migration_stats():
    """Report the domains, forms and cases with ``use_sql_backend`` False.

    Runs one ``DomainES`` query without its default filters and gauges the
    hit total plus the two sum aggregations (``cp_n_forms``, ``cp_n_cases``).
    """
    not_on_sql = filters.term('use_sql_backend', False)
    base_query = DomainES().filter(not_on_sql).remove_default_filters()

    case_sum = aggregations.SumAggregation('cases', 'cp_n_cases')
    form_sum = aggregations.SumAggregation('forms', 'cp_n_forms')
    result = base_query.aggregations([case_sum, form_sum]).size(0).run()

    domains_remaining = int(result.total)
    datadog_gauge('commcare.couch_sql_migration.domains_remaining', domains_remaining)
    forms_remaining = int(result.aggregations.forms.value)
    datadog_gauge('commcare.couch_sql_migration.forms_remaining', forms_remaining)
    cases_remaining = int(result.aggregations.cases.value)
    datadog_gauge('commcare.couch_sql_migration.cases_remaining', cases_remaining)
def record_pillow_error_queue_size():
    """Gauge the size of each pillow's error queue.

    Counts ``PillowError`` rows grouped by pillow and reports each count
    tagged with the pillow name.
    """
    grouped_errors = PillowError.objects.values('pillow')
    grouped_errors = grouped_errors.annotate(num_errors=Count('id'))
    for row in grouped_errors:
        queue_size = row['num_errors']
        tags = ['pillow_name:%s' % row['pillow']]
        datadog_gauge('commcare.pillowtop.error_queue', queue_size, tags=tags)
def _record_datadog_metrics():
    """Gauge the number of ``UnfinishedSubmissionStub`` rows as the queue size."""
    datadog_gauge(
        'commcare.submission_reprocessing.queue_size',
        UnfinishedSubmissionStub.objects.count(),
    )
def async_indicators_metrics():
    """Report datadog gauges describing the async indicator queue.

    All ages are measured against a single timestamp taken at the start.
    Gauges emitted:

    * age of the indicator with the smallest ``date_queued`` (when set),
    * age (by ``date_created``) of the first of the first 100 indicators,
      plus the average age over those 100 - only when any exist,
    * per-config indicator count and lag from ``_indicator_metrics()``,
    * sum of ``unsuccessful_attempts`` over the first 100 rows.
    """
    now = datetime.utcnow()

    def _age_in_seconds(moment):
        # Seconds elapsed between ``moment`` and the shared ``now``.
        return (now - moment).total_seconds()

    oldest_queued = AsyncIndicator.objects.order_by('date_queued').first()
    if oldest_queued and oldest_queued.date_queued:
        datadog_gauge('commcare.async_indicator.oldest_queued_indicator',
                      _age_in_seconds(oldest_queued.date_queued))

    oldest_100_indicators = AsyncIndicator.objects.all()[:100]
    if oldest_100_indicators.exists():
        first_created = oldest_100_indicators[0]
        datadog_gauge('commcare.async_indicator.oldest_created_indicator',
                      _age_in_seconds(first_created.date_created))

        ages = [
            _age_in_seconds(indicator.date_created)
            for indicator in oldest_100_indicators
        ]
        datadog_gauge('commcare.async_indicator.oldest_created_indicator_avg',
                      sum(ages) / len(ages))

    for config_id, metrics in six.iteritems(_indicator_metrics()):
        config_tags = ["config_id:{}".format(config_id)]
        datadog_gauge('commcare.async_indicator.indicator_count',
                      metrics['count'], tags=config_tags)
        datadog_gauge('commcare.async_indicator.lag',
                      metrics['lag'], tags=config_tags)

    # Don't use ORM summing because it would attempt to get every value in DB
    attempt_values = AsyncIndicator.objects.values_list('unsuccessful_attempts', flat=True)
    unsuccessful_attempts = sum(attempt_values.all()[:100])
    datadog_gauge('commcare.async_indicator.unsuccessful_attempts', unsuccessful_attempts)
def _process_form(request, domain, app_id, user_id, authenticated, auth_cls=AuthContext):
    """Process one form submission request and return the response.

    Ignored and blacklisted submissions short-circuit with the matching
    ``SubmissionPost`` response. A ``MultimediaBug`` while reading the
    request is counted, reported through ``notify_exception`` and answered
    with a bad-request response. Otherwise the submission is run through
    ``SubmissionPost`` and datadog metrics are recorded: a counter by status
    code, and for 201 responses the timings plus case and ledger counts.

    :param request: the incoming submission request.
    :param domain: domain the form is submitted to.
    :param app_id: app id; resolved together with the build id through
        ``get_app_and_build_ids``.
    :param user_id: id of the submitting user.
    :param authenticated: whether the request was authenticated.
    :param auth_cls: class used to build the auth context.
    :return: the response to send back to the client.
    """
    if should_ignore_submission(request):
        # silently ignore submission if it meets ignore-criteria
        return SubmissionPost.submission_ignored_response()

    if toggles.FORM_SUBMISSION_BLACKLIST.enabled(domain):
        return SubmissionPost.get_blacklisted_response()

    try:
        instance, attachments = couchforms.get_instance_and_attachment(request)
    except MultimediaBug as e:
        try:
            instance = request.FILES[MAGIC_PROPERTY].read()
            xform = convert_xform_to_json(instance)
            meta = xform.get("meta", {})
        except Exception:
            # Best effort only: the form meta is extra detail for the error
            # report. Narrowed from a bare ``except:`` so KeyboardInterrupt
            # and SystemExit are no longer swallowed.
            meta = {}

        details = {
            "domain": domain,
            "app_id": app_id,
            "user_id": user_id,
            "authenticated": authenticated,
            "form_meta": meta,
        }
        log_counter(MULTIMEDIA_SUBMISSION_ERROR_COUNT, details)
        notify_exception(request, "Received a submission with POST.keys()", details)
        # ``e.message`` only exists on Python 2 exceptions (or when the class
        # defines it); fall back to ``str(e)`` otherwise.
        return HttpResponseBadRequest(getattr(e, 'message', None) or str(e))

    app_id, build_id = get_app_and_build_ids(domain, app_id)
    submission_post = SubmissionPost(
        instance=instance,
        attachments=attachments,
        domain=domain,
        app_id=app_id,
        build_id=build_id,
        auth_context=auth_cls(
            domain=domain,
            user_id=user_id,
            authenticated=authenticated,
        ),
        location=couchforms.get_location(request),
        received_on=couchforms.get_received_on(request),
        date_header=couchforms.get_date_header(request),
        path=couchforms.get_path(request),
        submit_ip=couchforms.get_submit_ip(request),
        last_sync_token=couchforms.get_last_sync_token(request),
        openrosa_headers=couchforms.get_openrosa_headers(request),
    )
    with TimingContext() as timer:
        result = submission_post.run()

    response = result.response

    tags = [
        'backend:sql' if should_use_sql_backend(domain) else 'backend:couch',
        u'domain:{}'.format(domain)
    ]
    datadog_counter('commcare.xform_submissions.count',
                    tags=tags + ['status_code:{}'.format(response.status_code)])

    if response.status_code == 400:
        # Fix: the message used a ``{0}`` placeholder that was never filled,
        # so the response was never logged. Pass it as a lazy logging
        # argument instead.
        logging.error(
            'Status code 400 for a form submission. Response is: \n%s\n',
            response,
        )
    elif response.status_code == 201:
        datadog_gauge('commcare.xform_submissions.timings', timer.duration, tags=tags)
        # normalize over number of items (form or case) saved
        normalized_time = timer.duration / (1 + len(result.cases))
        datadog_gauge('commcare.xform_submissions.normalized_timings', normalized_time, tags=tags)
        datadog_counter('commcare.xform_submissions.case_count', len(result.cases), tags=tags)
        datadog_counter('commcare.xform_submissions.ledger_count', len(result.ledgers), tags=tags)

    return response