def fetch_all(initial_response):
    resp = initial_response
    scroll_id = resp.get('_scroll_id')
    if scroll_id is None:
        return
    iteration = 0
    while True:
        start = int(time.time() * 1000)
        resp = client.scroll(scroll_id, scroll=scroll)
        datadog_histogram('commcare.es_scroll', (time.time() * 1000) - start, tags=[
            u'iteration:{}'.format(iteration),
        ])
        for hit in resp['hits']['hits']:
            yield hit

        # check if we have any errors
        if resp["_shards"]["failed"]:
            logging.getLogger('elasticsearch.helpers').warning(
                'Scroll request has failed on %d shards out of %d.',
                resp['_shards']['failed'], resp['_shards']['total']
            )

        scroll_id = resp.get('_scroll_id')
        # end of scroll
        if scroll_id is None or not resp['hits']['hits']:
            break
        iteration += 1
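# A minimal usage sketch for fetch_all(), assuming `client` is an
# elasticsearch-py client and `scroll` is a keep-alive window such as '5m'
# (both are module-level names the generator above closes over); the index
# name and query here are purely illustrative.
initial = client.search(index='hqcases', body={'query': {'match_all': {}}},
                        scroll=scroll, size=1000)
for hit in fetch_all(initial):
    print(hit['_id'])  # consume each scrolled document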
def save_document(doc_ids):
    lock_keys = []
    for doc_id in doc_ids:
        lock_keys.append(get_async_indicator_modify_lock_key(doc_id))

    indicator_config_ids = None
    timer = TimingContext()
    with CriticalSection(lock_keys):
        indicators = AsyncIndicator.objects.filter(doc_id__in=doc_ids)
        if not indicators:
            return

        first_indicator = indicators[0]
        processed_indicators = []
        failed_indicators = []

        for i in indicators:
            assert i.domain == first_indicator.domain
            assert i.doc_type == first_indicator.doc_type

        indicator_by_doc_id = {i.doc_id: i for i in indicators}
        doc_store = get_document_store(first_indicator.domain, first_indicator.doc_type)
        indicator_config_ids = first_indicator.indicator_config_ids
        related_docs_to_rebuild = set()

        with timer:
            for doc in doc_store.iter_documents(indicator_by_doc_id.keys()):
                indicator = indicator_by_doc_id[doc['_id']]
                successfully_processed, to_remove, rebuild_related_docs = _save_document_helper(
                    indicator, doc)
                if rebuild_related_docs:
                    related_docs_to_rebuild = related_docs_to_rebuild.union(
                        icds_get_related_docs_ids(doc['_id']))
                if successfully_processed:
                    processed_indicators.append(indicator.pk)
                else:
                    failed_indicators.append((indicator, to_remove))

        num_processed = len(processed_indicators)
        num_failed = len(failed_indicators)

        AsyncIndicator.objects.filter(pk__in=processed_indicators).delete()
        with transaction.atomic():
            for indicator, to_remove in failed_indicators:
                indicator.update_failure(to_remove)
                indicator.save()

    # remove any related docs that were just rebuilt
    related_docs_to_rebuild = related_docs_to_rebuild - set(doc_ids)
    # queue the docs that aren't already queued
    _queue_indicators(AsyncIndicator.objects.filter(
        doc_id__in=related_docs_to_rebuild, date_queued=None))

    datadog_counter('commcare.async_indicator.processed_success', num_processed)
    datadog_counter('commcare.async_indicator.processed_fail', num_failed)
    datadog_histogram(
        'commcare.async_indicator.processing_time', timer.duration,
        tags=[u'config_ids:{}'.format(indicator_config_ids)])
def __record_change_metric_in_datadog(self, metric, change, processor=None, processing_time=None,
                                      add_case_type_tag=False):
    if change.metadata is not None:
        common_tags = [
            'datasource:{}'.format(change.metadata.data_source_name),
            'is_deletion:{}'.format(change.metadata.is_deletion),
            'pillow_name:{}'.format(self.get_name()),
            'processor:{}'.format(processor.__class__.__name__ if processor else "all_processors"),
        ]

        metric_tags = list(common_tags)
        if add_case_type_tag and settings.ENTERPRISE_MODE and change.metadata.document_type == 'CommCareCase':
            metric_tags.append('case_type:{}'.format(change.metadata.document_subtype))

        datadog_counter(metric, tags=metric_tags)

        change_lag = (datetime.utcnow() - change.metadata.publish_timestamp).total_seconds()
        datadog_gauge('commcare.change_feed.change_lag', change_lag, tags=[
            'pillow_name:{}'.format(self.get_name()),
            _topic_for_ddog(change.topic),
        ])

        if processing_time:
            datadog_histogram('commcare.change_feed.processing_time', processing_time, tags=common_tags)
def _record_datadog_metrics(self, changes_chunk, processing_time):
    tags = ["pillow_name:{}".format(self.get_name()), "mode:chunked"]
    # Since success/fail count is tracked per processor, to get a sense of the
    # actual operations count, multiply by the number of processors
    count = len(changes_chunk) * len(self.processors)
    datadog_counter('commcare.change_feed.changes.count', count, tags=tags)

    max_change_lag = (datetime.utcnow() - changes_chunk[0].metadata.publish_timestamp).total_seconds()
    min_change_lag = (datetime.utcnow() - changes_chunk[-1].metadata.publish_timestamp).total_seconds()
    datadog_gauge('commcare.change_feed.chunked.min_change_lag', min_change_lag, tags=tags)
    datadog_gauge('commcare.change_feed.chunked.max_change_lag', max_change_lag, tags=tags)

    datadog_histogram('commcare.change_feed.chunked.processing_time_total', processing_time,
                      tags=tags + ["chunk_size:{}".format(str(len(changes_chunk)))])
    if len(changes_chunk) == self.processor_chunk_size:
        # don't report offset chunks to ease up datadog calculations
        datadog_histogram('commcare.change_feed.processing_time',
                          processing_time / len(changes_chunk),
                          tags=tags + ["chunk_size:{}".format(str(len(changes_chunk)))])
def __record_change_metric_in_datadog(self, metric, change, processor=None, processing_time=None):
    if change.metadata is not None:
        tags = [
            'datasource:{}'.format(change.metadata.data_source_name),
            'is_deletion:{}'.format(change.metadata.is_deletion),
            'pillow_name:{}'.format(self.get_name()),
            'processor:{}'.format(processor.__class__.__name__ if processor else "all_processors"),
        ]
        count = 1 if processor else len(self.processors)
        datadog_counter(metric, value=count, tags=tags)

        change_lag = (datetime.utcnow() - change.metadata.publish_timestamp).total_seconds()
        datadog_gauge('commcare.change_feed.change_lag', change_lag, tags=[
            'pillow_name:{}'.format(self.get_name()),
            _topic_for_ddog(change.topic),
        ])

        if processing_time:
            datadog_histogram('commcare.change_feed.processing_time', processing_time, tags=tags)
def _record_datadog_metrics(self, changes_chunk, processing_time):
    tags = ["pillow_name:{}".format(self.get_name()), "mode:chunked"]
    change_count = len(changes_chunk)
    if settings.ENTERPRISE_MODE:
        type_counter = Counter([
            change.metadata.document_subtype
            for change in changes_chunk if change.metadata.document_type == 'CommCareCase'
        ])
        for case_type, type_count in type_counter.items():
            tags_with_type = tags + ['case_type:{}'.format(case_type)]
            datadog_counter('commcare.change_feed.changes.count', type_count, tags=tags_with_type)

        remainder = change_count - sum(type_counter.values())
        if remainder:
            datadog_counter('commcare.change_feed.changes.count', remainder, tags=tags)
    else:
        datadog_counter('commcare.change_feed.changes.count', change_count, tags=tags)

    max_change_lag = (datetime.utcnow() - changes_chunk[0].metadata.publish_timestamp).total_seconds()
    min_change_lag = (datetime.utcnow() - changes_chunk[-1].metadata.publish_timestamp).total_seconds()
    datadog_gauge('commcare.change_feed.chunked.min_change_lag', min_change_lag, tags=tags)
    datadog_gauge('commcare.change_feed.chunked.max_change_lag', max_change_lag, tags=tags)

    # processing_time per change
    datadog_histogram('commcare.change_feed.processing_time', processing_time / change_count, tags=tags)
    if change_count == self.processor_chunk_size:
        # don't report offset chunks to ease up datadog calculations
        datadog_histogram('commcare.change_feed.chunked.processing_time_total', processing_time,
                          tags=tags + ["chunk_size:{}".format(str(change_count))])
def custom_data(self, selected_location, domain):
    timer = TimingContext()
    with timer:
        to_ret = self._custom_data(selected_location, domain)
    if selected_location:
        loc_type = selected_location.location_type.name
    else:
        loc_type = None
    datadog_histogram("commcare.icds.block_reports.custom_data_time", timer.duration,
                      tags="location_type:{}, report_slug:{}".format(loc_type, self.slug))
    return to_ret
def checkpoint_updated(self):
    total_duration = sum(self.domain_timing_context.values())
    duration_seen = 0
    top_half_domains = {}
    for domain, duration in self.domain_timing_context.most_common():
        top_half_domains[domain] = duration
        duration_seen += duration
        if duration_seen >= total_duration // 2:
            break

    for domain, duration in top_half_domains.items():
        datadog_histogram('commcare.change_feed.ucr_slow_log', duration, tags=[
            'domain:{}'.format(domain)
        ])
    self.domain_timing_context.clear()
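# A self-contained sketch of the selection above: domain_timing_context is
# assumed to be a collections.Counter of per-domain processing time, and the
# loop keeps the most expensive domains until they cover at least half of the
# total duration. The timings below are made up for illustration.
from collections import Counter

domain_timing_context = Counter({'a': 50, 'b': 30, 'c': 15, 'd': 5})
total_duration = sum(domain_timing_context.values())  # 100
duration_seen = 0
top_half_domains = {}
for domain, duration in domain_timing_context.most_common():
    top_half_domains[domain] = duration
    duration_seen += duration
    if duration_seen >= total_duration // 2:
        break
print(top_half_domains)  # {'a': 50} -- 'a' alone already accounts for half the time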
def custom_data(self, selected_location, domain):
    timer = TimingContext()
    with timer:
        to_ret = self._custom_data(selected_location, domain)
    if selected_location:
        loc_type = selected_location.location_type.name
    else:
        loc_type = None
    datadog_histogram(
        "commcare.icds.block_reports.custom_data_time",
        timer.duration,
        tags="location_type:{}, report_slug:{}".format(
            loc_type, self.slug
        )
    )
    return to_ret
def custom_data(self, selected_location, domain):
    timer = TimingContext()
    with timer:
        to_ret = self._custom_data(selected_location, domain)

    if selected_location:
        loc_type = selected_location.location_type.name
    else:
        loc_type = None

    tags = ["location_type:{}".format(loc_type), "report_slug:{}".format(self.slug)]
    if self.allow_conditional_agg:
        tags.append("allow_conditional_agg:yes")
    datadog_histogram(
        "commcare.icds.block_reports.custom_data_duration",
        timer.duration,
        tags=tags
    )
    return to_ret
def __record_change_metric_in_datadog(self, metric, change, timer=None):
    if change.metadata is not None:
        tags = [
            'datasource:{}'.format(change.metadata.data_source_name),
            'is_deletion:{}'.format(change.metadata.is_deletion),
            'pillow_name:{}'.format(self.get_name()),
        ]
        datadog_counter(metric, tags=tags)

        change_lag = (datetime.utcnow() - change.metadata.publish_timestamp).seconds
        datadog_gauge('commcare.change_feed.change_lag', change_lag, tags=[
            'pillow_name:{}'.format(self.get_name()),
            _topic_for_ddog(change.topic),
        ])

        if timer:
            datadog_histogram('commcare.change_feed.processing_time', timer.duration, tags=tags)
def save_document(doc_ids):
    lock_keys = []
    for doc_id in doc_ids:
        lock_keys.append(get_async_indicator_modify_lock_key(doc_id))

    indicator_config_ids = None
    timer = TimingContext()
    with CriticalSection(lock_keys):
        indicators = AsyncIndicator.objects.filter(doc_id__in=doc_ids)
        if not indicators:
            return

        first_indicator = indicators[0]
        processed_indicators = []
        failed_indicators = []

        for i in indicators:
            assert i.domain == first_indicator.domain
            assert i.doc_type == first_indicator.doc_type

        indicator_by_doc_id = {i.doc_id: i for i in indicators}
        doc_store = get_document_store(first_indicator.domain, first_indicator.doc_type)
        indicator_config_ids = first_indicator.indicator_config_ids

        with timer:
            for doc in doc_store.iter_documents(doc_ids):
                indicator = indicator_by_doc_id[doc['_id']]
                successfully_processed, to_remove = _save_document_helper(indicator, doc)
                if successfully_processed:
                    processed_indicators.append(indicator.pk)
                else:
                    failed_indicators.append((indicator, to_remove))

        AsyncIndicator.objects.filter(pk__in=processed_indicators).delete()
        with transaction.atomic():
            for indicator, to_remove in failed_indicators:
                indicator.update_failure(to_remove)
                indicator.save()

    datadog_histogram(
        'commcare.async_indicator.processing_time', timer.duration,
        tags=[
            u'config_ids:{}'.format(indicator_config_ids)
        ]
    )
def restore(request, domain, app_id=None):
    """
    We override restore because we have to supply our own user model
    (and have the domain in the url)
    """
    response, timing_context = get_restore_response(
        domain, request.couch_user, app_id, **get_restore_params(request))

    tags = [
        u'status_code:{}'.format(response.status_code),
    ]
    datadog_counter('commcare.restores.count', tags=tags)
    if timing_context is not None:
        for timer in timing_context.to_list(exclude_root=True):
            # Only record leaf nodes so we can sum to get the total
            if timer.is_leaf_node:
                datadog_histogram(
                    'commcare.restores.timings',
                    timer.duration,
                    tags=tags + [u'segment:{}'.format(timer.name)],
                )

    return response
def _record_datadog_metrics(self, changes_chunk, processing_time):
    tags = ["pillow_name:{}".format(self.get_name()), "mode:chunked"]
    # Since success/fail count is tracked per processor, to get a sense of the
    # actual operations count, multiply by the number of processors
    count = len(changes_chunk) * len(self.processors)
    datadog_counter('commcare.change_feed.changes.count', count, tags=tags)

    max_change_lag = (datetime.utcnow() - changes_chunk[0].metadata.publish_timestamp).total_seconds()
    min_change_lag = (datetime.utcnow() - changes_chunk[-1].metadata.publish_timestamp).total_seconds()
    datadog_gauge('commcare.change_feed.chunked.min_change_lag', min_change_lag, tags=tags)
    datadog_gauge('commcare.change_feed.chunked.max_change_lag', max_change_lag, tags=tags)

    # processing_time per change
    datadog_histogram(
        'commcare.change_feed.processing_time',
        processing_time / len(changes_chunk),
        tags=tags + ["chunk_size:{}".format(str(len(changes_chunk)))]
    )
    if len(changes_chunk) == self.processor_chunk_size:
        # don't report offset chunks to ease up datadog calculations
        datadog_histogram('commcare.change_feed.chunked.processing_time_total', processing_time,
                          tags=tags + ["chunk_size:{}".format(str(len(changes_chunk)))])
def _record_metrics(base_tags, submission_type, response, result=None, timer=None):
    base_tags += ['submission_type:{}'.format(submission_type)]

    datadog_counter('commcare.xform_submissions.count',
                    tags=base_tags + ['status_code:{}'.format(response.status_code)])

    if response.status_code == 201 and timer and result:
        datadog_histogram('commcare.xform_submissions.timings', timer.duration, tags=base_tags)
        # normalize over number of items (form or case) saved
        normalized_time = timer.duration / (1 + len(result.cases))
        datadog_histogram('commcare.xform_submissions.normalized_timings', normalized_time, tags=base_tags)
        datadog_counter('commcare.xform_submissions.case_count', len(result.cases), tags=base_tags)
        datadog_counter('commcare.xform_submissions.ledger_count', len(result.ledgers), tags=base_tags)
def __record_datadog_export(duration, doc_bytes, n_rows, metric, tags):
    datadog_histogram(metric, duration, tags=tags)
    if doc_bytes:
        datadog_histogram(
            '{}_normalized_by_size'.format(metric),
            duration / doc_bytes,
            tags=tags,
        )
    if n_rows:
        datadog_histogram(
            '{}_normalized_by_rows'.format(metric),
            duration / n_rows,
            tags=tags,
        )
def __record_datadog_export(duration, doc_bytes, n_rows, metric, tags):
    datadog_histogram(metric, duration, tags=tags)
    if doc_bytes:
        datadog_histogram(
            '{}_normalized_by_size'.format(metric),
            duration // doc_bytes,
            tags=tags,
        )
    if n_rows:
        datadog_histogram(
            '{}_normalized_by_rows'.format(metric),
            duration // n_rows,
            tags=tags,
        )
def __record_datadog_export(duration, doc_bytes, n_rows, metric, tags):
    # deprecated: datadog histograms used this way are usually meaningless
    # because they are aggregated on a 10s window (so unless you're constantly
    # processing at least 100 measurements per 10s window then the percentile
    # will approximate the average). Also more nuanced reasons. See
    # https://confluence.dimagi.com/x/4ArDAQ#TroubleshootingPasteboard/HQchatdumpingground-Datadog
    datadog_histogram(metric, duration, tags=tags)
    if doc_bytes:
        datadog_histogram(
            '{}_normalized_by_size'.format(metric),
            duration // doc_bytes,
            tags=tags,
        )
    if n_rows:
        datadog_histogram(
            '{}_normalized_by_rows'.format(metric),
            duration // n_rows,
            tags=tags,
        )
def _custom_data(self, selected_location, domain):
    data = {}
    for config in self.sources['data_source']:
        filters = {}
        if selected_location:
            key = selected_location.location_type.name.lower() + '_id'
            filters = {
                key: [Choice(value=selected_location.location_id, display=selected_location.name)]
            }
        if 'date_filter_field' in config:
            filters.update({config['date_filter_field']: self.config['date_span']})
        if 'filter' in config:
            for fil in config['filter']:
                if 'type' in fil:
                    now = datetime.now()
                    start_date = now if 'start' not in fil else now - timedelta(days=fil['start'])
                    end_date = now if 'end' not in fil else now - timedelta(days=fil['end'])
                    datespan = DateSpan(start_date, end_date)
                    filters.update({fil['column']: datespan})
                else:
                    filters.update({
                        fil['column']: {
                            'operator': fil['operator'],
                            'operand': fil['value']
                        }
                    })

        timer = TimingContext()
        with timer:
            report_data = ICDSData(domain, filters, config['id']).data()
        if selected_location:
            loc_type = selected_location.location_type.name
        else:
            loc_type = None
        datadog_histogram(
            "commcare.icds.block_reports.ucr_querytime",
            timer.duration,
            tags="config:{}, location_type:{}, report_slug:{}".format(
                config['id'], loc_type, self.slug
            )
        )

        for column in config['columns']:
            column_agg_func = column['agg_fun']
            column_name = column['column_name']
            column_data = 0
            if column_agg_func == 'sum':
                column_data = sum([x.get(column_name, 0) or 0 for x in report_data])
            elif column_agg_func == 'count':
                column_data = len(report_data)
            elif column_agg_func == 'count_if':
                value = column['condition']['value']
                op = column['condition']['operator']

                def check_condition(v):
                    if isinstance(v, six.string_types):
                        soft_assert_type_text(v)
                        fil_v = str(value)
                    elif isinstance(v, int):
                        fil_v = int(value)
                    else:
                        fil_v = value

                    if op == "in":
                        return OPERATORS[op](fil_v, v)
                    else:
                        return OPERATORS[op](v, fil_v)

                column_data = len([val for val in report_data if check_condition(val[column_name])])
            elif column_agg_func == 'avg':
                values = [x.get(column_name, 0) for x in report_data]
                column_data = sum(values) / (len(values) or 1)
            column_display = column_name if 'column_in_report' not in column else column['column_in_report']
            data.update({
                column_display: data.get(column_display, 0) + column_data
            })
    return data
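# The count_if branch above depends on a module-level OPERATORS lookup that is
# not shown in the snippet. A plausible minimal version is sketched below,
# assuming it maps operator strings to binary predicates; the real mapping may
# use different names or include more operators.
import operator

OPERATORS = {
    '==': operator.eq,
    '!=': operator.ne,
    '>': operator.gt,
    '>=': operator.ge,
    '<': operator.lt,
    '<=': operator.le,
    'in': operator.contains,  # called as OPERATORS['in'](fil_v, v), i.e. "v in fil_v"
}

print(OPERATORS['>='](5, 3))          # True
print(OPERATORS['in']([1, 2, 3], 2))  # True: 2 is in [1, 2, 3]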
def _build_async_indicators(indicator_doc_ids):
    def handle_exception(exception, config_id, doc, adapter):
        metric = None
        if isinstance(exception, (ProtocolError, ReadTimeout)):
            metric = 'commcare.async_indicator.riak_error'
        elif isinstance(exception, (ESError, ConnectionTimeout)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.es_error'
        elif isinstance(exception, (DatabaseError, InternalError)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.psql_error'
        else:
            # getting the config could fail before the adapter is set
            if adapter:
                adapter.handle_exception(doc, exception)
        if metric:
            datadog_counter(metric, 1, tags={
                'config_id': config_id,
                'doc_id': doc['_id']
            })

    def doc_ids_from_rows(rows):
        formatted_rows = [{
            column.column.database_column_name.decode('utf-8'): column.value
            for column in row
        } for row in rows]
        return set(row['doc_id'] for row in formatted_rows)

    # tracks processed/deleted configs to be removed from each indicator
    configs_to_remove_by_indicator_id = defaultdict(list)

    def _mark_config_to_remove(config_id, indicator_ids):
        for _id in indicator_ids:
            configs_to_remove_by_indicator_id[_id].append(config_id)

    timer = TimingContext()
    lock_keys = [
        get_async_indicator_modify_lock_key(indicator_id)
        for indicator_id in indicator_doc_ids
    ]
    with CriticalSection(lock_keys):
        all_indicators = AsyncIndicator.objects.filter(doc_id__in=indicator_doc_ids)
        if not all_indicators:
            return

        doc_store = get_document_store_for_doc_type(all_indicators[0].domain, all_indicators[0].doc_type)
        failed_indicators = set()
        rows_to_save_by_adapter = defaultdict(list)
        indicator_by_doc_id = {i.doc_id: i for i in all_indicators}
        config_ids = set()

        with timer:
            for doc in doc_store.iter_documents(list(indicator_by_doc_id.keys())):
                indicator = indicator_by_doc_id[doc['_id']]
                eval_context = EvaluationContext(doc)
                for config_id in indicator.indicator_config_ids:
                    config_ids.add(config_id)
                    try:
                        config = _get_config_by_id(config_id)
                    except (ResourceNotFound, StaticDataSourceConfigurationNotFoundError):
                        celery_task_logger.info("{} no longer exists, skipping".format(config_id))
                        # remove because the config no longer exists
                        _mark_config_to_remove(config_id, [indicator.pk])
                        continue
                    except ESError:
                        celery_task_logger.info("ES errored when trying to retrieve config")
                        failed_indicators.add(indicator)
                        continue
                    adapter = None
                    try:
                        adapter = get_indicator_adapter(config)
                        rows_to_save_by_adapter[adapter].extend(adapter.get_all_values(doc, eval_context))
                        eval_context.reset_iteration()
                    except Exception as e:
                        failed_indicators.add(indicator)
                        handle_exception(e, config_id, doc, adapter)

            for adapter, rows in six.iteritems(rows_to_save_by_adapter):
                doc_ids = doc_ids_from_rows(rows)
                indicators = [indicator_by_doc_id[doc_id] for doc_id in doc_ids]
                try:
                    adapter.save_rows(rows)
                except Exception as e:
                    failed_indicators.union(indicators)
                    message = six.text_type(e)
                    notify_exception(None, "Exception bulk saving async indicators:{}".format(message))
                else:
                    # remove because it's successfully processed
                    _mark_config_to_remove(config_id, [i.pk for i in indicators])

        # delete fully processed indicators
        processed_indicators = set(all_indicators) - failed_indicators
        AsyncIndicator.objects.filter(pk__in=[i.pk for i in processed_indicators]).delete()

        # update failure for failed indicators
        with transaction.atomic():
            for indicator in failed_indicators:
                indicator.update_failure(configs_to_remove_by_indicator_id.get(indicator.pk, []))
                indicator.save()

    datadog_counter('commcare.async_indicator.processed_success', len(processed_indicators))
    datadog_counter('commcare.async_indicator.processed_fail', len(failed_indicators))
    datadog_histogram(
        'commcare.async_indicator.processing_time', timer.duration / len(indicator_doc_ids),
        tags=[
            'config_ids:{}'.format(config_ids),
        ])
def _build_async_indicators(indicator_doc_ids):
    def handle_exception(exception, config_id, doc, adapter):
        metric = None
        if isinstance(exception, (ProtocolError, ReadTimeout)):
            metric = 'commcare.async_indicator.riak_error'
        elif isinstance(exception, (ESError, ConnectionTimeout)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.es_error'
        elif isinstance(exception, (DatabaseError, InternalError)):
            # a database had an issue so log it and go on to the next document
            metric = 'commcare.async_indicator.psql_error'
        else:
            # getting the config could fail before the adapter is set
            if adapter:
                adapter.handle_exception(doc, exception)
        if metric:
            datadog_counter(metric, 1, tags={'config_id': config_id, 'doc_id': doc['_id']})

    def doc_ids_from_rows(rows):
        formatted_rows = [
            {column.column.database_column_name.decode('utf-8'): column.value for column in row}
            for row in rows
        ]
        return set(row['doc_id'] for row in formatted_rows)

    # tracks processed/deleted configs to be removed from each indicator
    configs_to_remove_by_indicator_id = defaultdict(list)

    def _mark_config_to_remove(config_id, indicator_ids):
        for _id in indicator_ids:
            configs_to_remove_by_indicator_id[_id].append(config_id)

    timer = TimingContext()
    lock_keys = [
        get_async_indicator_modify_lock_key(indicator_id)
        for indicator_id in indicator_doc_ids
    ]
    with CriticalSection(lock_keys):
        all_indicators = AsyncIndicator.objects.filter(
            doc_id__in=indicator_doc_ids
        )
        if not all_indicators:
            return

        doc_store = get_document_store_for_doc_type(
            all_indicators[0].domain, all_indicators[0].doc_type,
            load_source="build_async_indicators",
        )
        failed_indicators = set()
        rows_to_save_by_adapter = defaultdict(list)
        indicator_by_doc_id = {i.doc_id: i for i in all_indicators}
        config_ids = set()

        with timer:
            for doc in doc_store.iter_documents(list(indicator_by_doc_id.keys())):
                indicator = indicator_by_doc_id[doc['_id']]
                eval_context = EvaluationContext(doc)
                for config_id in indicator.indicator_config_ids:
                    config_ids.add(config_id)
                    try:
                        config = _get_config_by_id(config_id)
                    except (ResourceNotFound, StaticDataSourceConfigurationNotFoundError):
                        celery_task_logger.info("{} no longer exists, skipping".format(config_id))
                        # remove because the config no longer exists
                        _mark_config_to_remove(config_id, [indicator.pk])
                        continue
                    except ESError:
                        celery_task_logger.info("ES errored when trying to retrieve config")
                        failed_indicators.add(indicator)
                        continue
                    adapter = None
                    try:
                        adapter = get_indicator_adapter(config, load_source='build_async_indicators')
                        rows_to_save_by_adapter[adapter].extend(adapter.get_all_values(doc, eval_context))
                        eval_context.reset_iteration()
                    except Exception as e:
                        failed_indicators.add(indicator)
                        handle_exception(e, config_id, doc, adapter)

            for adapter, rows in six.iteritems(rows_to_save_by_adapter):
                doc_ids = doc_ids_from_rows(rows)
                indicators = [indicator_by_doc_id[doc_id] for doc_id in doc_ids]
                try:
                    adapter.save_rows(rows)
                except Exception as e:
                    failed_indicators.union(indicators)
                    message = six.text_type(e)
                    notify_exception(None, "Exception bulk saving async indicators:{}".format(message))
                else:
                    # remove because it's successfully processed
                    _mark_config_to_remove(
                        config_id,
                        [i.pk for i in indicators]
                    )

        # delete fully processed indicators
        processed_indicators = set(all_indicators) - failed_indicators
        AsyncIndicator.objects.filter(pk__in=[i.pk for i in processed_indicators]).delete()

        # update failure for failed indicators
        with transaction.atomic():
            for indicator in failed_indicators:
                indicator.update_failure(
                    configs_to_remove_by_indicator_id.get(indicator.pk, [])
                )
                indicator.save()

    datadog_counter('commcare.async_indicator.processed_success', len(processed_indicators))
    datadog_counter('commcare.async_indicator.processed_fail', len(failed_indicators))
    datadog_histogram(
        'commcare.async_indicator.processing_time', timer.duration / len(indicator_doc_ids),
        tags=[
            'config_ids:{}'.format(config_ids),
        ]
    )