def check_repeaters():
    """Attempt to forward every repeat record returned by ``iterate_repeat_records``.

    A non-blocking redis lock keyed on ``CHECK_REPEATERS_KEY`` guards the run;
    if the lock is already held a "locked_out" counter is emitted and the
    function returns without doing anything. Iteration stops (with a soft
    assert) once six hours have elapsed since the start. The lock is always
    released on exit.
    """
    started_at = datetime.utcnow()
    max_runtime_sec = 6 * 60 * 60
    deadline = started_at + timedelta(seconds=max_runtime_sec)

    # Long timeout to allow all waiting repeat records to be iterated
    lock = get_redis_lock(
        CHECK_REPEATERS_KEY,
        timeout=max_runtime_sec,
        name=CHECK_REPEATERS_KEY,
    )
    acquired = lock.acquire(blocking=False)
    if not acquired:
        datadog_counter("commcare.repeaters.check.locked_out")
        return

    try:
        with datadog_bucket_timer(
            "commcare.repeaters.check.processing",
            tags=[],
            timing_buckets=_check_repeaters_buckets,
        ):
            for repeat_record in iterate_repeat_records(started_at):
                out_of_time = datetime.utcnow() > deadline
                if out_of_time:
                    _soft_assert(False, "I've been iterating repeat records for six hours. I quit!")
                    break
                datadog_counter("commcare.repeaters.check.attempt_forward")
                repeat_record.attempt_forward_now()
    finally:
        lock.release()
def set_cached_payload_if_necessary(self, fileobj, duration, is_async):
    """Cache the restore payload when the restore qualifies for caching.

    The payload is cached when the restore is async, when ``self.force_cache``
    is set, or when ``duration`` exceeds ``INITIAL_SYNC_CACHE_THRESHOLD``
    seconds.

    :param fileobj: payload file object handed to ``CachedResponse.save_for_later``.
    :param duration: ``timedelta`` the restore took.
    :param is_async: whether the restore was produced asynchronously.
    :returns: the saved ``CachedResponse``, or ``None`` when nothing was cached.
    """
    # only cache if the duration was longer than the threshold
    is_long_restore = duration > timedelta(seconds=INITIAL_SYNC_CACHE_THRESHOLD)
    if not (is_async or self.force_cache or is_long_restore):
        return None

    # Exactly one reason is reported, in priority order. The guard above
    # guarantees that at least one of the three conditions holds, so the
    # final branch can only be the "long restore" case.
    if is_async:
        type_ = 'async'
    elif self.force_cache:
        type_ = 'force'
    else:
        type_ = 'long'

    # Tags are passed as a list (not a set) to match the other metric call
    # sites; statsd clients may concatenate tag lists, which fails for sets.
    tags = ['type:{}'.format(type_)]
    datadog_counter('commcare.restores.cache_writes', tags=tags)

    response = CachedResponse.save_for_later(
        fileobj,
        self.cache_timeout,
        self.domain,
        self.restore_user.user_id,
    )
    self.restore_payload_path_cache.set_value(response.name, self.cache_timeout)
    return response
def __del__(self):
    """Report a lock that is garbage collected while its timer is still running.

    Emits the "not_released" counter when unreleased tracking is enabled and
    the lock timer was started; any open lock trace is tagged, finished and
    cleared.
    """
    still_held = self.track_unreleased and self.lock_timer.is_started()
    if still_held:
        datadog_counter("commcare.lock.not_released", tags=self.tags)

    if self.lock_trace is None:
        return
    self.lock_trace.set_tag("deleted", "not_released")
    self.lock_trace.finish()
    self.lock_trace = None
def delete_old_images():
    """Delete up to 1000 old 'icds-cas' JPEG form attachments per partition.

    Blob metadata older than 90 days is selected from every partitioned
    database and bulk-deleted through the blob db. If any partition had rows
    to delete, the task re-queues itself via ``delete_old_images.delay()``.
    """
    now = datetime.utcnow()
    cutoff = now - timedelta(days=90)
    blob_db = get_blob_db()

    def _get_query(db_name, max_age=cutoff):
        return BlobMeta.objects.using(db_name).filter(
            content_type='image/jpeg',
            type_code=CODES.form_attachment,
            domain='icds-cas',
            created_on__lt=max_age
        )

    deleted_anything = False
    for db_name in get_db_aliases_for_partitioned_query():
        batch = list(_get_query(db_name)[:1000])
        if not batch:
            continue
        # content_length may be unset; count those rows as zero bytes
        total_bytes = sum(meta.content_length or 0 for meta in batch)
        blob_db.bulk_delete(metas=batch)
        datadog_counter('commcare.icds_images.bytes_deleted', value=total_bytes)
        datadog_counter('commcare.icds_images.count_deleted', value=len(batch))
        deleted_anything = True

    if deleted_anything:
        delete_old_images.delay()
def reconcile_actions_if_necessary(self, xform):
    """Reconcile (with rebuild) when ``check_action_order`` reports a problem.

    :param xform: form passed to ``reconcile_actions`` keyed by its ``form_id``.
    """
    if self.check_action_order():
        return

    datadog_counter("commcare.form_processor.couch.reconcile_actions")
    forms_by_id = {xform.form_id: xform}
    try:
        self.reconcile_actions(rebuild=True, xforms=forms_by_id)
    except ReconciliationError:
        # A failed reconciliation is ignored here.
        pass
def _record_batch_exception_in_datadog(self, processor):
    """Count a batch processor exception, tagged by pillow and processor.

    :param processor: the failing processor, or a falsy value to tag the
        metric with "all_processors".
    """
    pillow_tag = 'pillow_name:{}'.format(self.get_name())
    if processor:
        processor_name = processor.__class__.__name__
    else:
        processor_name = "all_processors"
    processor_tag = 'processor:{}'.format(processor_name)
    datadog_counter(
        "commcare.change_feed.batch_processor_exceptions",
        tags=[pillow_tag, processor_tag],
    )
def degraded(self):
    """Record that this lock "degraded gracefully".

    Used when the lock could not be acquired but the caller carried on as
    though it had been.
    """
    metric = "commcare.lock.degraded"
    datadog_counter(metric, tags=self.tags)
def report_shard_failures(search_result):
    """Count one partial-results event when an ES result reports failed shards.

    Non-dict results are ignored.
    """
    if not isinstance(search_result, dict):
        return

    shards = search_result.get('_shards', {})
    if shards.get('failed'):
        datadog_counter('commcare.es.partial_results', value=1)
def reprocess_submission(submssion_stub_id):
    """Reprocess one unfinished submission stub inside a critical section.

    Returns silently if the stub no longer exists; otherwise reprocesses it
    and increments the reprocessing counter.
    """
    lock_key = 'reprocess_submission_%s' % submssion_stub_id
    with CriticalSection([lock_key]):
        try:
            unfinished = UnfinishedSubmissionStub.objects.get(id=submssion_stub_id)
        except UnfinishedSubmissionStub.DoesNotExist:
            return

        reprocess_unfinished_stub(unfinished)
        datadog_counter('commcare.submission_reprocessing.count')
def maybe_not_found(throw=None):
    """Generator body that absorbs "not found" ``ClientError``s.

    Any other ``ClientError`` is re-raised unchanged. A not-found error is
    counted and then either swallowed or replaced by ``throw``.

    :param throw: optional exception to raise in place of a not-found error.
    """
    try:
        yield
    except ClientError as err:
        if is_not_found(err):
            datadog_counter('commcare.blobdb.notfound')
            if throw is not None:
                raise throw
        else:
            raise
def release(self):
    """Release the wrapped lock and close out its timer and trace.

    Emits "released_after_timeout" when ``end_time`` is set and has already
    passed at the moment of release.
    """
    self.lock.release()

    timer = self.lock_timer
    if timer.is_started():
        timer.stop()

    expired = self.end_time and time.time() > self.end_time
    if expired:
        datadog_counter("commcare.lock.released_after_timeout", tags=self.tags)

    if self.lock_trace is None:
        return
    self.lock_trace.finish()
    self.lock_trace = None
def _record_checkpoint_in_datadog(self):
    """Count a checkpoint and gauge the offset stored for each topic."""
    datadog_counter(
        'commcare.change_feed.change_feed.checkpoint',
        tags=['pillow_name:{}'.format(self.get_name())],
    )

    offsets = self._normalize_checkpoint_sequence()
    for topic, offset in six.iteritems(offsets):
        gauge_tags = [
            'pillow_name:{}'.format(self.get_name()),
            _topic_for_ddog(topic),
        ]
        datadog_gauge('commcare.change_feed.checkpoint_offsets', offset, tags=gauge_tags)
def reconcile_transactions_if_necessary(self):
    """Reconcile the case's transactions when they are out of order.

    :returns: ``False`` when ``check_transaction_order`` reports the
        transactions are already in order; ``True`` when a reconciliation was
        attempted (whether or not it succeeded).
    """
    if self.case.check_transaction_order():
        return False

    datadog_counter("commcare.form_processor.sql.reconciling_transactions")
    try:
        self.reconcile_transactions()
    except ReconciliationError as e:
        # Format the exception itself: ``e.message`` does not exist on
        # Python 3 exceptions and would raise AttributeError here, hiding
        # the original reconciliation failure.
        reconciliation_soft_assert(False, "ReconciliationError: %s" % e)
    return True
def commit_migration(domain_name):
    """Flag the domain as using the SQL backend and record the commit.

    If ``should_use_sql_backend`` still disagrees after saving, the cached
    ``Domain.get_by_name`` entry is cleared and the check is asserted again.
    """
    domain = Domain.get_by_name(domain_name, strict=True)
    domain.use_sql_backend = True
    domain.save()
    clear_local_domain_sql_backend_override(domain_name)

    backend_is_sql = should_use_sql_backend(domain_name)
    if not backend_is_sql:
        # a stale cached Domain may still report the old backend
        Domain.get_by_name.clear(Domain, domain_name)
        assert should_use_sql_backend(domain_name)

    datadog_counter("commcare.couch_sql_migration.total_committed")
    _logger.info("committed migration for {}".format(domain_name))
def _log_processed_docs_count(self, tags, throttled=False):
    """Flush the processed-docs tally to the metric and reset it to zero.

    :param tags: tags to attach to the counter.
    :param throttled: when true, skip the flush until at least 100 docs
        have accumulated.
    """
    below_threshold = self.processed_docs < 100
    if throttled and below_threshold:
        return

    flushed, self.processed_docs = self.processed_docs, 0
    datadog_counter(
        "commcare.couchsqlmigration.processed_docs",
        value=flushed,
        tags=tags,
    )
def get_context(self, record):
    """Build the template context for an error report about ``record``.

    The context contains the subject, message, details, traceback lines
    (most recent call first), a sanitized request repr and, when the record
    carries a request, request details. An error-count metric tagged with the
    sanitized URL, URL group and domain is emitted for records with a request.

    :param record: the log record being reported.
    :returns: a ``defaultdict`` whose missing keys resolve to ``''``.
    """
    from corehq.util.datadog.gauges import datadog_counter

    try:
        request = record.request
    except Exception:
        request = None
    request_repr = get_sanitized_request_repr(request)

    tb_list = []
    code = None
    if not record.exc_info:
        stack_trace = 'No stack trace available'
        subject = '%s: %s' % (record.levelname, record.getMessage())
    else:
        etype, _value, tb = record.exc_info
        value = clean_exception(_value)
        tb_list = ['Traceback (most recent call first):\n']
        formatted_exception = traceback.format_exception_only(etype, value)
        tb_list.extend(formatted_exception)
        # reverse the frames so the most recent call is listed first
        extracted_tb = list(reversed(traceback.extract_tb(tb)))
        code = self.get_code(extracted_tb)
        tb_list.extend(traceback.format_list(extracted_tb))
        stack_trace = '\n'.join(tb_list)
        if formatted_exception:
            headline = formatted_exception[0].strip()
        else:
            headline = record.getMessage()
        subject = '%s: %s' % (record.levelname, headline)

    context = defaultdict(str)
    context.update({
        'subject': self.format_subject(subject),
        'message': record.getMessage(),
        'details': getattr(record, 'details', None),
        'tb_list': tb_list,
        'request_repr': request_repr,
        'stack_trace': stack_trace,
        'code': code,
    })
    if not request:
        return context

    sanitized_url = sanitize_url(request.build_absolute_uri())
    metric_tags = [
        'url:{}'.format(sanitized_url),
        'group:{}'.format(get_url_group(sanitized_url)),
        'domain:{}'.format(getattr(request, 'domain', DATADOG_UNKNOWN)),
    ]
    datadog_counter(ERROR_COUNT, tags=metric_tags)

    context.update({
        'get': list(request.GET.items()),
        'post': SafeExceptionReporterFilter().get_post_parameters(request),
        'method': request.method,
        'username': request.user.username if getattr(request, 'user', None) else "",
        'url': request.build_absolute_uri(),
    })
    return context
def _inner(*args, **kwargs):
    """Call the wrapped ``fn`` and count its response by status code.

    Failures while recording the metric are logged and never propagate.
    """
    response = fn(*args, **kwargs)
    try:
        status_tag = 'status_code:{}'.format(response.status_code)
        datadog_counter(metric_name, tags=[status_tag])
    except Exception:
        datadog_logger.exception('Unable to record Datadog stats')
    return response
def _inner(*args, **kwargs):
    """Invoke ``fn`` and emit ``metric_name`` tagged with the response status.

    Any exception raised while emitting the metric is logged, not raised.
    """
    result = fn(*args, **kwargs)
    try:
        tags = ['status_code:{}'.format(result.status_code)]
        datadog_counter(metric_name, tags=tags)
    except Exception:
        datadog_logger.exception('Unable to record Datadog stats')
    return result
def reconcile_transactions_if_necessary(self):
    """Reconcile the case's transactions unless they are already in order.

    :returns: ``False`` if no reconciliation was needed, otherwise ``True``
        (including when reconciliation failed and was soft-asserted).
    """
    already_ordered = self.case.check_transaction_order()
    if already_ordered:
        return False

    datadog_counter("commcare.form_processor.sql.reconcile_transactions")
    try:
        self.reconcile_transactions()
    except ReconciliationError as e:
        failure_message = "ReconciliationError: %s" % six.text_type(e)
        reconciliation_soft_assert(False, failure_message)
    return True
def _s3_bucket(self, create=False):
    """Return the S3 ``Bucket`` resource, optionally creating the bucket first.

    :param create: when true and the bucket has not yet been confirmed to
        exist, issue ``head_bucket`` and create the bucket if it is missing.
    :returns: ``self.db.Bucket(self.s3_bucket_name)``.
    :raises ClientError: for any ``head_bucket`` failure other than
        "not found".
    """
    if create and not self._s3_bucket_exists:
        try:
            self.db.meta.client.head_bucket(Bucket=self.s3_bucket_name)
        except ClientError as err:
            if not is_not_found(err):
                # Unexpected failure (permissions, networking, ...): propagate
                # it without recording it as a "notfound" event.
                raise
            # The bucket is genuinely missing: this is the case the
            # "notfound" metric describes. Count it, then create the bucket.
            datadog_counter('commcare.blobdb.notfound')
            self.db.create_bucket(Bucket=self.s3_bucket_name)
        self._s3_bucket_exists = True
    return self.db.Bucket(self.s3_bucket_name)
def bulk_delete(self, metas):
    """Delete blob metadata rows in bulk, one statement per partition db.

    Rows removed from ``blobs_blobmeta`` whose ``expires_on`` is NULL are
    copied (upserted) into ``blobs_deletedblobmeta`` with a deletion
    timestamp. Count and byte metrics are emitted for the whole batch.

    :param metas: A list of `BlobMeta` objects.
    :raises ValueError: if any meta has no ``id`` (was never saved).
    """
    for meta in metas:
        if meta.id is None:
            raise ValueError("cannot delete unsaved BlobMeta")

    delete_blobs_sql = """
    WITH deleted AS (
        DELETE FROM blobs_blobmeta
        WHERE id IN %s
        RETURNING *
    ), ins AS (
        INSERT INTO blobs_deletedblobmeta (
            "id",
            "domain",
            "parent_id",
            "name",
            "key",
            "type_code",
            "created_on",
            "deleted_on"
        ) (
            SELECT
                "id",
                "domain",
                "parent_id",
                "name",
                "key",
                "type_code",
                "created_on",
                %s AS "deleted_on"
            FROM deleted
            WHERE expires_on IS NULL
        ) ON CONFLICT (id) DO UPDATE SET
            name = EXCLUDED.name,
            key = EXCLUDED.key,
            type_code = EXCLUDED.type_code,
            created_on = EXCLUDED.created_on,
            deleted_on = CLOCK_TIMESTAMP()
        WHERE blobs_deletedblobmeta.parent_id = EXCLUDED.parent_id and
            blobs_deletedblobmeta.key = EXCLUDED.key
    ) SELECT COUNT(*) FROM deleted;
    """
    now = _utcnow()

    # group meta ids by parent so they can be routed to the right partition
    ids_by_parent = defaultdict(list)
    for meta in metas:
        ids_by_parent[meta.parent_id].append(meta.id)

    for dbname, split_parent_ids in split_list_by_db_partition(ids_by_parent):
        partition_ids = []
        for parent_id in split_parent_ids:
            partition_ids.extend(ids_by_parent[parent_id])
        with BlobMeta.get_cursor_for_partition_db(dbname) as cursor:
            cursor.execute(delete_blobs_sql, [tuple(partition_ids), now])

    total_bytes = sum(m.content_length for m in metas)
    datadog_counter('commcare.blobs.deleted.count', value=len(metas))
    datadog_counter('commcare.blobs.deleted.bytes', value=total_bytes)
def commit_migration(domain_name):
    """Switch the domain to the SQL backend and record the committed migration.

    When ``should_use_sql_backend`` does not reflect the change, the cached
    ``Domain.get_by_name`` value is cleared and the check must then pass.
    """
    domain = Domain.get_by_name(domain_name, strict=True)
    domain.use_sql_backend = True
    domain.save()
    clear_local_domain_sql_backend_override(domain_name)

    needs_cache_clear = not should_use_sql_backend(domain_name)
    if needs_cache_clear:
        Domain.get_by_name.clear(Domain, domain_name)
        assert should_use_sql_backend(domain_name), \
            "could not set use_sql_backend for domain %s (try again)" % domain_name

    datadog_counter("commcare.couch_sql_migration.total_committed")
    log.info("committed migration for {}".format(domain_name))
def _rate_limit_exc(exc_info):
    """Decide whether reporting of this exception should be rate limited.

    :param exc_info: ``(type, value, traceback)`` triple.
    :returns: ``False`` when the exception has no rate-limit key, otherwise
        the result of ``is_rate_limited`` for the key's "_down" bucket.
    """
    _, exc_value, _ = exc_info
    rate_limit_key = _get_rate_limit_key(exc_info)
    if not rate_limit_key:
        return False

    service_tag = 'service:{}'.format(rate_limit_key)
    datadog_counter('commcare.sentry.errors.rate_limited', tags=[service_tag])
    if is_pg_cancelled_query_exception(exc_value):
        datadog_counter('hq_custom.postgres.standby_query_canellations')

    return is_rate_limited('{}_down'.format(rate_limit_key))
def _record_metrics(tags, submission_type, response, timer=None):
    """Count a form submission, tagged by type, status code and duration.

    :param tags: base tags for the metric; the sequence is not modified.
    :param submission_type: value for the ``submission_type`` tag.
    :param response: HTTP response whose ``status_code`` is tagged.
    :param timer: optional timer; its ``duration`` is bucketed into a
        ``duration`` tag for successful (201) submissions.
    """
    # Build a new list instead of using ``tags += [...]``: the in-place form
    # silently appends to the caller's list, so a caller that reuses its tag
    # list would accumulate duplicate tags.
    all_tags = list(tags) + [
        'submission_type:{}'.format(submission_type),
        'status_code:{}'.format(response.status_code),
    ]

    if response.status_code == 201 and timer:
        all_tags.append(
            'duration:%s' % bucket_value(timer.duration, (1, 5, 20, 60, 120, 300, 600), 's')
        )

    datadog_counter('commcare.xform_submissions.count', tags=all_tags)
def get(self, identifier=None, bucket=DEFAULT_BUCKET, key=None):
    """Open the blob stored on disk for reading in binary mode.

    :param identifier: legacy blob identifier (old API).
    :param bucket: legacy bucket name (old API).
    :param key: blob key (new API); must be ``None`` when the legacy
        arguments are used.
    :returns: an open binary file object.
    :raises NotFound: if no file exists at the resolved path.
    """
    uses_key_api = identifier is None and bucket == DEFAULT_BUCKET
    if uses_key_api:
        path = self.get_path(key=key)
    else:
        # legacy: can be removed with old API
        assert key is None, key
        key = join(bucket, identifier)
        path = self.get_path(identifier, bucket)

    if exists(path):
        return open(path, "rb")

    datadog_counter('commcare.blobdb.notfound')
    raise NotFound(key)
def _s3_bucket(self, create=False):
    """Return the S3 ``Bucket`` resource, optionally creating the bucket first.

    Both the ``head_bucket`` probe and the bucket creation are wrapped in
    ``self.report_timing``.

    :param create: when true and the bucket has not yet been confirmed to
        exist, probe for it and create it if it is missing.
    :returns: ``self.db.Bucket(self.s3_bucket_name)``.
    :raises ClientError: for any ``head_bucket`` failure other than
        "not found".
    """
    if create and not self._s3_bucket_exists:
        try:
            with self.report_timing('head_bucket', self.s3_bucket_name):
                self.db.meta.client.head_bucket(Bucket=self.s3_bucket_name)
        except ClientError as err:
            if not is_not_found(err):
                # Unexpected failure: propagate it without counting it as a
                # "notfound" event.
                raise
            # The bucket is genuinely missing: record the not-found event
            # here, then create the bucket.
            datadog_counter('commcare.blobdb.notfound')
            with self.report_timing('create_bucket', self.s3_bucket_name):
                self.db.create_bucket(Bucket=self.s3_bucket_name)
        self._s3_bucket_exists = True
    return self.db.Bucket(self.s3_bucket_name)
def _rate_limit_submission(domain):
    """Report usage if the domain is within its submission limit.

    :returns: ``True`` when the submission should be rate limited (a counter
        is emitted in that case), ``False`` otherwise.
    """
    if submission_rate_limiter.allow_usage(domain):
        submission_rate_limiter.report_usage(domain)
        return False

    domain_tag = 'domain:{}'.format(domain)
    datadog_counter('commcare.xform_submissions.rate_limited', tags=[domain_tag])
    return True
def delete(self, key, content_length):
    """Delete blob metadata for ``key`` and record deletion metrics.

    The deletion is delegated to the ``delete_blob_meta`` SQL function.

    :param key: Blob key string.
    :param content_length: byte size reported to the deleted-bytes metric.
    """
    delete_sql = 'SELECT 1 FROM delete_blob_meta(%s)'
    with get_cursor(BlobMeta) as cursor:
        cursor.execute(delete_sql, [key])

    datadog_counter('commcare.blobs.deleted.count')
    datadog_counter('commcare.blobs.deleted.bytes', value=content_length)
def _send_timings(self, timing_context):
    """Emit a duration-bucketed counter for every timing in the context.

    Timings that carry a truthy ``normalize_denominator`` attribute also get
    a ``.normalized`` counter bucketed by duration divided by that value.
    """
    for timing in timing_context.to_list():
        raw_bucket = bucket_value(timing.duration, TIMING_BUCKETS)
        datadog_counter(
            "commcare.%s.count" % timing.full_name,
            tags=['duration:%s' % raw_bucket],
        )

        denominator = getattr(timing, 'normalize_denominator', None)
        if not denominator:
            continue
        normalized_bucket = bucket_value(
            timing.duration / denominator, NORMALIZED_TIMING_BUCKETS)
        datadog_counter(
            "commcare.%s.count.normalized" % timing.full_name,
            tags=['duration:%s' % normalized_bucket],
        )
def rate_limit_submission_by_delaying(domain, max_wait):
    """Wait (up to ``max_wait``) when the domain is over its submission limit.

    When a wait was needed, a counter tagged with the bucketed wait duration
    (or 'timeout') is emitted. Usage is reported in every case.
    """
    over_limit = not submission_rate_limiter.allow_usage(domain)
    if over_limit:
        with TimingContext() as timer:
            acquired = submission_rate_limiter.wait(domain, timeout=max_wait)

        duration_tag = (
            bucket_value(timer.duration, [1, 5, 10, 15, 20], unit='s')
            if acquired else 'timeout'
        )
        metric_tags = [
            'domain:{}'.format(domain),
            'duration:{}'.format(duration_tag)
        ]
        datadog_counter('commcare.xform_submissions.rate_limited.test', tags=metric_tags)

    submission_rate_limiter.report_usage(domain)
def _rate_limit_restore(domain):
    """Report usage if the domain is within its restore limit.

    :returns: ``True`` when the restore should be rate limited (a counter is
        emitted in that case), ``False`` otherwise.
    """
    if restore_rate_limiter.allow_usage(domain):
        restore_rate_limiter.report_usage(domain)
        return False

    domain_tag = 'domain:{}'.format(domain)
    datadog_counter('commcare.restore.rate_limited', tags=[domain_tag])
    return True
def rate_limit_two_factor_setup(device):
    """Decide whether a two-factor setup request must be rejected.

    Attempts per user and per IP are held below the limits of
    ``two_factor_setup_rate_limiter``, and total requests below the limits of
    ``global_two_factor_setup_rate_limiter``. Requests with no IP address, no
    username, or a device that is not a ``PhoneDevice`` are rejected as bad
    requests.

    :returns: ``True`` if the request is rejected, ``False`` if accepted.
    """
    accepted = 'accepted'
    rate_limited = 'rate_limited'
    bad_request = 'bad_request'

    def get_ip_address():
        request = get_request()
        return get_ip(request) if request else None

    _report_current_global_two_factor_setup_rate_limiter()

    ip_address = get_ip_address()
    username = device.user.username
    method = device.method if isinstance(device, PhoneDevice) else None

    if not (ip_address and username and method):
        status = bad_request
    else:
        within_limits = (
            two_factor_setup_rate_limiter.allow_usage('ip:{}'.format(ip_address))
            and two_factor_setup_rate_limiter.allow_usage('user:{}'.format(username))
            and global_two_factor_setup_rate_limiter.allow_usage()
        )
        if within_limits:
            two_factor_setup_rate_limiter.report_usage('ip:{}'.format(ip_address))
            two_factor_setup_rate_limiter.report_usage('user:{}'.format(username))
            global_two_factor_setup_rate_limiter.report_usage()
            status = accepted
        else:
            status = rate_limited

    metric_tags = [
        'status:{}'.format(status),
        'method:{}'.format(method),
    ]
    datadog_counter('commcare.two_factor.setup_requests', 1, tags=metric_tags)
    return status != accepted
def silence_and_report_error(message, datadog_metric):
    """Generator body that swallows any ``Exception`` raised by the wrapped code.

    The failure is reported through ``notify_exception`` and counted under
    ``datadog_metric``. The exception is re-raised only when
    ``settings.UNIT_TESTING`` is set.
    """
    try:
        yield
    except Exception:
        notify_exception(None, message)
        datadog_counter(datadog_metric)
        running_tests = settings.UNIT_TESTING
        if running_tests:
            raise
def _record_metrics(tags, submission_type, response, result=None, timer=None):
    """Count a form submission, tagged by type, status code and size buckets.

    :param tags: base tags for the metric; the sequence is not modified.
    :param submission_type: value for the ``submission_type`` tag.
    :param response: HTTP response whose ``status_code`` is tagged.
    :param result: optional processing result; ``len(result.cases)`` and
        ``len(result.ledgers)`` are bucketed into tags.
    :param timer: optional timer whose ``duration`` is bucketed into a tag.

    The duration/case/ledger tags are only added for successful (201)
    submissions when both ``timer`` and ``result`` are provided.
    """
    # Build a new list instead of using ``tags += [...]``: the in-place form
    # silently appends to the caller's list, so a caller that reuses its tag
    # list would accumulate duplicate tags.
    all_tags = list(tags) + [
        'submission_type:{}'.format(submission_type),
        'status_code:{}'.format(response.status_code),
    ]

    if response.status_code == 201 and timer and result:
        all_tags += [
            'duration:%s' % bucket_value(timer.duration, (5, 10, 20), 's'),
            'case_count:%s' % bucket_value(len(result.cases), (2, 5, 10)),
            'ledger_count:%s' % bucket_value(len(result.ledgers), (2, 5, 10)),
        ]

    datadog_counter('commcare.xform_submissions.count', tags=all_tags)
def _commit_timing(queryset):
    """Generator body that reports query timings on first evaluation only.

    Timings are emitted when the queryset's result cache was empty on entry
    and has been populated by the time the wrapped code finishes.
    """
    # only send to datadog on initial query evaluation
    was_unevaluated = queryset._mptt_set._result_cache is None
    try:
        yield
    finally:
        evaluated_now = was_unevaluated and queryset._mptt_set._result_cache is not None
        if evaluated_now:
            timing = queryset._timing
            for key in timing.timers:
                duration_bucket = bucket_value(timing.duration(key), TIME_BUCKETS, "s")
                metric = 'commcare.locations.%s.%s.count' % (timing.name, key)
                datadog_counter(metric, tags=['duration:%s' % duration_bucket])
def save_document(doc_ids):
    """Process the async indicators queued for ``doc_ids`` and report metrics.

    All indicators found for the given doc ids must share one domain and one
    doc_type (asserted). Successfully processed indicators are deleted;
    failed ones have their failure recorded and are saved.

    NOTE(review): the nesting below was reconstructed from a whitespace-mangled
    source; in particular, whether the metric calls sit inside or outside the
    critical section should be confirmed against the original file.

    :param doc_ids: ids of the documents whose indicators should be processed.
    """
    # one modify-lock key per document
    lock_keys = []
    for doc_id in doc_ids:
        lock_keys.append(get_async_indicator_modify_lock_key(doc_id))

    indicator_config_ids = None
    timer = TimingContext()
    with CriticalSection(lock_keys):
        indicators = AsyncIndicator.objects.filter(doc_id__in=doc_ids)
        if not indicators:
            # nothing queued for these documents
            return

        first_indicator = indicators[0]
        processed_indicators = []
        failed_indicators = []

        # every indicator in the batch must target the same document store
        for i in indicators:
            assert i.domain == first_indicator.domain
            assert i.doc_type == first_indicator.doc_type

        indicator_by_doc_id = {i.doc_id: i for i in indicators}
        doc_store = get_document_store(first_indicator.domain, first_indicator.doc_type)
        indicator_config_ids = first_indicator.indicator_config_ids

        # only the per-document processing is timed
        with timer:
            for doc in doc_store.iter_documents(indicator_by_doc_id.keys()):
                indicator = indicator_by_doc_id[doc['_id']]
                successfully_processed, to_remove = _save_document_helper(
                    indicator, doc)
                if successfully_processed:
                    processed_indicators.append(indicator.pk)
                else:
                    failed_indicators.append((indicator, to_remove))

        num_processed = len(processed_indicators)
        num_failed = len(failed_indicators)
        AsyncIndicator.objects.filter(pk__in=processed_indicators).delete()
        # failure bookkeeping for the whole batch is saved in one transaction
        with transaction.atomic():
            for indicator, to_remove in failed_indicators:
                indicator.update_failure(to_remove)
                indicator.save()

    datadog_counter('commcare.async_indicator.processed_success', num_processed)
    datadog_counter('commcare.async_indicator.processed_fail', num_failed)
    datadog_histogram('commcare.async_indicator.processing_time', timer.duration,
                      tags=[u'config_ids:{}'.format(indicator_config_ids)])
def report_and_fail_on_shard_failures(search_result):
    """Report shard failures in an ES search result and raise.

    The commcare.es.partial_results metric counts 1 per ES request with any
    shard failure. Non-dict results are ignored.

    :raises ESShardFailure: if the result's ``_shards`` reports failures.
    """
    if not isinstance(search_result, dict):
        return

    if not search_result.get('_shards', {}).get('failed'):
        return

    datadog_counter('commcare.es.partial_results', value=1)
    # Example message:
    #   "_shards: {'successful': 4, 'failed': 1, 'total': 5}"
    shard_summary = search_result.get('_shards')
    raise ESShardFailure('_shards: {!r}'.format(shard_summary))
def celery_record_time_to_start(task_id=None, task=None, **kwargs):
    """Gauge how long a celery task waited before starting.

    If the timing is not available a "time_to_start_unavailable" counter is
    emitted instead. Both metrics are tagged with the task name and queue.
    """
    from corehq.util.datadog.gauges import datadog_gauge, datadog_counter

    tags = [
        'celery_task_name:{}'.format(task.name),
        'celery_queue:{}'.format(task.queue),
    ]

    start_timer = TimeToStartTimer(task_id)
    try:
        waited = start_timer.stop_and_pop_timing()
    except TimingNotAvailable:
        datadog_counter('commcare.celery.task.time_to_start_unavailable', tags=tags)
        return

    datadog_gauge('commcare.celery.task.time_to_start', waited.total_seconds(), tags=tags)
def bulk_delete(self, paths):
    """Delete S3 objects in chunks of 1000 and record deletion metrics.

    For each chunk the objects' sizes are summed (objects that are not found
    are skipped), the chunk is deleted, and count/byte metrics are emitted.

    :param paths: object keys to delete.
    :returns: ``True`` only if every requested key was reported as deleted.
    """
    success = True
    for chunk in chunked(paths, 1000):
        objects = [{"Key": path} for path in chunk]
        bucket = self._s3_bucket()

        chunk_bytes = 0
        for path in chunk:
            with maybe_not_found():
                chunk_bytes += bucket.Object(path).content_length

        resp = bucket.delete_objects(Delete={"Objects": objects})
        deleted = {entry["Key"] for entry in resp.get("Deleted", [])}
        if success:
            success = all(obj["Key"] in deleted for obj in objects)

        datadog_counter('commcare.blobs.deleted.count', value=len(deleted))
        datadog_counter('commcare.blobs.deleted.bytes', value=chunk_bytes)
    return success
def bulk_delete(self, paths):
    """Remove the given files from disk and record deletion metrics.

    :param paths: file paths to remove.
    :returns: ``False`` if any path did not exist, otherwise ``True``.
    """
    all_found = True
    removed_count = 0
    removed_bytes = 0
    for path in paths:
        if exists(path):
            # measure before removing
            stats = _count_size(path)
            removed_count += stats.count
            removed_bytes += stats.size
            os.remove(path)
        else:
            all_found = False

    datadog_counter('commcare.blobs.deleted.count', value=removed_count)
    datadog_counter('commcare.blobs.deleted.bytes', value=removed_bytes)
    return all_found
def should_capture(self, exc_info):
    """Decide whether the exception should be sent to Sentry.

    Capture is refused when the exception sets ``sentry_capture`` to a falsy
    value, when the parent client refuses, or when the exception's rate-limit
    key is currently rate limited.
    """
    exception = exc_info[1]
    if not getattr(exception, 'sentry_capture', True):
        return False

    if not super(HQSentryClient, self).should_capture(exc_info):
        return False

    rate_limit_key = _get_rate_limit_key(exc_info)
    if not rate_limit_key:
        return True

    service_tag = 'service:{}'.format(rate_limit_key)
    datadog_counter('commcare.sentry.errors.rate_limited', tags=[service_tag])
    return not _is_rate_limited(rate_limit_key)
def _record_transient_bounce(self, aws_meta, uid):
    """Store a transient bounce for the email unless one is already recorded.

    A ``TransientBounceEmail`` row is created only when none exists for the
    same email and timestamp. The source message is deleted when
    ``self.delete_processed_messages`` is set.

    NOTE(review): the nesting was reconstructed from a whitespace-mangled
    source; confirm that the counter is meant to fire on every call rather
    than only when a new row is created.

    :param aws_meta: bounce metadata providing ``email``, ``timestamp`` and
        ``headers``.
    :param uid: id of the message, passed to ``_delete_message_with_uid``.
    """
    # avoid recording the same bounce twice
    exists = TransientBounceEmail.objects.filter(
        email=aws_meta.email,
        timestamp=aws_meta.timestamp,
    ).exists()
    if not exists:
        TransientBounceEmail.objects.create(
            email=aws_meta.email,
            timestamp=aws_meta.timestamp,
            headers=aws_meta.headers,
        )
    if self.delete_processed_messages:
        self._delete_message_with_uid(uid)
    datadog_counter(
        'commcare.bounced_email_manager.transient_bounce_recorded')
def _delay_and_report_rate_limit_submission(domain, max_wait, datadog_metric):
    """Wait for submission capacity and report how the wait ended.

    :param domain: domain whose rate limit is being waited on.
    :param max_wait: timeout passed to the rate limiter's ``wait``.
    :param datadog_metric: name of the counter to emit.
    :returns: whatever ``submission_rate_limiter.wait`` returned (truthy when
        capacity was acquired).
    """
    with TimingContext() as timer:
        acquired = submission_rate_limiter.wait(domain, timeout=max_wait)

    if acquired:
        duration_tag = bucket_value(timer.duration, [.5, 1, 5, 10, 15], unit='s')
        throttle_method = "delay"
    else:
        # rejected: distinguish an immediate refusal from a full wait
        duration_tag = 'quick_reject' if timer.duration < max_wait else 'delayed_reject'
        throttle_method = "reject"

    metric_tags = [
        f'domain:{domain}',
        f'duration:{duration_tag}',
        f'throttle_method:{throttle_method}'
    ]
    datadog_counter(datadog_metric, tags=metric_tags)
    return acquired
def delete(self, *args, **kw):
    """Delete a single blob file, or a whole bucket directory.

    When no identifier is resolved from the arguments, the entire bucket
    directory is removed; otherwise only the blob's file. Count and byte
    metrics are recorded before removal.

    :returns: ``False`` if the path does not exist, otherwise ``True``.
    """
    identifier, bucket = self.get_args_for_delete(*args, **kw)
    whole_bucket = identifier is None
    if whole_bucket:
        path = safejoin(self.rootdir, bucket)
    else:
        path = self.get_path(identifier, bucket)

    if not exists(path):
        return False

    stats = _count_size(path)
    datadog_counter('commcare.blobs.deleted.count', value=stats.count)
    datadog_counter('commcare.blobs.deleted.bytes', value=stats.size)

    if whole_bucket:
        shutil.rmtree(path)
    else:
        os.remove(path)
    return True
def __record_change_metric_in_datadog(self, metric, change, timer=None):
    """Count ``metric`` for a change and optionally gauge its processing time.

    Nothing is recorded for changes without metadata.

    :param metric: counter name.
    :param change: change whose metadata supplies the tags.
    :param timer: optional timer; when truthy its ``duration`` is gauged.
    """
    meta = change.metadata
    if meta is None:
        return

    tags = [
        u'datasource:{}'.format(meta.data_source_name),
        u'document_type:{}'.format(meta.document_type),
        u'domain:{}'.format(meta.domain),
        u'is_deletion:{}'.format(meta.is_deletion),
        u'pillow_name:{}'.format(self.get_name())
    ]
    datadog_counter(metric, tags=tags)

    if timer:
        datadog_gauge('commcare.change_feed.processing_time', timer.duration, tags=tags)
def bulk_delete(self, metas):
    """Delete blob metadata in bulk

    Metas are grouped by parent id, routed to their partition database and
    deleted there. Count and byte metrics are emitted for the whole batch.

    :param metas: A list of `BlobMeta` objects.
    :raises ValueError: if any meta has no ``id`` (was never saved).
    """
    if any(meta.id is None for meta in metas):
        raise ValueError("cannot delete unsaved BlobMeta")

    # group meta ids by parent so they can be routed to the right partition
    parents = defaultdict(list)
    for meta in metas:
        parents[meta.parent_id].append(meta.id)

    for db_name, split_parent_ids in split_list_by_db_partition(parents):
        ids = chain.from_iterable(parents[x] for x in split_parent_ids)
        BlobMeta.objects.using(db_name).filter(id__in=list(ids)).delete()

    # Sum each meta's own length. The previous expression read the leaked
    # loop variable ``meta`` inside the generator, so it reported the LAST
    # meta's content_length multiplied by len(metas).
    deleted_bytes = sum(m.content_length for m in metas)
    datadog_counter('commcare.blobs.deleted.count', value=len(metas))
    datadog_counter('commcare.blobs.deleted.bytes', value=deleted_bytes)
def celery_record_time_to_start(task_id=None, task=None, **kwargs):
    """Gauge the delay between a task being sent and it starting.

    The send time is read from the cache key ``task.<task_id>.time_sent``;
    when it is missing a "time_to_start_unavailable" counter is emitted
    instead. Both metrics are tagged with the task name and queue.
    """
    from corehq.util.datadog.gauges import datadog_gauge, datadog_counter

    sent_at = cache.get('task.{}.time_sent'.format(task_id))
    tags = [
        'celery_task_name:{}'.format(task.name),
        'celery_queue:{}'.format(task.queue),
    ]

    if not sent_at:
        datadog_counter('commcare.celery.task.time_to_start_unavailable', tags=tags)
        return

    waited = datetime.datetime.utcnow() - sent_at
    datadog_gauge('commcare.celery.task.time_to_start', waited.total_seconds(), tags=tags)
def bulk_delete(self, metas):
    """Delete blob metadata in bulk

    Rows removed from ``blobs_blobmeta`` whose ``expires_on`` is NULL are
    copied into ``blobs_deletedblobmeta`` with a deletion timestamp. One
    statement is executed per partition database; count and byte metrics are
    emitted for the whole batch.

    :param metas: A list of `BlobMeta` objects.
    :raises ValueError: if any meta has no ``id`` (was never saved).
    """
    if any(meta.id is None for meta in metas):
        raise ValueError("cannot delete unsaved BlobMeta")

    delete_blobs_sql = """
    WITH deleted AS (
        DELETE FROM blobs_blobmeta
        WHERE id IN %s
        RETURNING *
    ), ins AS (
        INSERT INTO blobs_deletedblobmeta (
            "id",
            "domain",
            "parent_id",
            "name",
            "key",
            "type_code",
            "created_on",
            "deleted_on"
        )
        SELECT
            "id",
            "domain",
            "parent_id",
            "name",
            "key",
            "type_code",
            "created_on",
            %s AS "deleted_on"
        FROM deleted
        WHERE expires_on IS NULL
    ) SELECT COUNT(*) FROM deleted;
    """
    now = _utcnow()

    # group meta ids by parent so they can be routed to the right partition
    parents = defaultdict(list)
    for meta in metas:
        parents[meta.parent_id].append(meta.id)

    for dbname, split_parent_ids in split_list_by_db_partition(parents):
        ids = tuple(m for p in split_parent_ids for m in parents[p])
        with connections[dbname].cursor() as cursor:
            cursor.execute(delete_blobs_sql, [ids, now])

    # Sum each meta's own length. The previous expression read the leaked
    # loop variable ``meta`` inside the generator, so it reported the LAST
    # meta's content_length multiplied by len(metas).
    deleted_bytes = sum(m.content_length for m in metas)
    datadog_counter('commcare.blobs.deleted.count', value=len(metas))
    datadog_counter('commcare.blobs.deleted.bytes', value=deleted_bytes)
def put(self, meta):
    """Save `BlobMeta` in the metadata database and record size metrics.

    Metas with an ``expires_on`` value are additionally counted under the
    temp-blob metrics.
    """
    meta.save()
    size = meta.content_length
    datadog_counter('commcare.blobs.added.count')
    datadog_counter('commcare.blobs.added.bytes', value=size)

    if meta.expires_on is None:
        return
    datadog_counter('commcare.temp_blobs.count')
    datadog_counter('commcare.temp_blobs.bytes_added', value=size)
def handle_exception(exception, config_id, doc, adapter):
    """Classify an indicator-processing exception and report it.

    Known infrastructure errors (riak, elasticsearch, postgres) are counted
    under a per-backend metric so processing can go on to the next document.
    Any other exception is handed to ``adapter.handle_exception`` when an
    adapter is available.

    :param exception: the exception that was raised.
    :param config_id: indicator config id, used as a metric tag.
    :param doc: the document being processed; ``doc['_id']`` is used as a tag.
    :param adapter: indicator adapter, or a falsy value if it could not be
        obtained.
    """
    metric = None
    if isinstance(exception, (ProtocolError, ReadTimeout)):
        metric = 'commcare.async_indicator.riak_error'
    elif isinstance(exception, (ESError, ConnectionTimeout)):
        # a database had an issue so log it and go on to the next document
        metric = 'commcare.async_indicator.es_error'
    elif isinstance(exception, (DatabaseError, InternalError)):
        # a database had an issue so log it and go on to the next document
        metric = 'commcare.async_indicator.psql_error'
    else:
        # getting the config could fail before the adapter is set
        if adapter:
            adapter.handle_exception(doc, exception)

    if metric:
        # Tags are sent as a list of 'key:value' strings, like every other
        # metric call site. A dict would be iterated by key only, dropping
        # the config and document ids from the emitted tags.
        datadog_counter(metric, 1, tags=[
            'config_id:{}'.format(config_id),
            'doc_id:{}'.format(doc['_id']),
        ])
def should_capture(self, exc_info):
    """Decide whether the exception should be sent to Sentry.

    Capture is refused when the exception sets ``sentry_capture`` to a falsy
    value, when the parent client refuses, or when the "<key>_down" bucket
    for the exception's rate-limit key is currently rate limited.
    """
    exception = exc_info[1]
    if not getattr(exception, 'sentry_capture', True):
        return False

    if not super(HQSentryClient, self).should_capture(exc_info):
        return False

    rate_limit_key = _get_rate_limit_key(exc_info)
    if not rate_limit_key:
        return True

    service_tag = 'service:{}'.format(rate_limit_key)
    datadog_counter('commcare.sentry.errors.rate_limited', tags=[service_tag])
    backoff_key = '{}_down'.format(rate_limit_key)
    return not is_rate_limited(backoff_key)
def remove_from_queue(queued_sms):
    """Convert a queued SMS into an ``SMS`` record and do post-processing.

    Every field except ``id`` is copied onto a new ``SMS``, the queued entry
    is deleted and the new record saved. Afterwards the change is published,
    billables are created where applicable and outbound success/failure
    counters are emitted.

    NOTE(review): the nesting was reconstructed from a whitespace-mangled
    source; confirm that ``publish_change`` and the billing/metric logic are
    meant to run after the atomic block rather than inside it.

    :param queued_sms: the queued SMS entry to convert.
    """
    with transaction.atomic():
        sms = SMS()
        # copy all fields but the primary key, so the new row gets its own id
        for field in sms._meta.fields:
            if field.name != 'id':
                setattr(sms, field.name, getattr(queued_sms, field.name))
        queued_sms.delete()
        sms.save()

    sms.publish_change()

    if sms.direction == OUTGOING and sms.processed and not sms.error:
        # outbound message that was processed without error
        create_billable_for_sms(sms)
        datadog_counter('commcare.sms.outbound_succeeded')
    elif sms.direction == OUTGOING:
        # any other outbound message counts as a failure
        datadog_counter('commcare.sms.outbound_failed')
    elif sms.direction == INCOMING and sms.domain and domain_has_privilege(sms.domain, privileges.INBOUND_SMS):
        create_billable_for_sms(sms)
def __record_change_metric_in_datadog(self, metric, change, processor=None, processing_time=None):
    """Record count, lag and (optionally) processing time for a change.

    Nothing is recorded for changes without metadata.

    :param metric: counter name.
    :param change: the change being reported.
    :param processor: the processor involved, or a falsy value to count the
        change once per configured processor under "all_processors".
    :param processing_time: when truthy, recorded as a histogram value.
    """
    meta = change.metadata
    if meta is None:
        return

    processor_name = processor.__class__.__name__ if processor else "all_processors"
    tags = [
        'datasource:{}'.format(meta.data_source_name),
        'is_deletion:{}'.format(meta.is_deletion),
        'pillow_name:{}'.format(self.get_name()),
        'processor:{}'.format(processor_name),
    ]
    if processor:
        count = 1
    else:
        count = len(self.processors)
    datadog_counter(metric, value=count, tags=tags)

    # lag between the change being published and now
    lag = datetime.utcnow() - change.metadata.publish_timestamp
    lag_tags = [
        'pillow_name:{}'.format(self.get_name()),
        _topic_for_ddog(change.topic),
    ]
    datadog_gauge('commcare.change_feed.change_lag', lag.total_seconds(), tags=lag_tags)

    if processing_time:
        datadog_histogram('commcare.change_feed.processing_time', processing_time, tags=tags)
def handle_pillow_error(pillow, change, exception):
    """Log and count a pillow processing error and queue it for retry.

    The change's attempt count is always incremented. A ``PillowError`` is
    stored when the pillow retries errors or the exception is a
    ``DocumentMissingError``.
    """
    from pillow_retry.models import PillowError

    log_message = "[%s] Error on change: %s, %s" % (
        pillow.get_name(),
        change['id'],
        exception,
    )
    pillow_logging.exception(log_message)

    pillow_tag = 'pillow_name:{}'.format(pillow.get_name())
    datadog_counter('commcare.change_feed.changes.exceptions', tags=[pillow_tag])

    # keep track of error attempt count
    change.increment_attempt_count()

    # always retry document missing errors, because the error is likely with couch
    should_retry = pillow.retry_errors or isinstance(exception, DocumentMissingError)
    if should_retry:
        error = PillowError.get_or_create(change, pillow)
        error.add_attempt(exception, sys.exc_info()[2], change.metadata)
        error.save()
def _submission_error(request, message, count_metric, metric_tags,
                      domain, app_id, user_id, authenticated, meta=None,
                      status=400, notify=True):
    """Count a submission error, optionally notify, and build the response.

    :param request: the failing request (passed to ``notify_exception``).
    :param message: response body and notification message.
    :param count_metric: counter name for the error.
    :param metric_tags: tags passed on to ``_record_metrics``.
    :param status: HTTP status code (default: 400).
    :param notify: whether to call ``notify_exception``.
    :returns: HTTP response object
    """
    details = [
        "domain:{}".format(domain),
        "app_id:{}".format(app_id),
        "user_id:{}".format(user_id),
        "authenticated:{}".format(authenticated),
        "form_meta:{}".format(meta or {}),
    ]
    datadog_counter(count_metric, tags=details)

    if notify:
        notify_exception(request, message, details)

    error_response = HttpResponseBadRequest(
        message,
        status=status,
        content_type="text/plain",
    )
    _record_metrics(metric_tags, 'unknown', error_response)
    return error_response