def __init__(self, project=None, restore_user=None, params=None, cache_settings=None, is_async=False, case_sync=None): assert isinstance(restore_user, OTARestoreUser) self.project = project self.domain = project.name if project else '' self.restore_user = restore_user self.params = params or RestoreParams() self.cache_settings = cache_settings or RestoreCacheSettings() self.is_async = is_async self.restore_state = RestoreState( self.project, self.restore_user, self.params, is_async, self.cache_settings.overwrite_cache, case_sync=case_sync, ) self.force_cache = self.cache_settings.force_cache self.cache_timeout = self.cache_settings.cache_timeout self.overwrite_cache = self.cache_settings.overwrite_cache self.timing_context = TimingContext('restore-{}-{}'.format( self.domain, self.restore_user.username))
def metrics_histogram_timer(metric: str, timing_buckets: Iterable[int], tags: Dict[str, str] = None, bucket_tag: str = 'duration', callback: Callable = None): """ Create a context manager that times and reports to the metric providers as a histogram Example Usage: :: timer = metrics_histogram_timer('commcare.some.special.metric', tags={ 'type': type, ], timing_buckets=(.001, .01, .1, 1, 10, 100)) with timer: some_special_thing() This will result it a call to `metrics_histogram` with the timer value. Note: Histograms are implemented differently by each provider. See documentation for details. :param metric: Name of the metric (must start with 'commcare.') :param tags: metric tags to include :param timing_buckets: sequence of numbers representing time thresholds, in seconds :param bucket_tag: The name of the bucket tag to use (if used by the underlying provider) :param callback: a callable which will be called when exiting the context manager with a single argument of the timer duratio :return: A context manager that will perform the specified timing and send the specified metric """ timer = TimingContext() original_stop = timer.stop def new_stop(name=None): original_stop(name) if callback: callback(timer.duration) metrics_histogram(metric, timer.duration, bucket_tag=bucket_tag, buckets=timing_buckets, bucket_unit='s', tags=tags) timer_name = metric if metric.startswith('commcare.'): timer_name = ".".join( metric.split('.')[1:]) # remove the 'commcare.' prefix add_breadcrumb( category="timing", message=f"{timer_name}: {timer.duration:0.3f}", level="info", ) timer.stop = new_stop return timer
def process_with_error_handling(self, change, processor=None): # process given change on all serial processors or given processor. # Tracks success/fail in datadog but not the timer metric, caller updates that timer = TimingContext() is_success = False try: with timer: if processor: processor.process_change(change) else: # process on serial processors self.process_change(change, serial_only=True) is_success = True except Exception as ex: try: handle_pillow_error(self, change, ex) except Exception as e: notify_exception( None, 'processor error in pillow {} {}'.format( self.get_name(), e, )) raise if is_success: self._record_change_success_in_datadog(change) return timer.duration
def slow_function_view(request): timer = TimingContext() with timer("sleep"): time.sleep(0.2) response = HttpResponse() response.request_timer = timer return response
def get_context_data(self, **kwargs): context = super(AdminRestoreView, self).get_context_data(**kwargs) response, timing_context = self._get_restore_response() timing_context = timing_context or TimingContext(self.user.username) if isinstance(response, StreamingHttpResponse): string_payload = b''.join(response.streaming_content) xml_payload = etree.fromstring(string_payload) context.update(self.get_stats_from_xml(xml_payload)) else: if response.status_code in (401, 404): # corehq.apps.ota.views.get_restore_response couldn't find user or user didn't have perms xml_payload = E.error(response.content) elif response.status_code == 412: # RestoreConfig.get_response returned HttpResponse 412. Response content is already XML xml_payload = etree.fromstring(response.content) else: message = _( 'Unexpected restore response {}: {}. ' 'If you believe this is a bug please report an issue.' ).format(response.status_code, response.content.decode('utf-8')) xml_payload = E.error(message) formatted_payload = etree.tostring(xml_payload, pretty_print=True, encoding='utf-8').decode('utf-8') hide_xml = self.request.GET.get('hide_xml') == 'true' context.update({ 'payload': formatted_payload, 'status_code': response.status_code, 'timing_data': timing_context.to_list(), 'hide_xml': hide_xml, }) return context
def process_changes(self, since, forever): """ Process changes from the changes stream. """ context = PillowRuntimeContext(changes_seen=0) try: for change in self.get_change_feed().iter_changes(since=since or None, forever=forever): if change: timer = TimingContext() try: context.changes_seen += 1 with timer: self.process_with_error_handling(change) except Exception as e: notify_exception(None, u'processor error in pillow {} {}'.format( self.get_name(), e, )) self._record_change_exception_in_datadog(change) raise else: updated = self.fire_change_processed_event(change, context) if updated: self._record_checkpoint_in_datadog() self._record_change_success_in_datadog(change) self._record_change_in_datadog(change, timer) else: updated = self.checkpoint.touch(min_interval=CHECKPOINT_MIN_WAIT) if updated: self._record_checkpoint_in_datadog() except PillowtopCheckpointReset: self.process_changes(since=self.get_last_checkpoint_sequence(), forever=forever)
def process_xforms_for_cases(xforms, case_db, timing_context=None): from casexml.apps.case.xform import process_cases_with_casedb from corehq.apps.commtrack.processing import process_stock timing_context = timing_context or TimingContext() instance = xforms[0] with timing_context("process_cases"): case_result = process_cases_with_casedb(xforms, case_db) with timing_context("process_ledgers"): stock_result = process_stock(xforms, case_db) stock_result.populate_models() modified_on_date = instance.received_on if getattr(instance, 'edited_on', None) and instance.edited_on > instance.received_on: modified_on_date = instance.edited_on with timing_context("check_cases_before_save"): cases = case_db.get_cases_for_saving(modified_on_date) return CaseStockProcessingResult( case_result=case_result, case_models=cases, stock_result=stock_result, )
def __init__(self, instance=None, attachments=None, auth_context=None, domain=None, app_id=None, build_id=None, path=None, location=None, submit_ip=None, openrosa_headers=None, last_sync_token=None, received_on=None, date_header=None, partial_submission=False, case_db=None, force_logs=False, timing_context=None): assert domain, "'domain' is required" assert instance, instance assert not isinstance(instance, HttpRequest), instance self.domain = domain self.app_id = app_id self.build_id = build_id # get_location has good default self.location = location or couchforms.get_location() self.received_on = received_on self.date_header = date_header self.submit_ip = submit_ip self.last_sync_token = last_sync_token self.openrosa_headers = openrosa_headers or {} self.instance = instance self.attachments = attachments or {} self.auth_context = auth_context or DefaultAuthContext() self.path = path self.interface = FormProcessorInterface(domain) self.partial_submission = partial_submission # always None except in the case where a system form is being processed as part of another submission # e.g. for closing extension cases self.case_db = case_db if case_db: assert case_db.domain == domain self.force_logs = force_logs self.is_openrosa_version3 = self.openrosa_headers.get(OPENROSA_VERSION_HEADER, '') == OPENROSA_VERSION_3 self.track_load = form_load_counter("form_submission", domain) self.timing_context = timing_context or TimingContext()
def get_context_data(self, **kwargs): context = super(AdminRestoreView, self).get_context_data(**kwargs) response, timing_context = self._get_restore_response() timing_context = timing_context or TimingContext(self.user.username) string_payload = ''.join(response.streaming_content) xml_payload = etree.fromstring(string_payload) restore_id_element = xml_payload.find( '{{{0}}}Sync/{{{0}}}restore_id'.format(SYNC_XMLNS)) num_cases = len( xml_payload.findall( '{http://commcarehq.org/case/transaction/v2}case')) formatted_payload = etree.tostring(xml_payload, pretty_print=True) context.update({ 'payload': formatted_payload, 'restore_id': restore_id_element.text if restore_id_element is not None else None, 'status_code': response.status_code, 'timing_data': timing_context.to_list(), 'num_cases': num_cases, }) return context
def migration_restore(request, domain, case_id): """Restore endpoint used in bulk case migrations Accepts the provided case_id and returns a restore for the user containing: * Registration block * The passed in case and its full network of cases """ domain_obj = Domain.get_by_name(domain) restore_user = request.couch_user restore_params = RestoreParams(device_id="case_migration", version=V2) restore_state = RestoreState(domain_obj, restore_user.to_ota_restore_user(domain), restore_params) restore_state.start_sync() timing_context = TimingContext('migration-restore-{}-{}'.format( domain, restore_user.username)) case_ids = get_related_case_ids(domain, case_id) with RestoreContent(restore_user.username) as content: content.append(get_registration_element(restore_user)) sync_payload = CleanOwnerSyncPayload(timing_context, case_ids, restore_state) sync_payload.extend_response(content) payload = content.get_fileobj() restore_state.finish_sync() return RestoreResponse(payload).get_http_response()
def save_document(doc_ids): lock_keys = [] for doc_id in doc_ids: lock_keys.append(get_async_indicator_modify_lock_key(doc_id)) indicator_config_ids = None timer = TimingContext() with CriticalSection(lock_keys): indicators = AsyncIndicator.objects.filter(doc_id__in=doc_ids) if not indicators: return first_indicator = indicators[0] processed_indicators = [] failed_indicators = [] for i in indicators: assert i.domain == first_indicator.domain assert i.doc_type == first_indicator.doc_type indicator_by_doc_id = {i.doc_id: i for i in indicators} doc_store = get_document_store(first_indicator.domain, first_indicator.doc_type) indicator_config_ids = first_indicator.indicator_config_ids related_docs_to_rebuild = set() with timer: for doc in doc_store.iter_documents(indicator_by_doc_id.keys()): indicator = indicator_by_doc_id[doc['_id']] successfully_processed, to_remove, rebuild_related_docs = _save_document_helper( indicator, doc) if rebuild_related_docs: related_docs_to_rebuild = related_docs_to_rebuild.union( icds_get_related_docs_ids(doc['_id'])) if successfully_processed: processed_indicators.append(indicator.pk) else: failed_indicators.append((indicator, to_remove)) num_processed = len(processed_indicators) num_failed = len(failed_indicators) AsyncIndicator.objects.filter(pk__in=processed_indicators).delete() with transaction.atomic(): for indicator, to_remove in failed_indicators: indicator.update_failure(to_remove) indicator.save() # remove any related docs that were just rebuilt related_docs_to_rebuild = related_docs_to_rebuild - set(doc_ids) # queue the docs that aren't already queued _queue_indicators( AsyncIndicator.objects.filter(doc_id__in=related_docs_to_rebuild, date_queued=None)) datadog_counter('commcare.async_indicator.processed_success', num_processed) datadog_counter('commcare.async_indicator.processed_fail', num_failed) datadog_histogram('commcare.async_indicator.processing_time', timer.duration, tags=[u'config_ids:{}'.format(indicator_config_ids)])
def get_context_data(self, **kwargs): context = super(AdminRestoreView, self).get_context_data(**kwargs) response, timing_context = self._get_restore_response() timing_context = timing_context or TimingContext(self.user.username) if isinstance(response, StreamingHttpResponse): string_payload = ''.join(response.streaming_content) xml_payload = etree.fromstring(string_payload) restore_id_element = xml_payload.find('{{{0}}}Sync/{{{0}}}restore_id'.format(SYNC_XMLNS)) num_cases = len(xml_payload.findall('{http://commcarehq.org/case/transaction/v2}case')) else: if response.status_code in (401, 404): # corehq.apps.ota.views.get_restore_response couldn't find user or user didn't have perms xml_payload = E.error(response.content) elif response.status_code == 412: # RestoreConfig.get_response returned HttpResponse 412. Response content is already XML xml_payload = etree.fromstring(response.content) else: message = _('Unexpected restore response {}: {}. ' 'If you believe this is a bug please report an issue.').format(response.status_code, response.content) xml_payload = E.error(message) restore_id_element = None num_cases = 0 formatted_payload = etree.tostring(xml_payload, pretty_print=True) context.update({ 'payload': formatted_payload, 'restore_id': restore_id_element.text if restore_id_element is not None else None, 'status_code': response.status_code, 'timing_data': timing_context.to_list(), 'num_cases': num_cases, }) return context
def submit_case_blocks(self, caseblocks): timer = None try: with TimingContext() as timer: return super().submit_case_blocks(caseblocks) finally: if timer: self._last_submission_duration = timer.duration
def do_import(self, spreadsheet): with TimingContext() as timer: results = super().do_import(spreadsheet) try: self._report_import_timings(timer, results) except Exception: notify_exception(None, "Error reporting case import timings") finally: return results
def run_checks(checks_to_do): greenlets = [] with TimingContext() as timer: for check_name in checks_to_do: if check_name not in CHECKS: raise UnknownCheckException(check_name) greenlets.append(gevent.spawn(_run_check, check_name, timer)) gevent.joinall(greenlets) return [greenlet.value for greenlet in greenlets]
def dispatch(self, request_type, request, **kwargs): if not toggles.ODATA.enabled_for_request(request): raise ImmediateHttpResponse( response=HttpResponseNotFound('Feature flag not enabled.')) self.config_id = kwargs['config_id'] with TimingContext() as timer: response = super(ODataFormResource, self).dispatch(request_type, request, **kwargs) record_feed_access_in_datadog(request, self.config_id, timer.duration, response) return response
def dispatch(self, request_type, request, **kwargs): if not BI_INTEGRATION_PREVIEW.enabled_for_request(request): raise ImmediateHttpResponse( response=HttpResponseNotFound('Feature flag not enabled.')) self.config_id = kwargs['config_id'] self.table_id = int(kwargs.get('table_id', 0)) with TimingContext() as timer: response = super(ODataCaseResource, self).dispatch(request_type, request, **kwargs) record_feed_access_in_datadog(request, self.config_id, timer.duration, response) return response
def dispatch(self, request_type, request, **kwargs): if not domain_has_privilege(request.domain, privileges.ODATA_FEED): raise ImmediateHttpResponse( response=HttpResponseNotFound('Feature flag not enabled.')) self.config_id = kwargs['config_id'] self.table_id = int(kwargs.get('table_id', 0)) with TimingContext() as timer: response = super(BaseODataResource, self).dispatch(request_type, request, **kwargs) record_feed_access_in_datadog(request, self.config_id, timer.duration, response) return response
def datadog_bucket_timer(metric, tags, timing_buckets, callback=None): """ create a context manager that times and reports to datadog using timing buckets adds a 'duration' tag specifying which predefined timing bucket the timing fell into, see the `bucket_value` function for more info. Example Usage: timer = datadog_bucket_timer('commcare.some.special.metric', tags=[ 'type:{}'.format(type), ], timing_buckets=(.001, .01, .1, 1, 10, 100)) with timer: some_special_thing() This will result it a datadog counter metric with a 'duration' tag, with the possible values lt_0.001, lt_0.01, lt_0.1, lt_001, lt_010, lt_100, and over_100. :param metric: Name of the datadog metric (must start with 'commcare.') :param tags: Datadog tags to include :param timing_buckets: sequence of numbers representing time thresholds, in seconds :return: A context manager that will perform the specified timing and send the specified metric """ timer = TimingContext() original_stop = timer.stop def new_stop(name=None): original_stop(name) if callback: callback(timer.duration) datadog_counter(metric, tags=list(tags) + [ 'duration:%s' % bucket_value(timer.duration, timing_buckets, 's') ]) timer.stop = new_stop return timer
def _batch_process_with_error_handling(self, changes_chunk): """ Process given chunk in batch mode first on batch-processors and only latter on serial processors one by one, so that docs can be fetched only once while batch processing in bulk and cached via change.get_document for serial processors. For the relookup to be avoided at least one batch processor under should use change.set_document after docs are fetched in bulk If there is an exception in chunked processing, falls back to serial processing. """ processing_time = 0 def reprocess_serially(chunk, processor): for change in chunk: self.process_with_error_handling(change, processor) for processor in self.batch_processors: if not changes_chunk: return set(), 0 changes_chunk = self._deduplicate_changes(changes_chunk) retry_changes = set() timer = TimingContext() with timer: try: retry_changes, change_exceptions = processor.process_changes_chunk( changes_chunk) except Exception as ex: notify_exception( None, "{pillow_name} Error in processing changes chunk {change_ids}: {ex}" .format(pillow_name=self.get_name(), change_ids=[c.id for c in changes_chunk], ex=ex)) self._record_batch_exception_in_datadog(processor) for change in set(changes_chunk) - set(retry_changes): self._record_change_success_in_datadog( change, processor) # fall back to processing one by one reprocess_serially(changes_chunk, processor) else: # fall back to processing one by one for failed changes for change, exception in change_exceptions: handle_pillow_error(self, change, exception) reprocess_serially(retry_changes, processor) processing_time += timer.duration # process on serial_processors for change in changes_chunk: processing_time += self.process_with_error_handling(change) self._record_datadog_metrics(changes_chunk, processing_time)
def rate_limit_submission_by_delaying(domain, max_wait): if not submission_rate_limiter.allow_usage(domain): with TimingContext() as timer: acquired = submission_rate_limiter.wait(domain, timeout=max_wait) if acquired: duration_tag = bucket_value(timer.duration, [1, 5, 10, 15, 20], unit='s') else: duration_tag = 'timeout' datadog_counter('commcare.xform_submissions.rate_limited.test', tags=[ 'domain:{}'.format(domain), 'duration:{}'.format(duration_tag) ]) submission_rate_limiter.report_usage(domain)
def custom_data(self, selected_location, domain): timer = TimingContext() with timer: to_ret = self._custom_data(selected_location, domain) if selected_location: loc_type = selected_location.location_type.name else: loc_type = None datadog_histogram("commcare.icds.block_reports.custom_data_time", timer.duration, tags="location_type:{}, report_slug:{}".format( loc_type, self.slug)) return to_ret
def datadog_bucket_timer(metric, tags, timing_buckets, callback=None): """ create a context manager that times and reports to datadog using timing buckets adds a 'duration' tag specifying which predefined timing bucket the timing fell into, see the `bucket_value` function for more info. Example Usage: timer = datadog_bucket_timer('commcare.some.special.metric', tags=[ 'type:{}'.format(type), ], timing_buckets=(.001, .01, .1, 1, 10, 100)) with timer: some_special_thing() This will result it a datadog counter metric with a 'duration' tag, with the possible values lt_0.001, lt_0.01, lt_0.1, lt_001, lt_010, lt_100, and over_100. :param metric: Name of the datadog metric (must start with 'commcare.') :param tags: Datadog tags to include :param timing_buckets: sequence of numbers representing time thresholds, in seconds :return: A context manager that will perform the specified timing and send the specified metric """ timer = TimingContext() original_stop = timer.stop def new_stop(name=None): original_stop(name) if callback: callback(timer.duration) datadog_counter( metric, tags=list(tags) + ['duration:%s' % bucket_value(timer.duration, timing_buckets, 's')] ) timer.stop = new_stop return timer
def _delay_and_report_rate_limit_submission(domain, max_wait, datadog_metric): with TimingContext() as timer: acquired = submission_rate_limiter.wait(domain, timeout=max_wait) if acquired: duration_tag = bucket_value(timer.duration, [.5, 1, 5, 10, 15], unit='s') elif timer.duration < max_wait: duration_tag = 'quick_reject' else: duration_tag = 'delayed_reject' datadog_counter(datadog_metric, tags=[ f'domain:{domain}', f'duration:{duration_tag}', f'throttle_method:{"delay" if acquired else "reject"}' ]) return acquired
def migrate(self): self.log_info('migrating domain {}'.format(self.domain)) with TimingContext("couch_sql_migration") as timing_context: self.timing_context = timing_context with timing_context('main_forms'): self._process_main_forms() with timing_context("unprocessed_forms"): self._copy_unprocessed_forms() with timing_context("unprocessed_cases"): self._copy_unprocessed_cases() with timing_context("case_diffs"): self._calculate_case_diffs() self._send_timings(timing_context) self.log_info('migrated domain {}'.format(self.domain))
def get_context_data(self, **kwargs): context = super(AdminRestoreView, self).get_context_data(**kwargs) response, timing_context = self._get_restore_response() timing_context = timing_context or TimingContext(self.user.username) if isinstance(response, StreamingHttpResponse): string_payload = b''.join(response.streaming_content) xml_payload = etree.fromstring(string_payload) restore_id_element = xml_payload.find('{{{0}}}Sync/{{{0}}}restore_id'.format(SYNC_XMLNS)) cases = xml_payload.findall('{http://commcarehq.org/case/transaction/v2}case') num_cases = len(cases) create_case_type = filter(None, [case.find( '{http://commcarehq.org/case/transaction/v2}create/' '{http://commcarehq.org/case/transaction/v2}case_type' ) for case in cases]) update_case_type = filter(None, [case.find(
def run_checks(checks_to_do): statuses = [] with TimingContext() as timer: for check_name in checks_to_do: if check_name not in CHECKS: raise UnknownCheckException(check_name) check_info = CHECKS[check_name] with timer(check_name): try: status = check_info['check_func']() except Exception as e: status = ServiceStatus(False, "{} raised an error".format(check_name), e) status.duration = timer.peek().duration statuses.append((check_name, status)) return statuses
def custom_data(self, selected_location, domain): timer = TimingContext() with timer: to_ret = self._custom_data(selected_location, domain) if selected_location: loc_type = selected_location.location_type.name else: loc_type = None tags = ["location_type:{}".format(loc_type), "report_slug:{}".format(self.slug)] if self.allow_conditional_agg: tags.append("allow_conditional_agg:yes") datadog_histogram( "commcare.icds.block_reports.custom_data_duration", timer.duration, tags=tags ) return to_ret
def process_with_error_handling(self, change, context): timer = TimingContext() try: with timer: self.process_change(change) except Exception as ex: try: handle_pillow_error(self, change, ex) except Exception as e: notify_exception(None, 'processor error in pillow {} {}'.format( self.get_name(), e, )) raise else: self._update_checkpoint(change, context) self._record_change_success_in_datadog(change) self._record_change_in_datadog(change, timer)
def save_document(doc_ids): lock_keys = [] for doc_id in doc_ids: lock_keys.append(get_async_indicator_modify_lock_key(doc_id)) indicator_config_ids = None timer = TimingContext() with CriticalSection(lock_keys): indicators = AsyncIndicator.objects.filter(doc_id__in=doc_ids) if not indicators: return first_indicator = indicators[0] processed_indicators = [] failed_indicators = [] for i in indicators: assert i.domain == first_indicator.domain assert i.doc_type == first_indicator.doc_type indicator_by_doc_id = {i.doc_id: i for i in indicators} doc_store = get_document_store(first_indicator.domain, first_indicator.doc_type) indicator_config_ids = first_indicator.indicator_config_ids with timer: for doc in doc_store.iter_documents(doc_ids): indicator = indicator_by_doc_id[doc['_id']] successfully_processed, to_remove = _save_document_helper(indicator, doc) if successfully_processed: processed_indicators.append(indicator.pk) else: failed_indicators.append((indicator, to_remove)) AsyncIndicator.objects.filter(pk__in=processed_indicators).delete() with transaction.atomic(): for indicator, to_remove in failed_indicators: indicator.update_failure(to_remove) indicator.save() datadog_histogram( 'commcare.async_indicator.processing_time', timer.duration, tags=[ u'config_ids:{}'.format(indicator_config_ids) ] )
def migrate(self): log.info('migrating domain {}'.format(self.domain)) log.info('run timestamp is {}'.format(self.run_timestamp)) self.processed_docs = 0 with TimingContext("couch_sql_migration") as timing_context: self.timing_context = timing_context with timing_context('main_forms'): self._process_main_forms() with timing_context("unprocessed_forms"): self._copy_unprocessed_forms() with timing_context("unprocessed_cases"): self._copy_unprocessed_cases() with timing_context("case_diffs"): self._calculate_case_diffs() self._send_timings(timing_context) log.info('migrated domain {}'.format(self.domain))
def __init__(self, project=None, restore_user=None, params=None, cache_settings=None, is_async=False, case_sync=None): assert isinstance(restore_user, OTARestoreUser) self.project = project self.domain = project.name if project else '' self.restore_user = restore_user self.params = params or RestoreParams() self.cache_settings = cache_settings or RestoreCacheSettings() self.is_async = is_async self.restore_state = RestoreState( self.project, self.restore_user, self.params, is_async, self.cache_settings.overwrite_cache, case_sync=case_sync, ) self.force_cache = self.cache_settings.force_cache self.cache_timeout = self.cache_settings.cache_timeout self.overwrite_cache = self.cache_settings.overwrite_cache self.timing_context = TimingContext('restore-{}-{}'.format(self.domain, self.restore_user.username))
class RestoreConfig(object): """ A collection of attributes associated with an OTA restore :param project: The domain object. An instance of `Domain`. :param restore_user: The restore user requesting the restore :param params: The RestoreParams associated with this (see above). :param cache_settings: The RestoreCacheSettings associated with this (see above). :param is_async: Whether to get the restore response using a celery task :param case_sync: Case sync algorithm (None -> default). """ def __init__(self, project=None, restore_user=None, params=None, cache_settings=None, is_async=False, case_sync=None): assert isinstance(restore_user, OTARestoreUser) self.project = project self.domain = project.name if project else '' self.restore_user = restore_user self.params = params or RestoreParams() self.cache_settings = cache_settings or RestoreCacheSettings() self.is_async = is_async self.restore_state = RestoreState( self.project, self.restore_user, self.params, is_async, self.cache_settings.overwrite_cache, case_sync=case_sync, ) self.force_cache = self.cache_settings.force_cache self.cache_timeout = self.cache_settings.cache_timeout self.overwrite_cache = self.cache_settings.overwrite_cache self.timing_context = TimingContext('restore-{}-{}'.format(self.domain, self.restore_user.username)) @property @memoized def sync_log(self): return self.restore_state.last_sync_log @property def async_restore_task_id_cache(self): return AsyncRestoreTaskIdCache( domain=self.domain, user_id=self.restore_user.user_id, sync_log_id=self.sync_log._id if self.sync_log else '', device_id=self.params.device_id, ) @property def restore_payload_path_cache(self): return RestorePayloadPathCache( domain=self.domain, user_id=self.restore_user.user_id, sync_log_id=self.sync_log._id if self.sync_log else '', device_id=self.params.device_id, ) @property def initial_restore_payload_path_cache(self): return RestorePayloadPathCache( domain=self.domain, user_id=self.restore_user.user_id, sync_log_id='', device_id=self.params.device_id, ) def get_response(self): is_async = self.is_async try: with self.timing_context: payload = self.get_payload() response = payload.get_http_response() except RestoreException as e: logging.exception("%s error during restore submitted by %s: %s" % (type(e).__name__, self.restore_user.username, str(e))) is_async = False response = get_simple_response_xml( e.message, ResponseNature.OTA_RESTORE_ERROR ) response = HttpResponse(response, content_type="text/xml; charset=utf-8", status=412) # precondition failed if not is_async: self._record_timing(response.status_code) return response def get_payload(self): self.validate() self.delete_initial_cached_payload_if_necessary() self.delete_cached_payload_if_necessary() cached_response = self.get_cached_response() tags = [ 'domain:{}'.format(self.domain), 'is_async:{}'.format(bool(self.is_async)), ] if cached_response: datadog_counter('commcare.restores.cache_hits.count', tags=tags) return cached_response datadog_counter('commcare.restores.cache_misses.count', tags=tags) # Start new sync if self.is_async: response = self._get_asynchronous_payload() else: response = self.generate_payload() return response def validate(self): try: self.restore_state.validate_state() except InvalidSyncLogException as e: # This exception will get caught by the view and a 412 will be returned to the phone for resync raise RestoreException(e) def get_cached_response(self): if self.overwrite_cache: return None cache_payload_path = self.restore_payload_path_cache.get_value() return CachedResponse(cache_payload_path) def generate_payload(self, async_task=None): if async_task: self.timing_context.stop("wait_for_task_to_start") self.restore_state.start_sync() fileobj = self._generate_restore_response(async_task=async_task) try: self.restore_state.finish_sync() cached_response = self.set_cached_payload_if_necessary( fileobj, self.restore_state.duration, async_task) if async_task: fileobj.close() assert cached_response is not None response = cached_response self.timing_context.stop() # root timer self._record_timing('async') else: fileobj.seek(0) response = RestoreResponse(fileobj) except: fileobj.close() raise return response def _get_asynchronous_payload(self): new_task = False # fetch the task from celery task_id = self.async_restore_task_id_cache.get_value() task = AsyncResult(task_id) task_exists = task.status == ASYNC_RESTORE_SENT if not task_exists: # start a new task # NOTE this starts a nested timer (wait_for_task_to_start), # which will be stopped by self.generate_payload(async_task) # in the asynchronous task. It is expected that # get_async_restore_payload.delay(self) will serialize this # RestoreConfig and it's associated TimingContext before it # returns, and thereby fork the timing context. The timing # context associated with this side of the fork will not be # recorded since it is async (see self.get_response). with self.timing_context("wait_for_task_to_start"): task = get_async_restore_payload.delay(self, self.domain, self.restore_user.username) new_task = True # store the task id in cache self.async_restore_task_id_cache.set_value(task.id) try: response = task.get(timeout=self._get_task_timeout(new_task)) except TimeoutError: # return a 202 with progress response = AsyncRestoreResponse(task, self.restore_user.username) return response def _get_task_timeout(self, new_task): # if this is a new task, wait for INITIAL_ASYNC_TIMEOUT in case # this restore completes quickly. otherwise, only wait 1 second for # a response. return INITIAL_ASYNC_TIMEOUT_THRESHOLD if new_task else 1 def _generate_restore_response(self, async_task=None): """ :returns: A file-like object containing response content. """ username = self.restore_user.username count_items = self.params.include_item_count with RestoreContent(username, count_items) as content: for provider in get_element_providers(self.timing_context): with self.timing_context(provider.__class__.__name__): content.extend(provider.get_elements(self.restore_state)) for provider in get_async_providers(self.timing_context, async_task): with self.timing_context(provider.__class__.__name__): provider.extend_response(self.restore_state, content) return content.get_fileobj() def set_cached_payload_if_necessary(self, fileobj, duration, is_async): # only cache if the duration was longer than the threshold is_long_restore = duration > timedelta(seconds=INITIAL_SYNC_CACHE_THRESHOLD) if is_async or self.force_cache or is_long_restore: type_ = 'unknown' if is_async: type_ = 'async' elif self.force_cache: type_ = 'force' elif is_long_restore: type_ = 'long' tags = { 'type:{}'.format(type_), } datadog_counter('commcare.restores.cache_writes', tags=tags) response = CachedResponse.save_for_later( fileobj, self.cache_timeout, self.domain, self.restore_user.user_id, ) self.restore_payload_path_cache.set_value(response.name, self.cache_timeout) return response return None def delete_cached_payload_if_necessary(self): if self.overwrite_cache and self.restore_payload_path_cache.get_value(): self.restore_payload_path_cache.invalidate() def delete_initial_cached_payload_if_necessary(self): if self.sync_log: # Restores are usually cached by there sync token # but initial restores don't have a sync token, # so they're indistinguishable from each other. # Once a user syncs with a sync token, we're sure their initial sync is stale, # so delete it to avoid a stale payload if they (say) wipe the phone and sync again self.initial_restore_payload_path_cache.invalidate() def _record_timing(self, status): timing = self.timing_context assert timing.is_finished() duration = timing.duration if timing is not None else -1 device_id = self.params.device_id if duration > 20 or status == 412: if status == 412: # use last sync log since there is no current sync log sync_log_id = self.params.sync_log_id or 'N/A' else: sync_log = self.restore_state.current_sync_log sync_log_id = sync_log._id if sync_log else 'N/A' log = logging.getLogger(__name__) log.info( "restore %s: user=%s device=%s domain=%s status=%s duration=%.3f", sync_log_id, self.restore_user.username, device_id, self.domain, status, duration, ) is_webapps = device_id and device_id.startswith("WebAppsLogin") tags = [ 'status_code:{}'.format(status), 'device_type:{}'.format('webapps' if is_webapps else 'other'), ] env = settings.SERVER_ENVIRONMENT if (env, self.domain) in settings.RESTORE_TIMING_DOMAINS: tags.append('domain:{}'.format(self.domain)) if timing is not None: timer_buckets = (5, 20, 60, 120) for timer in timing.to_list(exclude_root=True): if timer.name in RESTORE_SEGMENTS: segment = RESTORE_SEGMENTS[timer.name] bucket = bucket_value(timer.duration, timer_buckets, 's') datadog_counter( 'commcare.restores.{}'.format(segment), tags=tags + ['duration:%s' % bucket], ) elif timer.name.startswith('fixture:'): bucket = bucket_value(timer.duration, timer_buckets, 's') datadog_counter( 'commcare.restores.fixture', tags=tags + [ 'duration:%s' % bucket, timer.name, ], ) tags.append('duration:%s' % bucket_value(timing.duration, timer_buckets, 's')) if settings.ENTERPRISE_MODE and self.params.app and self.params.app.copy_of: app_name = slugify(self.params.app.name) tags.append('app:{}-{}'.format(app_name, self.params.app.version)) datadog_counter('commcare.restores.count', tags=tags)