def handle(self, domain, case_type, data_source_ids, **options):
    """Build the tables for the given data sources and queue all cases of a type.

    Every data source is loaded with ``get_datasource_config`` and must be
    asynchronous and reference ``CASE_DOC_TYPE`` (both checked with
    ``assert``).  After the tables are built, ``AsyncIndicator.update_indicators``
    is called once per case id in the domain with a ``FakeChange``.

    :param domain: domain whose cases are processed
    :param case_type: case type passed to ``get_case_ids_in_domain``
    :param data_source_ids: ids of the data sources to populate
    """
    data_sources = []
    for source_id in data_source_ids:
        data_source, _unused = get_datasource_config(source_id, domain)
        assert data_source.asynchronous
        assert data_source.referenced_doc_type == CASE_DOC_TYPE
        data_sources.append(data_source)

    change_doc_stub = {'doc_type': CASE_DOC_TYPE, 'domain': domain}
    accessor = get_document_store(domain, CASE_DOC_TYPE).case_accessors
    all_case_ids = accessor.get_case_ids_in_domain(type=case_type)
    total = len(all_case_ids)
    print("inserting %d docs" % total)

    for data_source in data_sources:
        table_adapter = get_indicator_adapter(data_source, can_handle_laboratory=True)
        table_adapter.build_table()
        # normally called after rebuilding finishes
        table_adapter.after_table_build()

    source_ids = [data_source._id for data_source in data_sources]
    for position, case_id in enumerate(all_case_ids):
        AsyncIndicator.update_indicators(
            FakeChange(case_id, change_doc_stub), source_ids)
        # progress report every 1000 docs (including the very first one)
        if position % 1000 == 0:
            print("inserted %d / %d docs" % (position, total))
def _iteratively_build_table(config, last_id=None, resume_helper=None):
    """Build indicators for every document id of the config, chunk by chunk.

    Ids come from ``document_store.iter_document_ids(last_id)`` and are
    processed in groups of ``ID_CHUNK_SIZE``.  Each group is handed to the
    resume helper before ``_build_indicators`` is called on it.  For
    non-static configs the resume ids are cleared afterwards and the build
    is flagged as finished.

    :param config: data source configuration being built
    :param last_id: passed through to ``iter_document_ids``
    :param resume_helper: optional helper; a ``DataSourceResumeHelper`` is
        created when none (or a falsy one) is given
    """
    if not resume_helper:
        resume_helper = DataSourceResumeHelper(config)
    config_id = config._id
    doc_store = get_document_store(config.domain, config.referenced_doc_type)

    def _process_chunk(chunk):
        # hand the chunk to the resume helper before building it
        resume_helper.set_ids_to_resume_from(chunk)
        _build_indicators(config, doc_store, chunk, resume_helper)

    pending = []
    for doc_id in doc_store.iter_document_ids(last_id):
        pending.append(doc_id)
        if len(pending) < ID_CHUNK_SIZE:
            continue
        _process_chunk(pending)
        pending = []

    # leftover ids that did not fill a whole chunk
    if pending:
        _process_chunk(pending)

    if id_is_static(config_id):
        return

    resume_helper.clear_ids()
    config.meta.build.finished = True
    try:
        config.save()
    except ResourceConflict:
        latest = DataSourceConfiguration.get(config._id)
        # check that a new build has not yet started
        same_build = config.meta.build.initiated == latest.meta.build.initiated
        if same_build:
            latest.meta.build.finished = True
            latest.save()
def save_document(doc_ids):
    """Process the queued ``AsyncIndicator`` rows for ``doc_ids``.

    While holding a ``CriticalSection`` over the per-document lock keys,
    every matching indicator is run through ``_save_document_helper``.
    Successful indicators are deleted, failed ones are updated through
    ``update_failure`` and saved, and related documents reported by the
    helper (minus the ids just processed) are passed to
    ``_queue_indicators``.  Success/failure counts and the processing time
    are reported to datadog.  Returns early when no indicators match.

    All indicators must share the domain and doc type of the first one
    (checked with ``assert``).
    """
    lock_keys = [get_async_indicator_modify_lock_key(doc_id) for doc_id in doc_ids]

    indicator_config_ids = None
    timer = TimingContext()
    with CriticalSection(lock_keys):
        indicators = AsyncIndicator.objects.filter(doc_id__in=doc_ids)
        if not indicators:
            return

        reference = indicators[0]
        succeeded_pks = []
        failures = []

        for queued in indicators:
            assert queued.domain == reference.domain
            assert queued.doc_type == reference.doc_type

        by_doc_id = {queued.doc_id: queued for queued in indicators}
        store = get_document_store(reference.domain, reference.doc_type)
        indicator_config_ids = reference.indicator_config_ids
        related_ids = set()

        with timer:
            for doc in store.iter_documents(by_doc_id.keys()):
                current = by_doc_id[doc['_id']]
                ok, to_remove, needs_related_rebuild = _save_document_helper(
                    current, doc)
                if needs_related_rebuild:
                    related_ids.update(icds_get_related_docs_ids(doc['_id']))
                if not ok:
                    failures.append((current, to_remove))
                else:
                    succeeded_pks.append(current.pk)

        num_processed = len(succeeded_pks)
        num_failed = len(failures)

        AsyncIndicator.objects.filter(pk__in=succeeded_pks).delete()
        with transaction.atomic():
            for failed_indicator, to_remove in failures:
                failed_indicator.update_failure(to_remove)
                failed_indicator.save()

        # remove any related docs that were just rebuilt
        related_ids = related_ids - set(doc_ids)
        # queue the docs that aren't already queued
        not_yet_queued = AsyncIndicator.objects.filter(
            doc_id__in=related_ids, date_queued=None)
        _queue_indicators(not_yet_queued)

    datadog_counter('commcare.async_indicator.processed_success', num_processed)
    datadog_counter('commcare.async_indicator.processed_fail', num_failed)
    config_tag = u'config_ids:{}'.format(indicator_config_ids)
    datadog_histogram(
        'commcare.async_indicator.processing_time', timer.duration,
        tags=[config_tag])
def _iteratively_build_table(config, last_id=None, resume_helper=None):
    """Build indicators for every document id of the config, chunk by chunk.

    Ids come from ``document_store.iter_document_ids(last_id)`` and are
    processed in groups of ``ID_CHUNK_SIZE``.  Each group is handed to the
    resume helper before ``_build_indicators`` is called on it.  For
    non-static configs the resume ids are cleared, the build is flagged as
    finished and the adapter's ``after_table_build`` hook is invoked.

    :param config: data source configuration being built
    :param last_id: passed through to ``iter_document_ids``
    :param resume_helper: optional helper; a ``DataSourceResumeHelper`` is
        created when none (or a falsy one) is given
    """
    if not resume_helper:
        resume_helper = DataSourceResumeHelper(config)
    config_id = config._id
    doc_store = get_document_store(config.domain, config.referenced_doc_type)

    def _process_chunk(chunk):
        # hand the chunk to the resume helper before building it
        resume_helper.set_ids_to_resume_from(chunk)
        _build_indicators(config, doc_store, chunk, resume_helper)

    pending = []
    for doc_id in doc_store.iter_document_ids(last_id):
        pending.append(doc_id)
        if len(pending) < ID_CHUNK_SIZE:
            continue
        _process_chunk(pending)
        pending = []

    # leftover ids that did not fill a whole chunk
    if pending:
        _process_chunk(pending)

    if id_is_static(config_id):
        return

    resume_helper.clear_ids()
    config.meta.build.finished = True
    try:
        config.save()
    except ResourceConflict:
        latest = DataSourceConfiguration.get(config._id)
        # check that a new build has not yet started
        same_build = config.meta.build.initiated == latest.meta.build.initiated
        if same_build:
            latest.meta.build.finished = True
            latest.save()

    table_adapter = get_indicator_adapter(
        config, raise_errors=True, can_handle_laboratory=True)
    table_adapter.after_table_build()
def _get_document(related_doc_type, doc_id, context):
    """Fetch a related document, restricted to the root document's domain.

    :param related_doc_type: doc type used to pick the document store
    :param doc_id: id of the document to load
    :param context: evaluation context; ``context.root_doc['domain']`` is read
    :returns: the document, or ``None`` when it is not found or its
        ``domain`` differs from the root document's domain
    """
    root_domain = context.root_doc['domain']
    store = get_document_store(root_domain, related_doc_type)
    try:
        found = store.get_document(doc_id)
    except DocumentNotFoundError:
        return None
    # never hand back a document that belongs to another domain
    if found.get('domain') == root_domain:
        return found
    return None
def _iteratively_build_table(config, resume_helper=None, in_place=False, limit=-1):
    """Build indicators per case type / xmlns, skipping those already completed.

    For each remaining value of ``config.get_case_type_or_xmlns_filter()``
    the document ids are built in groups of ``ID_CHUNK_SIZE`` and the value
    is then recorded as completed on the resume helper.  Afterwards the
    resume info is cleared and, for non-static configs, the matching
    "finished" flag is set and ``after_table_build`` is called.

    :param config: data source configuration being built
    :param resume_helper: optional helper; a ``DataSourceResumeHelper`` is
        created when none (or a falsy one) is given
    :param in_place: use the ``*_in_place`` build flags instead of the
        regular ones
    :param limit: when greater than -1, stop reading ids for a case type /
        xmlns once the enumeration index reaches this value
    """
    if not resume_helper:
        resume_helper = DataSourceResumeHelper(config)
    config_id = config._id

    pending_filters = config.get_case_type_or_xmlns_filter()
    already_done = resume_helper.get_completed_case_type_or_xmlns()
    if already_done:
        pending_filters = [
            value for value in pending_filters if value not in already_done
        ]

    for filter_value in pending_filters:
        chunk = []
        store = get_document_store(
            config.domain, config.referenced_doc_type,
            case_type_or_xmlns=filter_value)

        for index, doc_id in enumerate(store.iter_document_ids()):
            if limit > -1 and index >= limit:
                break
            chunk.append(doc_id)
            if len(chunk) < ID_CHUNK_SIZE:
                continue
            _build_indicators(config, store, chunk)
            chunk = []

        # leftover ids that did not fill a whole chunk
        if chunk:
            _build_indicators(config, store, chunk)

        resume_helper.add_completed_case_type_or_xmlns(filter_value)

    resume_helper.clear_resume_info()
    if id_is_static(config_id):
        return

    # the in-place build tracks its own pair of flags
    finished_flag = 'finished_in_place' if in_place else 'finished'
    initiated_flag = 'initiated_in_place' if in_place else 'initiated'

    setattr(config.meta.build, finished_flag, True)
    try:
        config.save()
    except ResourceConflict:
        latest = DataSourceConfiguration.get(config._id)
        # check that a new build has not yet started
        ours = getattr(config.meta.build, initiated_flag)
        theirs = getattr(latest.meta.build, initiated_flag)
        if ours == theirs:
            setattr(latest.meta.build, finished_flag, True)
        latest.save()

    table_adapter = get_indicator_adapter(
        config, raise_errors=True, can_handle_laboratory=True)
    table_adapter.after_table_build()
def handle(self, domain, data_source_id, doc_id, **options):
    """Profile ``config.get_all_values`` for a single document.

    The profile is written to ``ucr_stats.log`` and then summarised with
    ``print_profile_stats`` using the ``sort`` option.

    :param domain: domain of the data source and document
    :param data_source_id: id of the data source to profile
    :param doc_id: id of the document fed to the data source
    """
    stats_file = 'ucr_stats.log'
    data_source, _unused = get_datasource_config(data_source_id, domain)
    store = get_document_store(domain, data_source.referenced_doc_type)
    document = store.get_document(doc_id)
    sort_key = options['sort']

    # names visible to the profiled statement
    profile_locals = {'config': data_source, 'doc': document}
    cProfile.runctx('config.get_all_values(doc)', {}, profile_locals, stats_file)
    print_profile_stats(stats_file, sort_key)
def get_value(self, doc_id, context):
    """Evaluate ``self._value_expression`` against a related document.

    :param doc_id: id of the related document to load
    :param context: evaluation context; ``context.root_doc['domain']`` must
        be truthy (checked with ``assert``)
    :returns: the expression's value, or ``None`` when the document is not
        found or belongs to a different domain than the root document
    """
    try:
        root_domain = context.root_doc['domain']
        assert root_domain
        store = get_document_store(root_domain, self.related_doc_type)
        related_doc = store.get_document(doc_id)
        # ensure no cross-domain lookups of different documents
        same_domain = related_doc.get('domain') == root_domain
        if not same_domain:
            return None
        # explicitly use a new evaluation context since this is a new document
        fresh_context = EvaluationContext(related_doc, 0)
        return self._value_expression(related_doc, fresh_context)
    except DocumentNotFoundError:
        return None
def resume_building_indicators(indicator_config_id, initiated_by=None):
    """Continue an interrupted build of a UCR table.

    The ids stored by ``DataSourceResumeHelper`` are rebuilt first, then
    the build continues from the last of those ids.  Nothing is built when
    no ids are stored.  The work runs inside ``notify_someone`` with
    success and failure messages for ``initiated_by``.

    :param indicator_config_id: id of the data source configuration
    :param initiated_by: passed to the toggle check and to ``notify_someone``
    """
    config = _get_config_by_id(indicator_config_id)
    success = _('Your UCR table {} has finished rebuilding').format(config.table_id)
    failure = _('There was an error rebuilding Your UCR table {}.').format(config.table_id)
    send = toggles.SEND_UCR_REBUILD_INFO.enabled(initiated_by)

    with notify_someone(initiated_by, success_message=success,
                        error_message=failure, send=send):
        helper = DataSourceResumeHelper(config)
        ids_to_resume = helper.get_ids_to_resume_from()
        if len(ids_to_resume) == 0:
            return

        store = get_document_store(config.domain, config.referenced_doc_type)
        _build_indicators(config, store, ids_to_resume, helper)
        # NOTE(review): the builders visible in this file are named
        # `_iteratively_build_table`; confirm `iteratively_build_table` exists.
        iteratively_build_table(config, ids_to_resume[-1], helper)
def resume_building_indicators(indicator_config_id, initiated_by=None):
    """Continue an interrupted build of a UCR table.

    The ids stored by ``DataSourceResumeHelper`` are rebuilt first, then
    ``_iteratively_build_table`` continues from the last of those ids.
    Nothing is built when no ids are stored.  The work runs inside
    ``notify_someone`` with success and failure messages for
    ``initiated_by``.

    :param indicator_config_id: id of the data source configuration
    :param initiated_by: passed to the toggle check and to ``notify_someone``
    """
    config = _get_config_by_id(indicator_config_id)
    success = _('Your UCR table {} has finished rebuilding').format(config.table_id)
    failure = _('There was an error rebuilding Your UCR table {}.').format(config.table_id)
    send = toggles.SEND_UCR_REBUILD_INFO.enabled(initiated_by)

    with notify_someone(initiated_by, success_message=success,
                        error_message=failure, send=send):
        helper = DataSourceResumeHelper(config)
        ids_to_resume = helper.get_ids_to_resume_from()
        if len(ids_to_resume) == 0:
            return

        store = get_document_store(config.domain, config.referenced_doc_type)
        _build_indicators(config, store, ids_to_resume, helper)
        # carry on after the last id that was just rebuilt
        _iteratively_build_table(config, ids_to_resume[-1], helper)
def save_document(doc_ids):
    """Save the queued ``AsyncIndicator`` rows for ``doc_ids`` to their data sources.

    While holding a ``CriticalSection`` over the per-document lock keys,
    each document is saved through the adapter of every config id listed
    on its indicator.  An indicator counts as processed only when all of
    its configs were saved; processing of a document stops at the first
    error.  Processed indicators are deleted; failed ones get
    ``date_queued`` reset and ``unsuccessful_attempts`` incremented.
    Returns early when no indicators match.

    All indicators must share the domain and doc type of the first one
    (checked with ``assert``).
    """
    lock_keys = [get_async_indicator_modify_lock_key(doc_id) for doc_id in doc_ids]

    with CriticalSection(lock_keys):
        indicators = AsyncIndicator.objects.filter(doc_id__in=doc_ids)
        if not indicators:
            return

        reference = indicators[0]
        succeeded_pks = []
        failed_pks = []

        for queued in indicators:
            assert queued.domain == reference.domain
            assert queued.doc_type == reference.doc_type

        by_doc_id = {queued.doc_id: queued for queued in indicators}
        store = get_document_store(reference.domain, reference.doc_type)

        for doc in store.iter_documents(doc_ids):
            current = by_doc_id[doc['_id']]
            eval_context = EvaluationContext(doc)

            all_saved = True
            for config_id in current.indicator_config_ids:
                adapter = None
                try:
                    adapter = get_indicator_adapter(
                        _get_config(config_id), can_handle_laboratory=True)
                    adapter.save(doc, eval_context)
                    eval_context.reset_iteration()
                except (ESError, RequestError, ConnectionTimeout):
                    # couch or es had an issue so don't log it and go on to the next doc
                    all_saved = False
                except Exception as e:
                    # getting the config could fail before the adapter is set
                    if adapter:
                        adapter.handle_exception(doc, e)
                    all_saved = False

                if not all_saved:
                    failed_pks.append(current.pk)
                    break

            if all_saved:
                succeeded_pks.append(current.pk)

        AsyncIndicator.objects.filter(pk__in=succeeded_pks).delete()
        retry_later = AsyncIndicator.objects.filter(pk__in=failed_pks)
        retry_later.update(
            date_queued=None,
            unsuccessful_attempts=F('unsuccessful_attempts') + 1)
def save_document(doc_ids):
    """Process the queued ``AsyncIndicator`` rows for ``doc_ids``.

    While holding a ``CriticalSection`` over the per-document lock keys,
    every document is run through ``_save_document_helper``.  Successful
    indicators are deleted; failed ones are updated through
    ``update_failure`` and saved inside one transaction.  The processing
    time is reported to datadog, tagged with the first indicator's config
    ids.  Returns early when no indicators match.

    All indicators must share the domain and doc type of the first one
    (checked with ``assert``).
    """
    lock_keys = [get_async_indicator_modify_lock_key(doc_id) for doc_id in doc_ids]

    indicator_config_ids = None
    timer = TimingContext()
    with CriticalSection(lock_keys):
        indicators = AsyncIndicator.objects.filter(doc_id__in=doc_ids)
        if not indicators:
            return

        reference = indicators[0]
        succeeded_pks = []
        failures = []

        for queued in indicators:
            assert queued.domain == reference.domain
            assert queued.doc_type == reference.doc_type

        by_doc_id = {queued.doc_id: queued for queued in indicators}
        store = get_document_store(reference.domain, reference.doc_type)
        indicator_config_ids = reference.indicator_config_ids

        with timer:
            for doc in store.iter_documents(doc_ids):
                current = by_doc_id[doc['_id']]
                ok, to_remove = _save_document_helper(current, doc)
                if not ok:
                    failures.append((current, to_remove))
                else:
                    succeeded_pks.append(current.pk)

        AsyncIndicator.objects.filter(pk__in=succeeded_pks).delete()
        with transaction.atomic():
            for failed_indicator, to_remove in failures:
                failed_indicator.update_failure(to_remove)
                failed_indicator.save()

    config_tag = u'config_ids:{}'.format(indicator_config_ids)
    datadog_histogram(
        'commcare.async_indicator.processing_time',
        timer.duration,
        tags=[config_tag],
    )
def handle(self, domain, data_source_id, doc_id, **options):
    """Profile ``config.get_all_values`` for one document and print a report.

    The profile is written to ``ucr_stats.log``.  The report prints the top
    10 functions by time followed by several filtered views of the same
    statistics, and ends with hints on how to dig further.

    :param domain: domain of the data source and document
    :param data_source_id: id of the data source to profile
    :param doc_id: id of the document fed to the data source
    """
    config, _ = get_datasource_config(data_source_id, domain)
    doc_type = config.referenced_doc_type
    doc_store = get_document_store(domain, doc_type)
    doc = doc_store.get_document(doc_id)

    # names visible to the profiled statement
    local_variables = {'config': config, 'doc': doc}

    cProfile.runctx('config.get_all_values(doc)', {}, local_variables, 'ucr_stats.log')
    p = pstats.Stats('ucr_stats.log')
    p.sort_stats('time')

    print("Top 10 functions by time\n")
    p.print_stats(10)

    # The restrictions below are regular expressions, so they are written as
    # raw strings: "\(" in a plain literal is an invalid escape sequence.
    print("Specs timing\n")
    p.print_stats(r'userreports.*specs.*\(__call__\)')

    print("Socket recvs\n")
    p.print_stats('recv')

    print("Doc retrievals\n")
    p.print_stats(r'document_store.*\(get_document\)')

    print("Postgres queries\n")
    p.print_stats('execute.*psycopg')

    print("ES queries\n")
    p.print_stats(r'es_query.py.*\(run\)')

    print("""
        Note: Due to overhead in profiling, these times are much larger than the real times.

        Next Steps:
            1) choose one of the previous calls to investigate
            2) use print_callees or print_callers to follow the calls
                * usage https://docs.python.org/2/library/profile.html#pstats.Stats.print_stats
            3) check out branch je/time-ucr to get logs for processing time of each column
               (you'll likely need to rebase it on latest master)
    """)
def handle(self, domain, count, **options):
    """Profile ``_simulate_indicator_saves`` for the newest async indicators.

    The ``count`` most recently created ``AsyncIndicator`` rows of the
    domain are loaded together with their documents and data source
    configs.  The profile is written to ``async_ucr_stats.log`` and
    summarised with ``print_profile_stats`` using the ``sort`` option.

    :param domain: domain whose indicators are profiled
    :param count: maximum number of indicators to load
    """
    stats_file = 'async_ucr_stats.log'
    sort_key = options['sort']

    newest_first = AsyncIndicator.objects.filter(domain=domain).order_by('-date_created')
    indicators = newest_first[:count]
    print('processing {} indicators'.format(len(indicators)))

    # build up data source configs and docs
    configs = {}
    docs = {}
    for queued in indicators:
        store = get_document_store(domain, queued.doc_type)
        docs[queued.doc_id] = store.get_document(queued.doc_id)
        for config_id in queued.indicator_config_ids:
            configs[config_id] = _get_config(config_id)

    # names visible to the profiled statement
    profile_locals = {
        '_simulate_indicator_saves': _simulate_indicator_saves,
        'indicators': indicators,
        'docs': docs,
        'configs': configs,
    }
    cProfile.runctx(
        '_simulate_indicator_saves(indicators, docs, configs)',
        {}, profile_locals, stats_file)
    print_profile_stats(stats_file, sort_key)
def evaluate_expression(request, domain):
    """Evaluate a posted expression spec against a form or case document.

    Reads ``doc_type``, ``doc_id`` and ``expression`` (a JSON string) from
    ``request.POST``.

    :returns: a JSON response with ``result`` on success, or ``error`` with
        status 404 (document not found), 400 (``BadSpecError``) or 500 (any
        other exception)
    """
    doc_type = request.POST['doc_type']
    doc_id = request.POST['doc_id']
    # posted short names mapped to stored doc types
    known_types = {
        'form': 'XFormInstance',
        'case': 'CommCareCase',
    }
    try:
        store = get_document_store(domain, known_types.get(doc_type, 'Unknown'))
        doc = store.get_document(doc_id)
        spec = json.loads(request.POST['expression'])
        expression = ExpressionFactory.from_spec(spec)
        return json_response({
            "result": expression(doc, EvaluationContext(doc)),
        })
    except DocumentNotFoundError:
        not_found = _("{} with id {} not found in domain {}.").format(
            doc_type, doc_id, domain)
        return json_response({"error": not_found}, status_code=404)
    except BadSpecError as e:
        bad_spec = _("Problem with expression: {}.").format(e)
        return json_response({"error": bad_spec}, status_code=400)
    except Exception as e:
        return json_response({"error": unicode(e)}, status_code=500)
def handle(self, domain, data_source_id, *args, **kwargs):
    """Compare stored data source rows with freshly computed values.

    For every row returned by the adapter's query object the source
    document is loaded and ``config.get_all_values`` is recomputed.  A
    mismatch is recorded per column when the stored value differs, when the
    row's ``inserted_at`` is earlier than the document's
    ``server_modified_on``, or when the attribute is missing on the row.
    All mismatches are written to a timestamped CSV file in the current
    directory.

    :param domain: domain of the data source
    :param data_source_id: id of the data source to check
    :raises ValueError: if a document produces more than one row
    """
    config, _ = get_datasource_config(data_source_id, domain)
    adapter = get_indicator_adapter(config)
    q = adapter.get_query_object()
    document_store = get_document_store(domain, config.referenced_doc_type)
    bad_rows = []
    for row in with_progress_bar(q, length=q.count()):
        doc_id = row.doc_id
        doc = document_store.get_document(doc_id)

        current_rows = config.get_all_values(doc)
        if len(current_rows) > 1:
            raise ValueError(
                "this command doesn't work for datasources returning multiple rows per doc"
            )

        try:
            current_row = current_rows[0]
        except (IndexError, KeyError):
            # No row is produced for this doc.  Indexing an empty sequence
            # raises IndexError, which the previous `except KeyError` missed.
            continue

        # don't compare the 'inserted_at' columns
        current_row = [
            val for val in current_row
            if val.column.database_column_name != 'inserted_at'
        ]

        for val in current_row:
            column_name = val.column.database_column_name
            try:
                inserted_value = getattr(row, column_name)
                value_differs = inserted_value != val.value
                if (value_differs
                        or row.inserted_at.replace(tzinfo=pytz.utc)
                        < parse_datetime(doc['server_modified_on'])):
                    bad_rows.append({
                        'doc_id': row.doc_id,
                        'column_name': column_name,
                        'inserted_at': row.inserted_at.isoformat(),
                        'server_modified_on': doc['server_modified_on'],
                        'stored_value': inserted_value,
                        'desired_value': val.value,
                        'message': ('column mismatch' if value_differs
                                    else "modified date early"),
                    })
            except AttributeError:
                # the row lacks the column (or `inserted_at`) altogether
                bad_rows.append({
                    'doc_id': row.doc_id,
                    'column_name': column_name,
                    'inserted_at': 'missing',
                    'server_modified_on': doc['server_modified_on'],
                    'stored_value': 'missing',
                    'desired_value': val.value,
                    'message': 'doc missing',
                })

    filename = 'datasource_mismatches_{}_{}.csv'.format(
        data_source_id[-8:],
        datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
    )
    with open(filename, 'w') as f:
        headers = [
            'doc_id', 'column_name', 'inserted_at', 'server_modified_on',
            'stored_value', 'desired_value', 'message',
        ]
        writer = csv.DictWriter(f, headers)
        writer.writeheader()
        writer.writerows(bad_rows)

    print("Found {} mismatches. Check {} for more details".format(
        len(bad_rows), filename))