Example #1
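A test that seeds AsyncIndicator rows with bulk_create, then verifies that bulk_update_records merges new indicator config IDs into existing records and creates records for new doc IDs in exactly three queries.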
    def test_update_record(self):
        domain = 'test-update-record'
        doc_type = 'form'
        initial_data = {
            'd1': ['c1', 'c2'],
            'd2': ['c1'],
            'd3': ['c2']
        }
        AsyncIndicator.objects.bulk_create([
            AsyncIndicator(
                doc_id=doc_id, doc_type=doc_type, domain=domain, indicator_config_ids=sorted(config_ids))
            for doc_id, config_ids in six.iteritems(initial_data)
        ])
        updated_data = {
            'd2': ['c2'],
            'd3': ['c3'],
            'd4': ['c2', 'c1'],
            'd5': ['c4']
        }

        with self.assertNumQueries(3):
            # 3 queries: one select of existing records, one update, one insert
            doc_type_by_ids = {i: doc_type for i in ['d1', 'd2', 'd3', 'd4', 'd5']}
            AsyncIndicator.bulk_update_records(updated_data, domain, doc_type_by_ids)

        self.assertEqual(
            self._get_indicator_data(),
            {
                'd1': ['c1', 'c2'],
                'd2': ['c1', 'c2'],
                'd3': ['c2', 'c3'],
                'd4': ['c1', 'c2'],
                'd5': ['c4']
            }
        )
Example #2
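A pillow change processor: it ignores hard deletions and domain-less changes, saves matching documents to synchronous UCR tables, and hands asynchronous tables off to AsyncIndicator.update_indicators.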
    def process_change(self, pillow_instance, change):
        self.bootstrap_if_needed()
        if change.deleted:
            # we don't currently support hard deletions at all.
            # we may want to change this at some later date, but it seems ok for now.
            # see https://github.com/dimagi/commcare-hq/pull/6944 for rationale
            return

        domain = change.metadata.domain
        if not domain:
            # if no domain we won't save to any UCR table
            return

        async_tables = []
        doc = change.get_document()
        ensure_document_exists(change)
        ensure_matched_revisions(change)

        if doc is None:
            return

        eval_context = EvaluationContext(doc)
        for table in self.table_adapters_by_domain[domain]:
            if table.config.filter(doc):
                if table.run_asynchronous:
                    async_tables.append(table.config._id)
                else:
                    self._save_doc_to_table(table, doc, eval_context)
                    eval_context.reset_iteration()
            elif table.config.deleted_filter(doc):
                table.delete(doc)

        if async_tables:
            AsyncIndicator.update_indicators(change, async_tables)
Example #3
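A management command that rebuilds asynchronous case data sources: it builds each indicator table, then queues every case ID of the given type through AsyncIndicator.update_indicators via a FakeChange.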
    def handle(self, domain, case_type, data_source_ids, **options):
        configs = []
        for data_source_id in data_source_ids:
            config, _ = get_datasource_config(data_source_id, domain)
            assert config.asynchronous
            assert config.referenced_doc_type == CASE_DOC_TYPE
            configs.append(config)

        fake_change_doc = {'doc_type': CASE_DOC_TYPE, 'domain': domain}

        doc_store = get_document_store(domain, CASE_DOC_TYPE)
        case_accessor = doc_store.case_accessors

        case_ids = case_accessor.get_case_ids_in_domain(type=case_type)
        num_case_ids = len(case_ids)
        print("inserting %d docs" % num_case_ids)

        for config in configs:
            adapter = get_indicator_adapter(config, can_handle_laboratory=True)
            adapter.build_table()
            # normally called after rebuilding finishes
            adapter.after_table_build()

        config_ids = [config._id for config in configs]
        for i, case_id in enumerate(case_ids):
            change = FakeChange(case_id, fake_change_doc)
            AsyncIndicator.update_indicators(change, config_ids)
            if i % 1000 == 0:
                print("inserted %d / %d docs" % (i, num_case_ids))
Example #4
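Evidently the same test as Example #1 from a different revision of the codebase, written without six (plain dict.items()) and with different line wrapping.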
    def test_update_record(self):
        domain = 'test-update-record'
        doc_type = 'form'
        initial_data = {'d1': ['c1', 'c2'], 'd2': ['c1'], 'd3': ['c2']}
        AsyncIndicator.objects.bulk_create([
            AsyncIndicator(doc_id=doc_id,
                           doc_type=doc_type,
                           domain=domain,
                           indicator_config_ids=sorted(config_ids))
            for doc_id, config_ids in initial_data.items()
        ])
        updated_data = {
            'd2': ['c2'],
            'd3': ['c3'],
            'd4': ['c2', 'c1'],
            'd5': ['c4']
        }

        with self.assertNumQueries(3):
            # 3 queries: one select of existing records, one update, one insert
            doc_type_by_ids = {
                i: doc_type
                for i in ['d1', 'd2', 'd3', 'd4', 'd5']
            }
            AsyncIndicator.bulk_update_records(updated_data, domain,
                                               doc_type_by_ids)

        self.assertEqual(
            self._get_indicator_data(), {
                'd1': ['c1', 'c2'],
                'd2': ['c1', 'c2'],
                'd3': ['c2', 'c3'],
                'd4': ['c1', 'c2'],
                'd5': ['c4']
            })
Example #5
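A variant of the rebuild command from Example #3 that queues cases through AsyncIndicator.update_from_kafka_change and, for dynamic configs, records that the rebuild happened asynchronously.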
    def handle(self, domain, case_type, data_source_ids, **options):
        configs = []
        for data_source_id in data_source_ids:
            config, _ = get_datasource_config(data_source_id, domain)
            assert config.asynchronous
            assert config.referenced_doc_type == CASE_DOC_TYPE
            configs.append(config)

        fake_change_doc = {'doc_type': CASE_DOC_TYPE, 'domain': domain}

        for config in configs:
            adapter = get_indicator_adapter(config, can_handle_laboratory=True)
            adapter.build_table()
            # normally called after rebuilding finishes
            adapter.after_table_build()

        self.domain = domain
        self.case_type = case_type

        config_ids = [config._id for config in configs]
        for case_id in self._get_case_ids_to_process():
            change = FakeChange(case_id, fake_change_doc)
            AsyncIndicator.update_from_kafka_change(change, config_ids)

        for config in configs:
            if not config.is_static:
                config.meta.build.rebuilt_asynchronously = True
                config.save()
Example #6
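A helper that records document IDs either in a single AsyncIndicator.bulk_creation call or one FakeChange at a time via update_from_kafka_change.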
    def _save_ids(self, ids):
        if self.bulk:
            AsyncIndicator.bulk_creation(ids, self.referenced_type, self.domain, self.config_ids)
        else:
            for id_ in ids:
                change = FakeChange(id_, self.fake_change_doc)
                AsyncIndicator.update_from_kafka_change(change, self.config_ids)
Example #7
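Chunked pillow processing: each document is routed per adapter into bulk deletes, bulk row saves, or AsyncIndicator.bulk_update_records for asynchronous configs; failed changes are collected for retry.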
    def _process_chunk_for_domain(self, domain, changes_chunk):
        adapters = list(self.table_adapters_by_domain[domain])
        changes_by_id = {change.id: change for change in changes_chunk}
        to_delete_by_adapter = defaultdict(list)
        rows_to_save_by_adapter = defaultdict(list)
        async_configs_by_doc_id = defaultdict(list)
        to_update = {change for change in changes_chunk if not change.deleted}
        retry_changes, docs = self.get_docs_for_changes(to_update, domain)
        change_exceptions = []

        for doc in docs:
            eval_context = EvaluationContext(doc)
            for adapter in adapters:
                if adapter.config.filter(doc):
                    if adapter.run_asynchronous:
                        async_configs_by_doc_id[doc['_id']].append(adapter.config._id)
                    else:
                        try:
                            rows_to_save_by_adapter[adapter].extend(adapter.get_all_values(doc, eval_context))
                        except Exception as e:
                            change_exceptions.append((changes_by_id[doc["_id"]], e))
                        eval_context.reset_iteration()
                elif adapter.config.deleted_filter(doc) or adapter.doc_exists(doc):
                    to_delete_by_adapter[adapter].append(doc['_id'])

        # bulk delete by adapter
        to_delete = [c.id for c in changes_chunk if c.deleted]
        for adapter in adapters:
            delete_ids = to_delete_by_adapter[adapter] + to_delete
            try:
                adapter.bulk_delete(delete_ids)
            except Exception as ex:
                notify_exception(
                    None,
                    "Error in deleting changes chunk {ids}: {ex}".format(
                        ids=delete_ids, ex=ex))
                retry_changes.update([c for c in changes_chunk if c.id in delete_ids])
        # bulk update by adapter
        for adapter, rows in six.iteritems(rows_to_save_by_adapter):
            try:
                adapter.save_rows(rows)
            except Exception as ex:
                notify_exception(
                    None,
                    "Error in saving changes chunk {ids}: {ex}".format(
                        ids=[c.id for c in to_update], ex=repr(ex)))
                retry_changes.update(to_update)
        if async_configs_by_doc_id:
            doc_type_by_id = {
                _id: changes_by_id[_id].metadata.document_type
                for _id in async_configs_by_doc_id.keys()
            }
            AsyncIndicator.bulk_update_records(async_configs_by_doc_id, domain, doc_type_by_id)

        return retry_changes, change_exceptions
Example #8
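Builds indicators for the given document IDs, queueing an AsyncIndicator record when the config is asynchronous and saving directly through the adapter otherwise.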
def _build_indicators(config, document_store, relevant_ids):
    adapter = get_indicator_adapter(config, raise_errors=True, load_source='build_indicators')

    for doc in document_store.iter_documents(relevant_ids):
        if config.asynchronous:
            AsyncIndicator.update_record(
                doc.get('_id'), config.referenced_doc_type, config.domain, [config._id]
            )
        else:
            # save is a noop if the filter doesn't match
            adapter.best_effort_save(doc)
Example #9
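A one-off command that re-queues case IDs against a fixed set of static data sources using update_from_kafka_change.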
    def handle(self, *args, **options):
        fake_change_doc = {'doc_type': CASE_DOC_TYPE, 'domain': DOMAIN}

        for data_source_id in DATA_SOURCES:
            print("processing data source %s" % data_source_id)
            data_source, is_static = get_datasource_config(data_source_id, DOMAIN)
            assert is_static
            adapter = get_indicator_adapter(data_source)
            table = adapter.get_table()
            for case_id in self._get_case_ids_to_process(adapter, table, data_source_id):
                change = FakeChange(case_id, fake_change_doc)
                AsyncIndicator.update_from_kafka_change(change, [data_source_id])
Example #10
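A test fixture that creates ten fake case documents and matching AsyncIndicator rows via bulk_creation.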
    def _setup_docs_and_indicators(self):
        self.docs = [{
            "_id": str(i),
            "domain": self.domain.name,
            "doc_type": "CommCareCase",
            "name": 'doc_name_' + str(i),
            "color": 'doc_color_' + str(i)
        } for i in range(10)]
        self.doc_ids = [str(i) for i in range(10)]

        AsyncIndicator.bulk_creation([doc["_id"] for doc in self.docs],
                                     "CommCareCase", self.domain, [])
Example #11
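A process_change variant with per-domain timing: it filters with an EvaluationContext and deletes rows based on the change's document subtype rather than a deleted filter.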
    def process_change(self, change):
        self.bootstrap_if_needed()

        domain = change.metadata.domain
        if not domain or domain not in self.table_adapters_by_domain:
            # if no domain we won't save to any UCR table
            return

        if change.deleted:
            adapters = list(self.table_adapters_by_domain[domain])
            for table in adapters:
                table.delete({'_id': change.metadata.document_id})

        async_tables = []
        doc = change.get_document()
        ensure_document_exists(change)
        ensure_matched_revisions(change, doc)

        if doc is None:
            return

        with TimingContext() as timer:
            eval_context = EvaluationContext(doc)
            # make copy to avoid modifying list during iteration
            adapters = list(self.table_adapters_by_domain[domain])
            doc_subtype = change.metadata.document_subtype
            for table in adapters:
                if table.config.filter(doc, eval_context):
                    if table.run_asynchronous:
                        async_tables.append(table.config._id)
                    else:
                        self._save_doc_to_table(domain, table, doc, eval_context)
                        eval_context.reset_iteration()
                elif (doc_subtype is None
                        or doc_subtype in table.config.get_case_type_or_xmlns_filter()):
                    table.delete(doc)

            if async_tables:
                AsyncIndicator.update_from_kafka_change(change, async_tables)

        self.domain_timing_context.update(**{
            domain: timer.duration
        })
Example #12
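A sibling revision of Example #11 that still decides deletions via deleted_filter() and doc_exists() instead of the document subtype.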
    def process_change(self, change):
        self.bootstrap_if_needed()

        domain = change.metadata.domain
        if not domain or domain not in self.table_adapters_by_domain:
            # if no domain we won't save to any UCR table
            return

        if change.deleted:
            adapters = list(self.table_adapters_by_domain[domain])
            for table in adapters:
                table.delete({'_id': change.metadata.document_id})

        async_tables = []
        doc = change.get_document()
        ensure_document_exists(change)
        ensure_matched_revisions(change, doc)

        if doc is None:
            return

        with TimingContext() as timer:
            eval_context = EvaluationContext(doc)
            # make copy to avoid modifying list during iteration
            adapters = list(self.table_adapters_by_domain[domain])
            for table in adapters:
                if table.config.filter(doc):
                    if table.run_asynchronous:
                        async_tables.append(table.config._id)
                    else:
                        self._save_doc_to_table(domain, table, doc, eval_context)
                        eval_context.reset_iteration()
                elif table.config.deleted_filter(doc) or table.doc_exists(doc):
                    table.delete(doc)

            if async_tables:
                AsyncIndicator.update_from_kafka_change(change, async_tables)

        self.domain_timing_context.update(**{
            domain: timer.duration
        })
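Example #13
A small module-level factory returning an unsaved AsyncIndicator for a case document, pinned to fixed DOMAIN and DATA_SOURCE_NAME constants.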
def create_async_indicator(doc_id):
    assert isinstance(doc_id, six.text_type)
    return AsyncIndicator(doc_id=doc_id,
                          doc_type="CommCareCase",
                          domain=DOMAIN,
                          indicator_config_ids=[DATA_SOURCE_NAME])
Example #14
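The chunked processor from Example #7 extended with per-phase _datadog_timing blocks and subtype-based deletes.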
    def _process_chunk_for_domain(self, domain, changes_chunk):
        adapters = list(self.table_adapters_by_domain[domain])
        changes_by_id = {change.id: change for change in changes_chunk}
        to_delete_by_adapter = defaultdict(list)
        rows_to_save_by_adapter = defaultdict(list)
        async_configs_by_doc_id = defaultdict(list)
        to_update = {change for change in changes_chunk if not change.deleted}
        with self._datadog_timing('extract'):
            retry_changes, docs = self.get_docs_for_changes(to_update, domain)
        change_exceptions = []

        with self._datadog_timing('single_batch_transform'):
            for doc in docs:
                change = changes_by_id[doc['_id']]
                doc_subtype = change.metadata.document_subtype
                eval_context = EvaluationContext(doc)
                with self._datadog_timing('single_doc_transform'):
                    for adapter in adapters:
                        with self._datadog_timing('transform', adapter.config._id):
                            if adapter.config.filter(doc, eval_context):
                                if adapter.run_asynchronous:
                                    async_configs_by_doc_id[doc['_id']].append(adapter.config._id)
                                else:
                                    try:
                                        rows_to_save_by_adapter[adapter].extend(
                                            adapter.get_all_values(doc, eval_context))
                                    except Exception as e:
                                        change_exceptions.append((change, e))
                                    eval_context.reset_iteration()
                            elif (doc_subtype is None
                                    or doc_subtype in adapter.config.get_case_type_or_xmlns_filter()):
                                # Delete if the subtype is unknown or if the subtype
                                # matches our filters, but the full filter no longer applies
                                to_delete_by_adapter[adapter].append(doc)

        with self._datadog_timing('single_batch_delete'):
            # bulk delete by adapter
            to_delete = [{'_id': c.id} for c in changes_chunk if c.deleted]
            for adapter in adapters:
                delete_docs = to_delete_by_adapter[adapter] + to_delete
                if not delete_docs:
                    continue
                with self._datadog_timing('delete', adapter.config._id):
                    try:
                        adapter.bulk_delete(delete_docs)
                    except Exception:
                        delete_ids = [doc['_id'] for doc in delete_docs]
                        retry_changes.update(
                            [c for c in changes_chunk if c.id in delete_ids])

        with self._datadog_timing('single_batch_load'):
            # bulk update by adapter
            for adapter, rows in rows_to_save_by_adapter.items():
                with self._datadog_timing('load', adapter.config._id):
                    try:
                        adapter.save_rows(rows)
                    except Exception:
                        retry_changes.update(to_update)

        if async_configs_by_doc_id:
            with self._datadog_timing('async_config_load'):
                doc_type_by_id = {
                    _id: changes_by_id[_id].metadata.document_type
                    for _id in async_configs_by_doc_id.keys()
                }
                AsyncIndicator.bulk_update_records(async_configs_by_doc_id,
                                                   domain, doc_type_by_id)

        return retry_changes, change_exceptions