Пример #1
0
    def get_iterator(self, missing_items=None):
        """Build a resumable test iterator reading batches from ``self.batches``.

        :param missing_items: unused in this variant (kept for signature parity)
        :returns: ResumableFunctionIterator bound to ``self.couch_db``
        """
        def data_provider(batch_number):
            # A batch index past the end signals exhaustion with an empty list.
            try:
                batch = self.batches[batch_number]
            except IndexError:
                batch = []
            return batch

        iterator = ResumableFunctionIterator('test', data_provider, TestArgsProvider())
        iterator.couch_db = self.couch_db
        return iterator
Пример #2
0
def get_endkey_docid(domain, doc_type, migration_id):
    """Return the (startkey, startkey_docid) pair of an in-progress iteration.

    Raises AssertionError when no resume state was saved, when the
    iteration already completed, or when positional args are present.
    """
    resume_key = f"{domain}.{doc_type}.{migration_id}"
    saved = ResumableFunctionIterator(resume_key, None, None, None).state
    # A state without a couch _rev was never persisted.
    assert getattr(saved, '_rev', None), "rebuild not necessary (no resume state)"
    assert not saved.complete, "iteration is complete"
    as_json = saved.to_json()
    assert not as_json['args']
    view_kwargs = as_json['kwargs']
    return view_kwargs['startkey'], view_kwargs['startkey_docid']
Пример #3
0
class IterationState:
    """Backup/restore helper for a ResumableFunctionIterator's saved state.

    NOTE(review): the bare ``attr.ib()`` fields imply this class carries an
    ``@attr.s`` decorator in the original module (not visible in this excerpt).
    """
    statedb = attr.ib()  # migration state db with get/set/unique_id
    domain = attr.ib()
    doc_type = attr.ib()

    def __attrs_post_init__(self):
        # The resume key uniquely identifies this migration's iteration.
        migration_id = self.statedb.unique_id
        resume_key = "%s.%s.%s" % (self.domain, self.doc_type, migration_id)
        self.itr = ResumableFunctionIterator(resume_key, None, None, None)

    @property
    def value(self):
        """Current iteration state as a JSON-serializable dict."""
        return self.itr.state.to_json()

    def backup_resume_state(self, value):
        """Attempt to save iteration state in state db

        :param value: iteration state dict. See `self.value`
        :returns: resume key if saved else `None`
        """
        assert isinstance(value, dict), value
        key = f"resume-{value['timestamp']}"
        pretty_value = json.dumps(value, indent=2)
        log.info("saving resume state with key=%s : %s", key, pretty_value)
        old = self.statedb.get(key)
        if old is None:
            self.statedb.set(key, value)
            log.info("saved.")
        elif old != value:
            # Refuse to clobber a *different* backup stored under the same key.
            # (log.warn is a deprecated alias of log.warning)
            log.warning("NOT SAVED! refusing to overwrite:\n%s", old)
            return None
        return key

    def restore_resume_state(self, key):
        """Attempt to restore resume state represented by key

        :returns: true if restored else false
        """
        new_state = self.statedb.get(key)
        if new_state is None:
            return False
        # Back up the current state first so the restore is reversible.
        # (Was previously assigned to `key`, shadowing the parameter.)
        backup_key = self.backup_resume_state(self.value)
        if backup_key is None:
            return False
        log.info("restoring iteration state: %s", new_state)
        self.itr._save_state_json(new_state)
        return True

    def drop_from_couch(self):
        """Delete resume state from Couch"""
        try:
            self.itr.couch_db.delete_doc(self.itr.iteration_id)
        except ResourceNotFound:
            # Already gone — nothing to delete.
            pass
Пример #4
0
 def __attrs_post_init__(self):
     """Create the resume iterator, then dispatch on the ``move_to`` format.

     ``case-<N>`` is handled by ``case_rewind``; ``resume-...`` by
     ``resume_rewind``. Anything else raises NotImplementedError.
     """
     resume_key = "%s.%s.%s" % (
         self.domain, self.doc_type, self.statedb.unique_id)
     self.itr = ResumableFunctionIterator(resume_key, None, None, None)
     handlers = [
         ("case_rewind", r"^case-(\d+)$"),
         ("resume_rewind", r"^resume-"),
     ]
     for name, pattern in handlers:
         match = re.search(pattern, self.move_to)
         if match is not None:
             getattr(self, name)(match)
             return
     raise NotImplementedError(self.move_to)
Пример #5
0
    def get_iterator(self, missing_items=None):
        """Build a resumable test iterator over ``self.batches``.

        :param missing_items: ids that ``item_getter`` should resolve to
            ``None`` (simulating deleted items); other ids become ints.
        """
        def data_provider(index):
            try:
                return self.batches[index]
            except IndexError:
                return []

        def item_getter(item_id):
            absent = missing_items and item_id in missing_items
            return None if absent else int(item_id)

        result = ResumableFunctionIterator(
            'test', data_provider, TestArgsProvider(), item_getter)
        result.couch_db = self.couch_db
        return result
Пример #6
0
def _iter_missing_ids(db, min_tries, resume_key, view_name, view_params, repair):
    """Return a resumable iterator that scans a couch view for missing doc ids.

    Each step queries ``view_name`` repeatedly (via ``find_missing_ids``) and
    yields at most one row: the last result of the range, annotated with a
    ``missing_info`` tuple of (missing ids, total tries, repaired count).

    :param db: couch database to query
    :param min_tries: minimum number of view queries per range
    :param resume_key: key under which iteration state is persisted
    :param view_name: couch view to scan
    :param view_params: initial view kwargs for NoSkipArgsProvider
    :param repair: when true, attempt to repair docs found missing
    """
    def data_function(**view_kwargs):
        @retry_on_couch_error
        def get_doc_ids():
            results = list(db.view(view_name, **view_kwargs))
            if "limit" in view_kwargs and results:
                # Remember where this range ended, and pin subsequent
                # retries to the same endkey instead of a row limit so
                # every retry scans the exact same key range.
                nonlocal last_result
                last_result = results[-1]
                replace_limit_with_endkey(view_kwargs, last_result)
            return {r["id"] for r in results}

        def replace_limit_with_endkey(view_kwargs, last_result):
            # Mutates view_kwargs in place; must only happen once per range.
            assert "endkey_docid" not in view_kwargs, view_kwargs
            view_kwargs.pop("limit")
            view_kwargs["endkey"] = last_result["key"]
            view_kwargs["endkey_docid"] = last_result["id"]

        last_result = None
        missing, tries = find_missing_ids(get_doc_ids, min_tries=min_tries)
        if last_result is None:
            # The view returned no rows at all for this range.
            log.debug("no results %s - %s", view_kwargs['startkey'], view_kwargs['endkey'])
            assert not missing
            return []
        if missing and repair:
            missing, tries2, repaired = repair_couch_docs(db, missing, get_doc_ids, min_tries)
            tries += tries2
        else:
            repaired = 0
        log.debug(f"{len(missing)}/{tries} start={view_kwargs['startkey']} {missing or ''}")
        # Piggyback scan results on the row so the consumer can report them.
        last_result["missing_info"] = missing, tries, repaired
        return [last_result]

    args_provider = NoSkipArgsProvider(view_params)
    return ResumableFunctionIterator(resume_key, data_function, args_provider)
Пример #7
0
    def get_iterator(self, missing_items=None):
        """Return a resumable test iterator backed by ``self.couch_db``.

        Ids listed in ``missing_items`` are resolved to ``None`` by the
        item getter; all other ids are converted to ``int``.
        """
        def data_provider(n):
            try:
                return self.batches[n]
            except IndexError:
                return []

        def item_getter(item_id):
            if missing_items and item_id in missing_items:
                return None
            return int(item_id)

        itr = ResumableFunctionIterator(
            'test',
            data_provider,
            TestArgsProvider(),
            item_getter,
        )
        itr.couch_db = self.couch_db
        return itr
Пример #8
0
def _iter_docs(domain, doc_type, resume_key, stopper):
    """Resumably iterate couch rows/docs of ``doc_type`` in ``domain``.

    A doc type of the form ``"Type.field"`` yields ``row["field"]``;
    a plain type yields the full ``row["doc"]``.
    """
    def data_function(**view_kwargs):
        # couch_db is bound later in this function (closure late binding).
        return couch_db.view('by_domain_doc_type_date/view', **view_kwargs)

    if "." in doc_type:
        doc_type, row_key = doc_type.split(".")
    else:
        row_key = "doc"

    if stopper.clean_break:
        return []
    couch_db = XFormInstance.get_db()
    view_params = {
        'startkey': [domain, doc_type],
        'endkey': [domain, doc_type, {}],
        'limit': _iter_docs.chunk_size,
        'include_docs': row_key == "doc",
        'reduce': False,
    }
    resumable = ResumableFunctionIterator(
        resume_key,
        data_function,
        NoSkipArgsProvider(view_params),
        item_getter=None,
        event_handler=MigrationPaginationEventHandler(domain, stopper))
    return (row[row_key] for row in resumable)
Пример #9
0
def get_main_forms_iteration_stop_date(domain_name, migration_id):
    """Return the date at which the main XFormInstance iteration stopped."""
    resume_key = f"{domain_name}.XFormInstance.{migration_id}"
    state_kwargs = ResumableFunctionIterator(
        resume_key, None, None, None).state.kwargs
    assert state_kwargs, f"migration state not found: {resume_key}"
    # this is tightly coupled to by_domain_doc_type_date/view in couch:
    # the last key element is expected to be a datetime
    return state_kwargs["startkey"][-1]
Пример #10
0
def get_main_forms_iteration_stop_date(statedb):
    """Return the date where the main XFormInstance iteration left off."""
    resume_key = "%s.%s.%s" % (statedb.domain, "XFormInstance", statedb.unique_id)
    iteration = ResumableFunctionIterator(resume_key, None, None, None)
    saved_kwargs = iteration.state.kwargs
    assert saved_kwargs, f"migration state not found: {resume_key}"
    # this is tightly coupled to by_domain_doc_type_date/view in couch:
    # the last key element is expected to be a datetime
    return saved_kwargs["startkey"][-1]
Пример #11
0
def _iter_skipped_form_ids(domain, migration_id, stopper, with_progress):
    """Yield ids of couch forms not present in SQL, in resumable chunks."""
    resume_key = "%s.%s.%s" % (domain, "XFormInstance.id", migration_id)
    doc_ids = _iter_docs(domain, "XFormInstance.id", resume_key, stopper)
    doc_ids = with_progress(
        ["XFormInstance"], doc_ids, "Scanning", offset_key="XFormInstance.id")
    for chunk in chunked(doc_ids, _iter_skipped_form_ids.chunk_size, list):
        yield from _drop_sql_form_ids(chunk, domain)
    if not stopper.clean_break:
        # discard iteration state on successful completion so it is possible
        # to run another skipped forms iteration later
        ResumableFunctionIterator(resume_key, None, None, None).discard_state()
Пример #12
0
 def _get_paginated_iterable(data_function,
                             args_provider,
                             event_handler=None,
                             resumable_key=None):
     """Return a resumable iterator when a key is given, else plain pagination."""
     if not resumable_key:
         return paginate_function(data_function,
                                  args_provider,
                                  event_handler=event_handler)
     return ResumableFunctionIterator(resumable_key,
                                      data_function,
                                      args_provider,
                                      lambda item: item.id,
                                      event_handler=event_handler)
Пример #13
0
def _iter_docs(domain, doc_type, resume_key, stopper):
    """Resumably yield couch rows/docs of ``doc_type`` in ``domain``.

    A doc type of the form ``"Type.field"`` yields ``row["field"]``;
    a plain type yields the full ``row["doc"]``. Iteration state is
    persisted under ``resume_key`` and logged on entry and exit.
    """
    @retry_on_couch_error
    def data_function(**view_kwargs):
        view_name = 'by_domain_doc_type_date/view'
        results = list(couch_db.view(view_name, **view_kwargs))
        # Guard against cross-domain rows leaking into the result set.
        assert all(r['key'][0] == domain for r in results), \
            _repr_bad_results(view_name, view_kwargs, results, domain)
        return results

    if "." in doc_type:
        # "Type.field" means yield row["field"] instead of the full doc.
        doc_type, row_key = doc_type.split(".")
    else:
        row_key = "doc"

    if stopper.clean_break:
        return []
    # Bound here, after data_function's definition; the closure resolves
    # couch_db at call time (late binding), so this ordering is safe.
    couch_db = XFormInstance.get_db()
    args_provider = NoSkipArgsProvider({
        'startkey': [domain, doc_type],
        'endkey': [domain, doc_type, {}],
        'limit': _iter_docs.chunk_size,
        'include_docs': row_key == "doc",
        'reduce': False,
    })
    rows = ResumableFunctionIterator(
        resume_key,
        data_function,
        args_provider,
        item_getter=None,
        event_handler=MigrationPaginationEventHandler(domain, stopper))
    if rows.state.is_resume():
        log.info("iteration state: %r", rows.state.to_json())
    row = None
    try:
        for row in rows:
            yield row[row_key]
    finally:
        # Runs on exhaustion, break, or generator close: record where we
        # stopped so an interrupted run can be diagnosed/resumed.
        if row is not None:
            row_copy = dict(row)
            row_copy.pop("doc", None)  # keep the log line small
            log.info("last item: %r", row_copy)
        log.info("final iteration state: %r", rows.state.to_json())
Пример #14
0
 def discard_iteration_state(resume_key):
     """Drop any saved iteration state stored under ``resume_key``."""
     itr = ResumableFunctionIterator(resume_key, None, None, None)
     itr.discard_state()
Пример #15
0
 def __attrs_post_init__(self):
     """Attach a resumable iterator keyed by domain, doc type and migration id."""
     key_parts = (self.domain, self.doc_type, self.statedb.unique_id)
     resume_key = "%s.%s.%s" % key_parts
     self.itr = ResumableFunctionIterator(resume_key, None, None, None)
Пример #16
0
class Rewinder:
    """Rewind a migration's main-forms iteration to an earlier point.

    ``move_to`` selects the rewind mode: ``case-<N>`` rewinds by a case
    count, while ``resume-<timestamp>`` restores a resume state
    previously saved in the state db.

    NOTE(review): the bare ``attr.ib()`` fields imply an ``@attr.s``
    decorator in the original module (not visible in this excerpt).
    """
    statedb = attr.ib()  # migration state db with get/set/unique_id
    domain = attr.ib()
    doc_type = attr.ib()
    move_to = attr.ib()  # rewind target: "case-<N>" or "resume-<timestamp>"

    def __attrs_post_init__(self):
        migration_id = self.statedb.unique_id
        resume_key = "%s.%s.%s" % (self.domain, self.doc_type, migration_id)
        self.itr = ResumableFunctionIterator(resume_key, None, None, None)
        # Dispatch on the move_to format; for/else raises when none match.
        for method, regex in [
            ("case_rewind", r"^case-(\d+)$"),
            ("resume_rewind", r"^resume-"),
        ]:
            match = re.search(regex, self.move_to)
            if match:
                getattr(self, method)(match)
                break
        else:
            raise NotImplementedError(self.move_to)

    def resume_rewind(self, match):
        """Restore a saved resume state, backing up the current one first."""
        self.offset = None
        new_state = self.statedb.get(self.move_to)
        if new_state is None:
            # BUG FIX: sys.exit() accepts at most one argument; the old
            # sys.exit(1, "resume state not found") raised TypeError.
            # Passing the message prints it to stderr and exits with
            # status 1.
            sys.exit("resume state not found")
        old_state = self.itr.state
        self._save_resume_state(old_state.to_json())
        log.info("restoring iteration state: %s", new_state)
        self.itr._save_state_json(new_state)

    def case_rewind(self, match):
        """Record the case-count offset captured from ``case-<N>``."""
        self.offset = int(match.group(1))
        self.stats = FormStats()

    def __iter__(self):
        """Walk backwards from the current iteration position, yielding
        the received-on value of each form doc encountered."""
        def data_function(**view_kwargs):
            return couch_db.view('by_domain_doc_type_date/view', **view_kwargs)

        log.info("preparing to rewind: %s", self.move_to)
        state_json = self.itr.state.to_json()
        # Back up the pre-rewind state so the walk is reversible.
        self._save_resume_state(state_json)
        couch_db = XFormInstance.get_db()
        args_provider = NoSkipArgsProvider({
            'startkey': state_json["kwargs"]["startkey"],
            'startkey_docid': state_json["kwargs"]["startkey_docid"],
            'endkey': [self.domain, self.doc_type],
            'descending': True,
            'limit': 1000,
            'include_docs': True,
            'reduce': False,
        })
        args, kwargs = args_provider.get_initial_args()
        while True:
            results = list(data_function(*args, **kwargs))
            results = args_provider.adjust_results(results, args, kwargs)
            if not results:
                break
            for result in results:
                yield get_received_on(result["doc"], self.stats)
            try:
                args, kwargs = args_provider.get_next_args(results[-1], *args, **kwargs)
            except StopIteration:
                break

    def save_state(self, received_on):
        """Move the iteration's startkey back to ``received_on`` and save."""
        state = self.itr.state
        startkey = state.kwargs["startkey"]
        assert len(startkey) == 3, startkey
        assert isinstance(startkey[-1], type(received_on)), (startkey, received_on)
        startkey[-1] = received_on  # mutates state.kwargs in place
        assert state.kwargs["startkey"] is startkey, (state.kwargs, startkey)
        # Drop the doc anchor: the new position is keyed by date only.
        state.kwargs.pop("startkey_docid")
        state.timestamp = datetime.utcnow()
        self._save_resume_state(state.to_json())

    def _save_resume_state(self, state_json):
        """Save ``state_json`` under a timestamped key, refusing to
        overwrite a different state saved under the same key."""
        assert isinstance(state_json, dict), state_json
        key = f"resume-{state_json['timestamp']}"
        log.info("saving resume state. restore with: rewind --to=%s\n%s",
                 key, state_json)
        old = self.statedb.get(key)
        if old is None:
            # Save first, then report success (previously "saved." was
            # logged before the set() call actually ran).
            self.statedb.set(key, state_json)
            log.info("saved.")
        elif old != state_json:
            # log.warn is a deprecated alias of log.warning
            log.warning("NOT SAVED! refusing to overwrite:\n%s", old)