def get_iterator(self, missing_items=None):
    """Build a test `ResumableFunctionIterator` over ``self.batches``.

    :param missing_items: unused in this variant; accepted for signature
        compatibility with the item-getter variant of this helper.
    :returns: iterator bound to ``self.couch_db``.
    """
    def load_batch(batch_number):
        # Batch numbers past the end signal exhaustion with an empty list.
        if batch_number < len(self.batches):
            return self.batches[batch_number]
        return []

    iterator = ResumableFunctionIterator('test', load_batch, TestArgsProvider())
    iterator.couch_db = self.couch_db
    return iterator
def get_endkey_docid(domain, doc_type, migration_id):
    """Return the saved resume position for a domain/doc-type migration.

    :returns: ``(startkey, startkey_docid)`` from the persisted iterator
        state.
    :raises AssertionError: if there is no resume state, iteration has
        already completed, or positional args were saved with the state.
    """
    resume_key = f"{domain}.{doc_type}.{migration_id}"
    state = ResumableFunctionIterator(resume_key, None, None, None).state
    assert getattr(state, '_rev', None), "rebuild not necessary (no resume state)"
    assert not state.complete, "iteration is complete"
    state_json = state.to_json()
    assert not state_json['args']
    saved_kwargs = state_json['kwargs']
    return saved_kwargs['startkey'], saved_kwargs['startkey_docid']
class IterationState:
    """Save/restore wrapper around a `ResumableFunctionIterator`'s state.

    Persists snapshots of iteration state in the migration state db so an
    iteration can be rewound or recovered later.
    """

    statedb = attr.ib()
    domain = attr.ib()
    doc_type = attr.ib()

    def __attrs_post_init__(self):
        # Bind the iterator identified by (domain, doc type, migration id).
        # Only its state is used here, so no data/args providers are needed.
        migration_id = self.statedb.unique_id
        resume_key = "%s.%s.%s" % (self.domain, self.doc_type, migration_id)
        self.itr = ResumableFunctionIterator(resume_key, None, None, None)

    @property
    def value(self):
        # Current iteration state as a JSON-serializable dict.
        return self.itr.state.to_json()

    def backup_resume_state(self, value):
        """Attempt to save iteration state in state db

        :param value: iteration state dict. See `self.value`
        :returns: resume key if saved else `None`
        """
        assert isinstance(value, dict), value
        key = f"resume-{value['timestamp']}"
        pretty_value = json.dumps(value, indent=2)
        log.info("saving resume state with key=%s : %s", key, pretty_value)
        old = self.statedb.get(key)
        if old is None:
            self.statedb.set(key, value)
            log.info("saved.")
        elif old != value:
            # Refuse to clobber a different state previously saved under the
            # same key. FIX: log.warn is a deprecated alias of log.warning.
            log.warning("NOT SAVED! refusing to overwrite:\n%s", old)
            return None
        return key

    def restore_resume_state(self, key):
        """Attempt to restore resume state represented by key

        :returns: true if restored else false
        """
        new_state = self.statedb.get(key)
        if new_state is None:
            return False
        # Back up the current state first so the restore is reversible;
        # abort if the backup could not be written.
        key = self.backup_resume_state(self.value)
        if key is None:
            return False
        log.info("restoring iteration state: %s", new_state)
        self.itr._save_state_json(new_state)
        return True

    def drop_from_couch(self):
        """Delete resume state from Couch"""
        try:
            self.itr.couch_db.delete_doc(self.itr.iteration_id)
        except ResourceNotFound:
            # Already gone — nothing to delete.
            pass
def __attrs_post_init__(self):
    """Bind the resumable iterator and dispatch on the rewind target."""
    migration_id = self.statedb.unique_id
    self.itr = ResumableFunctionIterator(
        f"{self.domain}.{self.doc_type}.{migration_id}", None, None, None)
    # Route `move_to` to the first handler whose pattern matches.
    handlers = [
        ("case_rewind", r"^case-(\d+)$"),
        ("resume_rewind", r"^resume-"),
    ]
    for handler_name, pattern in handlers:
        match = re.search(pattern, self.move_to)
        if match:
            getattr(self, handler_name)(match)
            break
    else:
        # No pattern matched: unsupported rewind target.
        raise NotImplementedError(self.move_to)
def get_iterator(self, missing_items=None):
    """Build a test `ResumableFunctionIterator` with an item getter.

    :param missing_items: optional collection of item ids for which the
        item getter returns `None` (simulates missing documents).
    :returns: iterator bound to ``self.couch_db``.
    """
    def load_batch(batch_number):
        try:
            return self.batches[batch_number]
        except IndexError:
            # Past the last batch: iteration is exhausted.
            return []

    def fetch_item(item_id):
        # Simulate a missing document for ids listed in `missing_items`.
        if missing_items and item_id in missing_items:
            return None
        return int(item_id)

    iterator = ResumableFunctionIterator(
        'test', load_batch, TestArgsProvider(), fetch_item)
    iterator.couch_db = self.couch_db
    return iterator
def _iter_missing_ids(db, min_tries, resume_key, view_name, view_params, repair):
    """Build a resumable iterator that scans a Couch view for missing doc ids.

    Each iterator item is the last view result of a scanned key range with a
    ``missing_info`` entry added: ``(missing_ids, tries, repaired_count)``.

    :param db: Couch database to query.
    :param min_tries: minimum query attempts passed to `find_missing_ids`.
    :param resume_key: key under which iteration state is persisted.
    :param view_name: Couch view to scan.
    :param view_params: initial view kwargs for `NoSkipArgsProvider`.
    :param repair: when true, attempt to repair docs found missing.
    :returns: `ResumableFunctionIterator` over per-range result records.
    """
    def data_function(**view_kwargs):
        @retry_on_couch_error
        def get_doc_ids():
            results = list(db.view(view_name, **view_kwargs))
            if "limit" in view_kwargs and results:
                nonlocal last_result
                last_result = results[-1]
                # Pin the scanned window so retries cover the exact same
                # key range as the first query (limit -> endkey bounds).
                replace_limit_with_endkey(view_kwargs, last_result)
            return {r["id"] for r in results}

        def replace_limit_with_endkey(view_kwargs, last_result):
            # Must only happen once per window: endkey_docid set means the
            # limit was already replaced.
            assert "endkey_docid" not in view_kwargs, view_kwargs
            view_kwargs.pop("limit")
            view_kwargs["endkey"] = last_result["key"]
            view_kwargs["endkey_docid"] = last_result["id"]

        # Set by get_doc_ids (via nonlocal) to the final row of the window;
        # None means the view returned no rows at all.
        last_result = None
        missing, tries = find_missing_ids(get_doc_ids, min_tries=min_tries)
        if last_result is None:
            log.debug("no results %s - %s", view_kwargs['startkey'], view_kwargs['endkey'])
            assert not missing
            return []
        if missing and repair:
            missing, tries2, repaired = repair_couch_docs(db, missing, get_doc_ids, min_tries)
            tries += tries2
        else:
            repaired = 0
        log.debug(f"{len(missing)}/{tries} start={view_kwargs['startkey']} {missing or ''}")
        # Attach the scan outcome to the window's last row, which becomes
        # this range's single iterator item (and the resume anchor).
        last_result["missing_info"] = missing, tries, repaired
        return [last_result]

    args_provider = NoSkipArgsProvider(view_params)
    return ResumableFunctionIterator(resume_key, data_function, args_provider)
def _iter_docs(domain, doc_type, resume_key, stopper):
    """Iterate Couch rows (or docs) of `doc_type` in `domain`, resumably.

    A ``doc_type`` of the form ``"Type.field"`` yields ``row["field"]``;
    otherwise full documents are fetched and ``row["doc"]`` is yielded.
    """
    def data_function(**view_kwargs):
        return couch_db.view('by_domain_doc_type_date/view', **view_kwargs)

    row_key = "doc"
    if "." in doc_type:
        doc_type, row_key = doc_type.split(".")
    if stopper.clean_break:
        # A clean break was requested before starting: nothing to iterate.
        return []
    couch_db = XFormInstance.get_db()
    view_params = {
        'startkey': [domain, doc_type],
        'endkey': [domain, doc_type, {}],
        'limit': _iter_docs.chunk_size,
        'include_docs': row_key == "doc",
        'reduce': False,
    }
    rows = ResumableFunctionIterator(
        resume_key,
        data_function,
        NoSkipArgsProvider(view_params),
        item_getter=None,
        event_handler=MigrationPaginationEventHandler(domain, stopper),
    )
    return (row[row_key] for row in rows)
def get_main_forms_iteration_stop_date(domain_name, migration_id):
    """Return the date at which the main XFormInstance iteration stopped.

    :raises AssertionError: if no saved migration state exists.
    """
    resume_key = f"{domain_name}.XFormInstance.{migration_id}"
    saved_kwargs = ResumableFunctionIterator(
        resume_key, None, None, None).state.kwargs
    assert saved_kwargs, f"migration state not found: {resume_key}"
    # this is tightly coupled to by_domain_doc_type_date/view in couch:
    # the last key element is expected to be a datetime
    return saved_kwargs["startkey"][-1]
def get_main_forms_iteration_stop_date(statedb):
    """Return the date at which the main XFormInstance iteration stopped.

    :param statedb: migration state db providing `domain` and `unique_id`.
    :raises AssertionError: if no saved migration state exists.
    """
    resume_key = "%s.%s.%s" % (statedb.domain, "XFormInstance", statedb.unique_id)
    saved_kwargs = ResumableFunctionIterator(
        resume_key, None, None, None).state.kwargs
    assert saved_kwargs, f"migration state not found: {resume_key}"
    # this is tightly coupled to by_domain_doc_type_date/view in couch:
    # the last key element is expected to be a datetime
    return saved_kwargs["startkey"][-1]
def _iter_skipped_form_ids(domain, migration_id, stopper, with_progress):
    """Yield ids of Couch forms not present in SQL, in chunks, resumably."""
    resume_key = f"{domain}.XFormInstance.id.{migration_id}"
    doc_ids = _iter_docs(domain, "XFormInstance.id", resume_key, stopper)
    doc_ids = with_progress(
        ["XFormInstance"], doc_ids, "Scanning", offset_key="XFormInstance.id")
    for batch in chunked(doc_ids, _iter_skipped_form_ids.chunk_size, list):
        yield from _drop_sql_form_ids(batch, domain)
    if not stopper.clean_break:
        # discard iteration state on successful completion so it is possible
        # to run another skipped forms iteration later
        ResumableFunctionIterator(resume_key, None, None, None).discard_state()
def _get_paginated_iterable(data_function, args_provider, event_handler=None,
                            resumable_key=None):
    """Return a paginated iterable, resumable when `resumable_key` is given.

    :param resumable_key: when truthy, iteration state is persisted under
        this key so it can be resumed; otherwise a plain paginator is used.
    """
    if not resumable_key:
        return paginate_function(
            data_function, args_provider, event_handler=event_handler)
    return ResumableFunctionIterator(
        resumable_key, data_function, args_provider,
        lambda item: item.id, event_handler=event_handler)
def _iter_docs(domain, doc_type, resume_key, stopper):
    """Yield Couch rows (or docs) of `doc_type` in `domain`, resumably.

    A ``doc_type`` of the form ``"Type.field"`` yields ``row["field"]``;
    otherwise full documents are fetched and ``row["doc"]`` is yielded.
    Iteration state is logged on resume and again when the generator is
    closed or exhausted, to aid debugging interrupted migrations.
    """
    @retry_on_couch_error
    def data_function(**view_kwargs):
        view_name = 'by_domain_doc_type_date/view'
        results = list(couch_db.view(view_name, **view_kwargs))
        # Guard against results leaking in from another domain.
        assert all(r['key'][0] == domain for r in results), \
            _repr_bad_results(view_name, view_kwargs, results, domain)
        return results

    if "." in doc_type:
        doc_type, row_key = doc_type.split(".")
    else:
        row_key = "doc"
    if stopper.clean_break:
        # A clean break was requested before starting: nothing to iterate.
        return []
    couch_db = XFormInstance.get_db()
    args_provider = NoSkipArgsProvider({
        'startkey': [domain, doc_type],
        'endkey': [domain, doc_type, {}],
        'limit': _iter_docs.chunk_size,
        'include_docs': row_key == "doc",
        'reduce': False,
    })
    rows = ResumableFunctionIterator(
        resume_key,
        data_function,
        args_provider,
        item_getter=None,
        event_handler=MigrationPaginationEventHandler(domain, stopper)
    )
    if rows.state.is_resume():
        log.info("iteration state: %r", rows.state.to_json())
    row = None
    try:
        for row in rows:
            yield row[row_key]
    finally:
        # Runs on exhaustion, break, or generator close: record where
        # iteration ended so a resume point can be diagnosed.
        if row is not None:
            row_copy = dict(row)
            row_copy.pop("doc", None)  # keep the log line small
            log.info("last item: %r", row_copy)
        log.info("final iteration state: %r", rows.state.to_json())
def discard_iteration_state(resume_key):
    """Drop any saved iteration state stored under `resume_key`."""
    itr = ResumableFunctionIterator(resume_key, None, None, None)
    itr.discard_state()
def __attrs_post_init__(self):
    # Bind the resumable iterator identified by (domain, doc type,
    # migration id); only its saved state is used here.
    self.itr = ResumableFunctionIterator(
        f"{self.domain}.{self.doc_type}.{self.statedb.unique_id}",
        None, None, None)
class Rewinder:
    """Rewind a resumable form iteration to an earlier point.

    ``move_to`` selects the mode: ``case-N`` rewinds while scanning forms
    in reverse, or ``resume-<timestamp>`` restores a previously saved
    resume state from the state db.
    """

    statedb = attr.ib()
    domain = attr.ib()
    doc_type = attr.ib()
    move_to = attr.ib()

    def __attrs_post_init__(self):
        migration_id = self.statedb.unique_id
        resume_key = "%s.%s.%s" % (self.domain, self.doc_type, migration_id)
        self.itr = ResumableFunctionIterator(resume_key, None, None, None)
        # Dispatch to the handler whose pattern matches the rewind target.
        for method, regex in [
            ("case_rewind", r"^case-(\d+)$"),
            ("resume_rewind", r"^resume-"),
        ]:
            match = re.search(regex, self.move_to)
            if match:
                getattr(self, method)(match)
                break
        else:
            raise NotImplementedError(self.move_to)

    def resume_rewind(self, match):
        """Restore the saved resume state named by ``self.move_to``."""
        self.offset = None
        new_state = self.statedb.get(self.move_to)
        if new_state is None:
            # BUG FIX: sys.exit() accepts at most one argument, so the old
            # call sys.exit(1, "...") raised TypeError instead of exiting.
            # A string argument prints to stderr and exits with status 1.
            sys.exit("resume state not found")
        # Back up the current state before overwriting it, so this rewind
        # is itself reversible.
        old_state = self.itr.state
        self._save_resume_state(old_state.to_json())
        log.info("restoring iteration state: %s", new_state)
        self.itr._save_state_json(new_state)

    def case_rewind(self, match):
        """Prepare a rewind by case count captured from ``case-N``."""
        self.offset = int(match.group(1))
        self.stats = FormStats()

    def __iter__(self):
        """Yield received-on values while walking forms in reverse order."""
        def data_function(**view_kwargs):
            return couch_db.view('by_domain_doc_type_date/view', **view_kwargs)

        log.info("preparing to rewind: %s", self.move_to)
        # Snapshot current state first so the rewind can be undone.
        state_json = self.itr.state.to_json()
        self._save_resume_state(state_json)
        couch_db = XFormInstance.get_db()
        # Walk backward from the current resume position to the start of
        # the (domain, doc_type) key range.
        args_provider = NoSkipArgsProvider({
            'startkey': state_json["kwargs"]["startkey"],
            'startkey_docid': state_json["kwargs"]["startkey_docid"],
            'endkey': [self.domain, self.doc_type],
            'descending': True,
            'limit': 1000,
            'include_docs': True,
            'reduce': False,
        })
        args, kwargs = args_provider.get_initial_args()
        while True:
            results = list(data_function(*args, **kwargs))
            results = args_provider.adjust_results(results, args, kwargs)
            if not results:
                break
            for result in results:
                yield get_received_on(result["doc"], self.stats)
            try:
                args, kwargs = args_provider.get_next_args(results[-1], *args, **kwargs)
            except StopIteration:
                break

    def save_state(self, received_on):
        """Persist iteration state rewound to ``received_on``."""
        state = self.itr.state
        startkey = state.kwargs["startkey"]
        assert len(startkey) == 3, startkey
        assert isinstance(startkey[-1], type(received_on)), (startkey, received_on)
        # Mutate the key in place; verify the state still references the
        # same list object so the change is actually saved.
        startkey[-1] = received_on
        assert state.kwargs["startkey"] is startkey, (state.kwargs, startkey)
        state.kwargs.pop("startkey_docid")
        state.timestamp = datetime.utcnow()
        self._save_resume_state(state.to_json())

    def _save_resume_state(self, state_json):
        # Save a restorable snapshot keyed by its timestamp; never
        # overwrite a different snapshot saved under the same key.
        assert isinstance(state_json, dict), state_json
        key = f"resume-{state_json['timestamp']}"
        log.info("saving resume state. restore with: rewind --to=%s\n%s",
                 key, state_json)
        old = self.statedb.get(key)
        if old is None:
            log.info("saved.")
            self.statedb.set(key, state_json)
        elif old != state_json:
            # FIX: log.warn is a deprecated alias of log.warning.
            log.warning("NOT SAVED! refusing to overwrite:\n%s", old)