class Migrator:
    '''
    Reindex/Migration...
    Reindex is going to behave much the same as migration would so we're
    using this for both cases most of the time...

    In order to do a *live* reindex, we need to follow these steps...

    1. Create a next index
        - if already exists, fail
            - unless, "force" option provided
    2. Put new mapping on next index
    3. Mark that there is a new next index on the container object
    4. All new index/delete operations are done on...
        - new index
        - and existing index
        - ... existing queries stay on current index
    5. Copy existing index data over to the next index
    6. Get a list of all existing doc ids on index
    7. Take diff of existing mapping to new mapping
    8. Crawl content
        - check if doc does not exist
            - make sure it wasn't added in the mean time
                - if it was, do an update with diff instead
            - record it
            - do complete index
        - if doc exist
            - if diff mapping exists
                - update fields in diff on doc
            - else, do nothing
            - remove for list of existing doc ids
    9. Go through list of existing doc ids
        - double check not on container(query db)
        - delete doc if not in container
            - record it
    10. Refresh db container ob
    11. Point alias at next index
    12. Delete old index
    '''

    def __init__(self, utility, context, response=noop_response, force=False,
                 log_details=False, memory_tracking=False, request=None,
                 bulk_size=40, full=False, reindex_security=False,
                 mapping_only=False,  # noqa
                 index_manager=None, children_only=False, lookup_index=False,
                 cache=True):
        # `utility` is the ES utility (owns the connection); `context` is the
        # content object the (re)index starts from.
        self.utility = utility
        self.conn = utility.conn
        self.context = context
        self.response = response
        self.force = force
        self.full = full
        self.log_details = log_details
        self.memory_tracking = memory_tracking
        self.bulk_size = bulk_size
        self.reindex_security = reindex_security
        self.children_only = children_only
        self.lookup_index = lookup_index
        # `full` reindexes every field; `mapping_only` only pushes mapping
        # changes — the two are mutually exclusive by definition.
        if mapping_only and full:
            raise Exception(
                'Can not do a full reindex and a mapping only migration')
        self.mapping_only = mapping_only

        if request is None:
            self.request = get_current_request()
        else:
            self.request = request
        if not cache:
            # make sure that we don't cache requests...
            self.request._txn._cache = DummyCache(self.request._txn)
        self.container = self.request.container
        if index_manager is None:
            self.index_manager = get_adapter(self.container, IIndexManager)
        else:
            self.index_manager = index_manager

        self.interaction = IInteraction(self.request)
        self.indexer = Indexer()

        # progress/bookkeeping state for the crawl
        self.batch = {}          # uuid -> {'action': ..., 'data': ...} pending bulk ops
        self.indexed = 0
        self.processed = 0
        self.missing = []        # uuids found in db but not in the index
        self.orphaned = []       # uuids found in the index but not in the db
        self.existing = []       # uuids currently present in the index
        self.errors = []
        self.mapping_diff = {}
        self.start_time = self.index_start_time = time.time()
        self.reindex_futures = []
        self.status = 'started'
        self.active_task_id = None

        self.copied_docs = 0

        self.work_index_name = None
        self.sub_indexes = []

    def per_sec(self):
        # objects processed per second since indexing started
        return self.processed / (time.time() - self.index_start_time)

    async def create_next_index(self):
        '''
        Register the migration on the index manager and create the new
        (next) ES index, deleting a leftover one first when `force` is set.
        '''
        async with managed_transaction(
                self.request, write=True, adopt_parent_txn=True) as txn:
            await txn.refresh(await self.index_manager.get_registry())
            next_index_name = await self.index_manager.start_migration()
        if await self.conn.indices.exists(next_index_name):
            if self.force:
                # delete and recreate
                self.response.write('Clearing index')
                resp = await self.conn.indices.delete(next_index_name)
                assert resp['acknowledged']
        await self.utility.create_index(next_index_name, self.index_manager)
        return next_index_name

    async def copy_to_next_index(self):
        '''
        Kick off a server-side `_reindex` from the current real index into
        the work index and poll the resulting task every 10s until done,
        reporting progress through `self.response`.
        '''
        conn_es = await self.conn.transport.get_connection()
        real_index_name = await self.index_manager.get_index_name()
        # run _reindex asynchronously on the ES side so we can poll the task
        async with conn_es.session.post(
                join(str(conn_es.base_url), '_reindex'),
                params={'wait_for_completion': 'false'},
                headers={'Content-Type': 'application/json'},
                data=json.dumps({
                    "source": {
                        "index": real_index_name,
                        "size": 100
                    },
                    "dest": {
                        "index": self.work_index_name
                    }
                })) as resp:
            data = await resp.json()
        self.active_task_id = task_id = data['task']
        while True:
            await asyncio.sleep(10)
            async with conn_es.session.get(
                    join(str(conn_es.base_url), '_tasks', task_id),
                    headers={'Content-Type': 'application/json'}) as resp:
                # 400/404: task record is gone — treat as finished
                if resp.status in (400, 404):
                    break
                data = await resp.json()
                if data['completed']:
                    break
                status = data["task"]["status"]
                self.response.write(
                    f'{status["created"]}/{status["total"]} - '
                    f'Copying data to new index. task id: {task_id}')
                self.copied_docs = status["created"]
        self.active_task_id = None

        response = data['response']
        failures = response['failures']
        if len(failures) > 0:
            failures = json.dumps(failures, sort_keys=True, indent=4,
                                  separators=(',', ': '))
            self.response.write(
                f'Reindex encountered failures: {failures}')
        else:
            self.response.write(
                f'Finished copying to new index: {self.copied_docs}')

    async def get_all_uids(self):
        '''
        Scroll through the current index and return every doc id in it.
        '''
        self.response.write('Retrieving existing doc ids')
        page_size = 3000
        ids = []
        index_name = await self.index_manager.get_index_name()
        result = await self.conn.search(
            index=index_name,
            scroll='2m',
            size=page_size,
            stored_fields='',
            _source=False,
            body={
                "sort": ["_doc"]
            })
        ids.extend([r['_id'] for r in result['hits']['hits']])
        scroll_id = result['_scroll_id']
        while scroll_id:
            result = await self.utility.conn.scroll(
                scroll_id=scroll_id,
                scroll='2m')
            if len(result['hits']['hits']) == 0:
                break
            ids.extend([r['_id'] for r in result['hits']['hits']])
            self.response.write(f'Retrieved {len(ids)} doc ids')
            scroll_id = result['_scroll_id']
        self.response.write(
            f'Retrieved {len(ids)}. Copied {self.copied_docs} docs')
        return ids

    async def calculate_mapping_diff(self):
        '''
        all we care about is new fields...
        Missing ones are ignored and we don't care about it.
        '''
        next_mappings = await self.conn.indices.get_mapping(
            self.work_index_name)
        next_mappings = next_mappings[self.work_index_name]['mappings']
        next_mappings = next_mappings[DOC_TYPE]['properties']

        existing_index_name = await self.index_manager.get_real_index_name()
        try:
            existing_mappings = await self.conn.indices.get_mapping(
                existing_index_name)
        except elasticsearch.exceptions.NotFoundError:
            # allows us to upgrade when no index is present yet
            return next_mappings
        existing_mappings = existing_mappings[existing_index_name]['mappings']
        existing_mappings = existing_mappings[DOC_TYPE]['properties']

        # keep only fields that are new or whose (cleaned) definition changed
        new_definitions = {}
        for field_name, definition in next_mappings.items():
            definition = _clean_mapping(definition)
            if (field_name not in existing_mappings or
                    definition != _clean_mapping(existing_mappings[field_name])):  # noqa
                new_definitions[field_name] = definition
        return new_definitions

    async def process_folder(self, ob):
        '''
        Process every child of `ob`, skipping children that can no longer
        be loaded (missing key or unimportable class).
        '''
        for key in await ob.async_keys():
            try:
                item = await ob._p_jar.get_child(ob, key)
            except (KeyError, ModuleNotFoundError):
                continue
            if item is None:
                continue
            await self.process_object(item)
            # drop references eagerly to keep memory flat on big trees
            del item
        del ob

    async def process_object(self, ob):
        '''
        - check if doc does not exist
            - record it
            - do complete index
        - if doc exist
            - if diff mapping exists
                - update fields in diff on doc
            - else, do nothing
        - remove for list of existing doc ids
        '''
        full = False
        if ob.uuid not in self.existing:
            self.missing.append(ob.uuid)
            full = True
        else:
            self.existing.remove(ob.uuid)
        await self.index_object(ob, full=full)
        self.processed += 1

        if IIndexActive.providedBy(ob):
            # object owns its own sub index; migrate it separately at the end
            self.sub_indexes.append(ob)
        else:
            if IFolder.providedBy(ob):
                await self.process_folder(ob)

        if not IContainer.providedBy(ob):
            # NOTE(review): this deletes the annotation cache on the
            # *container* for every processed object — presumably a memory
            # relief measure, but it looks like it may have been meant to
            # target `ob` instead; confirm against upstream.
            try:
                del self.container.__gannotations__
            except AttributeError:
                del self.container.__annotations__
        del ob

    async def index_object(self, ob, full=False):
        '''
        Queue one object into the bulk batch. Security-only reindexes use
        ISecurityInfo; `full` uses the complete catalog data; otherwise only
        the fields present in `self.mapping_diff` are refreshed.
        '''
        batch_type = 'update'
        if self.reindex_security:
            try:
                data = ISecurityInfo(ob)()
            except TypeError:
                self.response.write(f'Could not index {ob}')
                return
        elif full or self.full:
            try:
                data = await ICatalogDataAdapter(ob)()
            except TypeError:
                self.response.write(f'Could not index {ob}')
                return
            batch_type = 'index'
        else:
            data = {
                # always need these...
                'type_name': ob.type_name
            }
            for index_name in self.mapping_diff.keys():
                val = await self.indexer.get_value(ob, index_name)
                if val is not None:
                    data[index_name] = val

        if ob._p_serial:
            data['tid'] = ob._p_serial

        self.indexed += 1
        self.batch[ob.uuid] = {'action': batch_type, 'data': data}
        if self.lookup_index:
            # route the doc to the index owned by the nearest index manager
            im = find_index_manager(ob)
            if im:
                self.batch[ob.uuid]['__index__'] = await im.get_index_name()

        if self.log_details:
            self.response.write(
                f'({self.processed} {int(self.per_sec())}) '
                f'Object: {get_content_path(ob)}, '
                f'Type: {batch_type}, Buffer: {len(self.batch)}')

        await self.attempt_flush()

    async def attempt_flush(self):
        '''
        Periodic housekeeping (every 500 objects: cache invalidation, gc,
        optional memory report) plus a bulk flush once the batch reaches
        `bulk_size`.
        '''
        if self.processed % 500 == 0:
            self.interaction.invalidate_cache()
            num, _, _ = gc.get_count()
            gc.collect()
            if self.memory_tracking:
                total_memory = round(
                    resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0, 1)  # noqa
                self.response.write(
                    b'Memory usage: % 2.2f MB, cleaned: %d, total in-memory obs: %d' % (  # noqa
                        total_memory, num, len(gc.get_objects())))
            self.response.write(b'Indexing new batch, totals: (%d %d/sec)\n' % (  # noqa
                self.indexed, int(self.per_sec()),
            ))
        if len(self.batch) >= self.bulk_size:
            await notify(IndexProgress(
                self.request, self.context, self.processed,
                (len(self.existing) + len(self.missing))))
            await self.flush()

    async def join_futures(self):
        # wait for all in-flight bulk requests to finish
        for future in self.reindex_futures:
            if not future.done():
                await asyncio.wait_for(future, None)
        self.reindex_futures = []

    @backoff.on_exception(
        backoff.constant,
        (asyncio.TimeoutError, elasticsearch.exceptions.ConnectionTimeout),
        interval=1, max_tries=5)
    async def _index_batch(self, batch):
        '''
        Send one batch through the ES bulk API. Item-level 409/429 errors are
        re-queued for retry; 404 on an update is re-queued as a full index;
        anything else is logged.
        '''
        bulk_data = []
        for _id, payload in batch.items():
            index = payload.pop('__index__', self.work_index_name)
            action_data = {
                '_index': index,
                '_id': _id
            }
            data = payload['data']
            if payload['action'] == 'update':
                data = {'doc': data}
                action_data['_retry_on_conflict'] = 3
            bulk_data.append({
                payload['action']: action_data
            })
            if payload['action'] != 'delete':
                bulk_data.append(data)
        results = await self.utility.conn.bulk(
            index=self.work_index_name, doc_type=DOC_TYPE, body=bulk_data)

        if results['errors']:
            errors = []
            for result in results['items']:
                for key, value in result.items():
                    if not isinstance(value, dict):
                        continue
                    if 'status' in value and value['status'] != 200:
                        _id = value.get('_id')
                        # retry conflict errors and thread pool exceeded errors
                        if value['status'] in (409, 429):
                            self.batch[_id] = batch[_id]
                        elif value['status'] == 404:
                            # doc vanished from the index; promote to full index
                            self.batch[_id] = batch[_id]
                            self.batch[_id]['action'] = 'index'
                        else:
                            errors.append(f'{_id}: {value["status"]}')
            if len(errors) > 0:
                logger.warning(f'Error bulk putting: {errors}')

    async def flush(self):
        '''
        Fire the current batch as a background bulk request; cap the number
        of concurrent in-flight requests.
        '''
        if len(self.batch) == 0:
            # nothing to flush
            return
        future = asyncio.ensure_future(self._index_batch(self.batch))
        self.batch = {}
        self.reindex_futures.append(future)
        if len(self.reindex_futures) > 7:
            await self.join_futures()

    async def check_existing(self):
        '''
        Go through self.existing and see why it wasn't processed
        '''
        for uuid in self.existing:
            try:
                ob = await self.context._p_jar.get(uuid)
            except KeyError:
                ob = None
            if ob is None:
                try:
                    self.batch[uuid] = {'action': 'delete', 'data': {}}
                    await self.attempt_flush()
                    # no longer present on db, this was orphaned
                    self.orphaned.append(uuid)
                except aioelasticsearch.exceptions.NotFoundError:
                    # it was deleted in the meantime so we're actually okay
                    self.orphaned.append(uuid)
            else:
                # XXX this should not happen so log it. Maybe we'll try
                # doing something about it another time...
                self.errors.append({
                    'type': 'unprocessed',
                    'uuid': uuid
                })

    async def setup_next_index(self):
        # create the work index under the migration lock so concurrent
        # migrations on the same alias can't race
        self.response.write(b'Creating new index')
        async with get_migration_lock(
                await self.index_manager.get_index_name()):
            self.work_index_name = await self.create_next_index()
        return self.work_index_name

    async def cancel_migration(self):
        '''
        Abort an in-progress migration: disable the next index, cancel any
        running ES `_reindex` task and delete the work index.
        '''
        # canceling the migration, clearing index
        self.response.write('Canceling migration')
        async with managed_transaction(
                self.request, write=True, adopt_parent_txn=True):
            await self.index_manager.cancel_migration()
            self.response.write('Next index disabled')
        if self.active_task_id is not None:
            self.response.write('Canceling copy of index task')
            conn_es = await self.conn.transport.get_connection()
            async with conn_es.session.post(
                    join(str(conn_es.base_url), '_tasks',
                         self.active_task_id, '_cancel'),
                    headers={'Content-Type': 'application/json'}):
                # give ES a moment to act on the cancellation
                await asyncio.sleep(5)
        if self.work_index_name:
            self.response.write('Deleting new index')
            await self.conn.indices.delete(self.work_index_name)

        self.response.write('Migration canceled')

    async def run_migration(self):
        '''
        Execute the full live-reindex flow documented on the class: set up
        the next index, copy/crawl data into it, swap the alias, drop the
        old index, then recurse into any sub indexes found.
        '''
        alias_index_name = await self.index_manager.get_index_name()
        existing_index = await self.index_manager.get_real_index_name()

        await self.setup_next_index()

        self.mapping_diff = await self.calculate_mapping_diff()
        diff = json.dumps(self.mapping_diff, sort_keys=True, indent=4,
                          separators=(',', ': '))
        self.response.write(f'Caculated mapping diff: {diff}')

        if not self.full:
            # if full, we're reindexing everything does not matter what
            # anyways, so skip
            self.response.write(f'Copying initial index {existing_index} '
                                f'into {self.work_index_name}')
            try:
                await self.copy_to_next_index()
                self.response.write('Copying initial index data finished')
            except elasticsearch.exceptions.NotFoundError:
                self.response.write('No initial index to copy to')

        if not self.mapping_only:
            try:
                self.existing = await self.get_all_uids()
            except elasticsearch.exceptions.NotFoundError:
                pass

            self.index_start_time = time.time()
            if self.children_only or IContainer.providedBy(self.context):
                await self.process_folder(self.context)  # this is recursive
            else:
                await self.process_object(self.context)  # this is recursive

            await self.check_existing()

            await self.flush()
            await self.join_futures()

        async with get_migration_lock(
                await self.index_manager.get_index_name()):
            self.response.write('Activating new index')
            async with managed_transaction(
                    self.request, write=True, adopt_parent_txn=True):
                await self.index_manager.finish_migration()
            self.status = 'done'

            self.response.write(f'''Update alias({alias_index_name}):
{existing_index} -> {self.work_index_name}
''')
            try:
                await self.conn.indices.update_aliases({
                    "actions": [{
                        "remove": {
                            "alias": alias_index_name,
                            "index": existing_index
                        }
                    }, {
                        "add": {
                            "alias": alias_index_name,
                            "index": self.work_index_name
                        }
                    }]
                })
            except elasticsearch.exceptions.NotFoundError:
                # no alias pointing at an old index yet — just add
                await self.conn.indices.update_aliases({
                    "actions": [{
                        "add": {
                            "alias": alias_index_name,
                            "index": self.work_index_name
                        }
                    }]
                })

        try:
            await self.conn.indices.close(existing_index)
            await self.conn.indices.delete(existing_index)
            self.response.write('Old index deleted')
        except elasticsearch.exceptions.NotFoundError:
            pass

        if len(self.sub_indexes) > 0:
            self.response.write(
                f'Migrating sub indexes: {len(self.sub_indexes)}')
            for ob in self.sub_indexes:
                im = get_adapter(ob, IIndexManager)
                migrator = Migrator(
                    self.utility, ob, response=self.response,
                    force=self.force, log_details=self.log_details,
                    memory_tracking=self.memory_tracking,
                    request=self.request, bulk_size=self.bulk_size,
                    full=self.full, reindex_security=self.reindex_security,
                    mapping_only=self.mapping_only, index_manager=im,
                    children_only=True)
                self.response.write(f'Migrating index for: {ob}')
                await migrator.run_migration()
class Migrator:
    '''
    Reindex/Migration...
    Reindex is going to behave much the same as migration would so we're
    using this for both cases most of the time...

    In order to do a *live* reindex, we need to follow these steps...

    1. Create a next index
        - if already exists, fail
            - unless, "force" option provided
    2. Put new mapping on next index
    3. Mark that there is a new next index on the container object
    4. All new index/delete operations are done on...
        - new index
        - and existing index
        - ... existing queries stay on current index
    5. Copy existing index data over to the next index
    6. Get a list of all existing doc ids on index
    7. Take diff of existing mapping to new mapping
    8. Crawl content
        - check if doc does not exist
            - make sure it wasn't added in the mean time
                - if it was, do an update with diff instead
            - record it
            - do complete index
        - if doc exist
            - if diff mapping exists
                - update fields in diff on doc
            - else, do nothing
            - remove for list of existing doc ids
    9. Go through list of existing doc ids
        - double check not on container(query db)
        - delete doc if not in container
            - record it
    10. Refresh db container ob
    11. Point alias at next index
    12. Delete old index

    TODO:
        - optionally fill metadata in indexing
            - requires more work...
    '''

    def __init__(self, utility, context, response=noop_response, force=False,
                 log_details=False, memory_tracking=False, request=None,
                 bulk_size=40, full=False, reindex_security=False,
                 mapping_only=False):
        # `utility` is the ES utility (owns the connection); `context` is the
        # content object the (re)index starts from.
        self.utility = utility
        self.conn = utility.conn
        self.context = context
        self.response = response
        self.force = force
        self.full = full
        self.log_details = log_details
        self.memory_tracking = memory_tracking
        self.bulk_size = bulk_size
        self.reindex_security = reindex_security
        # `full` reindexes every field; `mapping_only` only pushes mapping
        # changes — the two are mutually exclusive by definition.
        if mapping_only and full:
            raise Exception(
                'Can not do a full reindex and a mapping only migration')
        self.mapping_only = mapping_only

        if request is None:
            self.request = get_current_request()
        else:
            self.request = request
        # make sure that we don't cache requests...
        self.request._txn._cache = DummyCache(self.request._txn)
        self.container = self.request.container
        self.interaction = IInteraction(self.request)
        self.indexer = Indexer()

        # progress/bookkeeping state for the crawl
        self.batch = {}          # uuid -> {'action': ..., 'data': ...} pending bulk ops
        self.indexed = 0
        self.processed = 0
        self.missing = []        # uuids found in db but not in the index
        self.orphaned = []       # uuids found in the index but not in the db
        self.existing = []       # uuids currently present in the index
        self.errors = []
        self.mapping_diff = {}
        self.start_time = self.index_start_time = time.time()
        self.reindex_futures = []
        self.status = 'started'
        self.active_task_id = None

        self.copied_docs = 0

        self.work_index_name = None

    def per_sec(self):
        # objects processed per second since indexing started
        return self.processed / (time.time() - self.index_start_time)

    async def create_next_index(self):
        '''
        Compute the next versioned index name and create it, deleting a
        leftover index of that name first when `force` is set.
        Returns (next_version, next_index_name).
        '''
        version = await self.utility.get_version(self.container,
                                                 request=self.request)
        next_version = version + 1
        index_name = await self.utility.get_index_name(self.container,
                                                       request=self.request)
        next_index_name = index_name + '_' + str(next_version)
        if await self.conn.indices.exists(next_index_name):
            if self.force:
                # delete and recreate
                self.response.write('Clearing index')
                resp = await self.conn.indices.delete(next_index_name)
                assert resp['acknowledged']
        await self.conn.indices.create(next_index_name)
        return next_version, next_index_name

    async def copy_to_next_index(self):
        '''
        Kick off a server-side `_reindex` from the current real index into
        the work index and poll the resulting task every 10s until done,
        reporting progress through `self.response`.
        '''
        conn_es = await self.conn.transport.get_connection()
        real_index_name = await self.utility.get_index_name(
            self.container, self.request)
        # run _reindex asynchronously on the ES side so we can poll the task
        async with conn_es._session.post(
                str(conn_es._base_url) + '_reindex',
                params={'wait_for_completion': 'false'},
                data=json.dumps({
                    "source": {
                        "index": real_index_name,
                        "size": 100
                    },
                    "dest": {
                        "index": self.work_index_name
                    }
                })) as resp:
            data = await resp.json()
        self.active_task_id = task_id = data['task']
        while True:
            await asyncio.sleep(10)
            async with conn_es._session.get(
                    str(conn_es._base_url) + '_tasks/' + task_id) as resp:
                # 400/404: task record is gone — treat as finished
                if resp.status in (400, 404):
                    break
                data = await resp.json()
                if data['completed']:
                    break
                status = data["task"]["status"]
                self.response.write(
                    f'{status["created"]}/{status["total"]} - '
                    f'Copying data to new index. task id: {task_id}')
                self.copied_docs = status["created"]
        self.active_task_id = None

        response = data['response']
        failures = response['failures']
        if len(failures) > 0:
            failures = json.dumps(failures, sort_keys=True, indent=4,
                                  separators=(',', ': '))
            self.response.write(
                f'Reindex encountered failures: {failures}')
        else:
            self.response.write(
                f'Finished copying to new index: {self.copied_docs}')

    async def get_all_uids(self):
        '''
        Scroll through the current index and return every doc id in it.
        '''
        self.response.write('Retrieving existing doc ids')
        page_size = 3000
        ids = []
        index_name = await self.utility.get_index_name(self.container)
        result = await self.conn.search(
            index=index_name,
            scroll='2m',
            size=page_size,
            stored_fields='',
            body={
                "sort": ["_doc"]
            })
        ids.extend([r['_id'] for r in result['hits']['hits']])
        scroll_id = result['_scroll_id']
        while scroll_id:
            result = await self.utility.conn.scroll(
                scroll_id=scroll_id,
                scroll='2m')
            if len(result['hits']['hits']) == 0:
                break
            ids.extend([r['_id'] for r in result['hits']['hits']])
            self.response.write(f'Retrieved {len(ids)} doc ids')
            scroll_id = result['_scroll_id']
        self.response.write(
            f'Retrieved {len(ids)}. Copied {self.copied_docs} docs')
        return ids

    async def calculate_mapping_diff(self):
        '''
        all we care about is new fields...
        Missing ones are ignored and we don't care about it.
        '''
        diffs = {}
        existing_index_name = await self.utility.get_real_index_name(
            self.container, self.request)
        existing_mappings = await self.conn.indices.get_mapping(
            existing_index_name)
        existing_mappings = existing_mappings[existing_index_name]['mappings']

        next_mappings = await self.conn.indices.get_mapping(
            self.work_index_name)
        next_mappings = next_mappings[self.work_index_name]['mappings']

        changes = False
        for type_name in existing_mappings.keys():
            if type_name not in next_mappings:
                # copy over orphaned type otherwise move will potentially
                # not work
                # any orphaned doc types will need to be manually deleted
                # for now...
                mapping = existing_mappings[type_name]
                properties = mapping['properties']
                # need to make sure to normalize field definitions so they
                # are inline with new mappings otherwise you could get
                # conflicting definitions
                for field_name in properties.keys():
                    for check_type_name in next_mappings.keys():
                        if field_name in next_mappings[check_type_name][
                                'properties']:
                            properties[field_name] = next_mappings[
                                check_type_name]['properties'][field_name]
                            break
                # and install new mapping
                await self.utility.conn.indices.put_mapping(
                    self.work_index_name, type_name, mapping)
                changes = True

        if changes:
            # we add to the mappings so we need to update...
            next_mappings = await self.conn.indices.get_mapping(
                self.work_index_name)
            next_mappings = next_mappings[self.work_index_name]['mappings']

        # per registered content type, compute which field definitions are
        # new or changed compared to the existing index
        for type_name, schema in get_utilities_for(IResourceFactory):
            new_definitions = {}
            if type_name not in existing_mappings:
                diffs[type_name] = next_mappings[type_name]['properties']
                continue

            existing_mapping = existing_mappings[type_name]['properties']
            next_mapping = next_mappings[type_name]['properties']

            for field_name, definition in next_mapping.items():
                definition = _clean_mapping(definition)
                if (field_name not in existing_mapping or
                        definition != _clean_mapping(existing_mapping[field_name])):  # noqa
                    new_definitions[field_name] = definition
            if len(new_definitions) > 0:
                diffs[type_name] = new_definitions

        for type_name, mapping in existing_mappings.items():
            if type_name not in next_mappings:
                # special case here... we need to import this mapping still
                # in order for the index copy to work correctly if docs ref it
                self.response.write(
                    f'Backporting mapping of {type_name} to new '
                    f'even though it is not defined anymore')
                await self.conn.indices.put_mapping(
                    self.work_index_name, type_name, mapping)

        return diffs

    async def process_folder(self, ob):
        '''
        Process every child of `ob`, skipping children that can no longer
        be loaded.
        '''
        for key in await ob.async_keys():
            try:
                item = await ob._p_jar.get_child(ob, key)
            except KeyError:
                continue
            if item is None:
                continue
            await self.process_object(item)
            # drop references eagerly to keep memory flat on big trees
            del item
        del ob

    async def process_object(self, ob):
        '''
        - check if doc does not exist
            - record it
            - do complete index
        - if doc exist
            - if diff mapping exists
                - update fields in diff on doc
            - else, do nothing
        - remove for list of existing doc ids
        '''
        clear_conn_statement_cache(await ob._p_jar.get_connection())
        full = False
        if ob.uuid not in self.existing:
            self.missing.append(ob.uuid)
            full = True
        else:
            self.existing.remove(ob.uuid)
        await self.index_object(ob, full=full)
        self.processed += 1

        if IFolder.providedBy(ob):
            await self.process_folder(ob)

        if not IContainer.providedBy(ob):
            # free the annotation cache of the processed object
            del ob.__annotations__
        del ob

    async def index_object(self, ob, full=False):
        '''
        Queue one object into the bulk batch. Security-only reindexes use
        ISecurityInfo; `full` uses the complete catalog data; otherwise only
        the fields present in the type's mapping diff are refreshed.
        '''
        batch_type = 'update'
        if self.reindex_security:
            try:
                data = ISecurityInfo(ob)()
            except TypeError:
                self.response.write(f'could not index {ob}')
                return
        elif full or self.full:
            try:
                data = await ICatalogDataAdapter(ob)()
            except TypeError:
                self.response.write(f'could not index {ob}')
                return
            batch_type = 'index'
        else:
            if ob.type_name not in self.mapping_diff:
                # no fields change, ignore this guy...
                if self.log_details:
                    self.response.write(
                        f'({self.processed} {int(self.per_sec())}) '
                        f'(skipped) Object: {get_content_path(ob)}, '
                        f'Type: {batch_type}, Buffer: {len(self.batch)}')
                return
            data = {
                'type_name': ob.type_name  # always need this one...
            }
            for index_name in self.mapping_diff[ob.type_name].keys():
                val = await self.indexer.get_value(ob, index_name)
                if val is not None:
                    data[index_name] = val

        if ob._p_serial:
            data['tid'] = ob._p_serial

        self.indexed += 1
        self.batch[ob.uuid] = {'action': batch_type, 'data': data}

        if self.log_details:
            self.response.write(
                f'({self.processed} {int(self.per_sec())}) '
                f'Object: {get_content_path(ob)}, '
                f'Type: {batch_type}, Buffer: {len(self.batch)}')

        await self.attempt_flush()

    async def attempt_flush(self):
        '''
        Periodic housekeeping (every 500 objects: cache invalidation, gc,
        optional memory report) plus a bulk flush once the batch reaches
        `bulk_size`.
        '''
        if self.processed % 500 == 0:
            self.interaction.invalidate_cache()
            num, _, _ = gc.get_count()
            gc.collect()
            if self.memory_tracking:
                total_memory = round(
                    resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0, 1)  # noqa
                self.response.write(
                    b'Memory usage: % 2.2f MB, cleaned: %d, total in-memory obs: %d' % (  # noqa
                        total_memory, num, len(gc.get_objects())))
            self.response.write(b'Indexing new batch, totals: (%d %d/sec)\n' % (  # noqa
                self.indexed, int(self.per_sec()),
            ))
        if len(self.batch) >= self.bulk_size:
            await notify(IndexProgress(
                self.request, self.context, self.processed,
                (len(self.existing) + len(self.missing))))
            await self.flush()

    async def join_futures(self):
        # wait for all in-flight bulk requests to finish
        for future in self.reindex_futures:
            if not future.done():
                await asyncio.wait_for(future, None)
        self.reindex_futures = []

    async def _index_batch(self, batch):
        '''
        Send one batch through the ES bulk API. Item-level 409/429 errors
        are re-queued for retry; all non-200 item statuses are logged.
        '''
        bulk_data = []
        for _id, payload in batch.items():
            doc_type = payload['data']['type_name']
            action_data = {
                '_index': self.work_index_name,
                '_type': doc_type,
                '_id': _id
            }
            data = payload['data']
            if payload['action'] == 'update':
                data = {'doc': data}
                action_data['_retry_on_conflict'] = 3
            bulk_data.append({
                payload['action']: action_data
            })
            if payload['action'] != 'delete':
                bulk_data.append(data)
        results = await self.utility.conn.bulk(
            index=self.work_index_name, doc_type=None, body=bulk_data)

        if results['errors']:
            errors = []
            for result in results['items']:
                for key, value in result.items():
                    if not isinstance(value, dict):
                        continue
                    if 'status' in value and value['status'] != 200:
                        _id = value.get('_id')
                        errors.append(f'{_id}: {value["status"]}')
                        # retry conflict errors and thread pool capacity limits
                        if value['status'] in (409, 429):
                            self.batch[_id] = batch[_id]
            logger.warning(f'Error bulk putting: {results}')

    async def flush(self):
        '''
        Fire the current batch as a background bulk request; cap the number
        of concurrent in-flight requests.
        '''
        if len(self.batch) == 0:
            # nothing to flush
            return
        future = asyncio.ensure_future(self._index_batch(self.batch))
        self.batch = {}
        self.reindex_futures.append(future)
        if len(self.reindex_futures) > 7:
            await self.join_futures()

    async def check_existing(self):
        '''
        Go through self.existing and see why it wasn't processed
        '''
        for uuid in self.existing:
            try:
                ob = await self.context._p_jar.get(uuid)
            except KeyError:
                ob = None
            if ob is None:
                # this is dumb... since we don't have the doc type on us, we
                # need to ask elasticsearch for it again...
                # elasticsearch does not allow deleting without the doc type
                # even though you can query for a doc without it... argh
                try:
                    doc = await self.conn.get(self.work_index_name, uuid,
                                              _source=False)
                    self.batch[uuid] = {
                        'action': 'delete',
                        'data': {
                            'type_name': doc['_type']
                        }
                    }
                    await self.attempt_flush()
                    # no longer present on db, this was orphaned
                    self.orphaned.append(uuid)
                except aioes.exception.NotFoundError:
                    # it was deleted in the meantime so we're actually okay
                    pass
            else:
                # XXX this should not happen so log it. Maybe we'll try
                # doing something about it another time...
                self.errors.append({
                    'type': 'unprocessed',
                    'uuid': uuid
                })
                # we re-query es to get full path of ob
                # doc = self.utility.conn.get(
                #     await self.utility.get_index_name(), fields='path'
                # )
                # ob = await traverse(self.request, self.container,
                #                     doc['_source']['path'].strip('/').split('/'))
                # await self.index_object(ob, full=True)

    async def setup_next_index(self):
        '''
        Disable any previous next-index, create the new versioned index,
        install mappings on it and activate it for dual writes.
        '''
        self.response.write(b'Creating new index')
        async with managed_transaction(
                self.request, write=True, adopt_parent_txn=True):
            await self.utility.disable_next_index(self.context,
                                                  request=self.request)
        async with managed_transaction(
                self.request, write=True, adopt_parent_txn=True):
            self.next_index_version, self.work_index_name = \
                await self.create_next_index()
            await self.utility.install_mappings_on_index(self.work_index_name)
            await self.utility.activate_next_index(
                self.container, self.next_index_version,
                request=self.request, force=self.force)

    async def cancel_migration(self):
        '''
        Abort an in-progress migration: disable the next index, cancel any
        running ES `_reindex` task and delete the work index.
        '''
        # canceling the migration, clearing index
        self.response.write('Canceling migration')
        async with managed_transaction(
                self.request, write=True, adopt_parent_txn=True):
            await self.utility.disable_next_index(self.context,
                                                  request=self.request)
            self.response.write('Next index disabled')
        if self.active_task_id is not None:
            self.response.write('Canceling copy of index task')
            conn_es = await self.conn.transport.get_connection()
            async with conn_es._session.post(
                    str(conn_es._base_url) + '_tasks/' +
                    self.active_task_id + '/_cancel'):
                # BUGFIX: `asyncio.sleep(5)` was previously called without
                # `await`, so the coroutine was never run — no delay happened
                # before tearing down the work index. Give ES a moment to act
                # on the cancellation.
                await asyncio.sleep(5)
        if self.work_index_name:
            self.response.write('Deleting new index')
            await self.conn.indices.delete(self.work_index_name)

        self.response.write('Migration canceled')

    async def run_migration(self):
        '''
        Execute the full live-reindex flow documented on the class: set up
        the next index, copy/crawl data into it, swap the alias and drop
        the old index.
        '''
        alias_index_name = await self.utility.get_index_name(
            self.container, request=self.request)
        existing_index = await self.utility.get_real_index_name(
            self.container, request=self.request)

        await self.setup_next_index()

        self.mapping_diff = await self.calculate_mapping_diff()
        diff = json.dumps(self.mapping_diff, sort_keys=True, indent=4,
                          separators=(',', ': '))
        self.response.write(f'Caculated mapping diff: {diff}')

        if not self.full:
            # if full, we're reindexing everything does not matter what
            # anyways, so skip
            self.response.write(
                'Copying initial index data from existing index into new')
            await self.copy_to_next_index()
            self.response.write('Copying initial index data finished')

        if not self.mapping_only:
            self.existing = await self.get_all_uids()

            self.index_start_time = time.time()
            await self.process_object(self.context)  # this is recursive

            await self.check_existing()

            await self.flush()
            await self.join_futures()

        async with self.utility._migration_lock:
            self.response.write('Activating new index')
            async with managed_transaction(
                    self.request, write=True, adopt_parent_txn=True):
                await self.utility.apply_next_index(self.container,
                                                    self.request)
            self.status = 'done'

            self.response.write(f'''Update alias({alias_index_name}):
{existing_index} -> {self.work_index_name}
''')
            await self.conn.indices.update_aliases({
                "actions": [{
                    "remove": {
                        "alias": alias_index_name,
                        "index": existing_index
                    }
                }, {
                    "add": {
                        "alias": alias_index_name,
                        "index": self.work_index_name
                    }
                }]
            })

        self.response.write('Delete old index')
        await self.conn.indices.close(existing_index)
        await self.conn.indices.delete(existing_index)