async def get_page_from_tid(self):
    conn = await self.txn.get_connection()
    clear_conn_statement_cache(conn)
    records = []
    queried_tid = self.last_tid
    async with self.txn._lock:
        start = timeit.default_timer()
        results = await conn.fetch(GET_OBS_BY_TID, queried_tid,
                                   timeout=TIMEOUT_PERIOD)
        duration = timeit.default_timer() - start
        logger.warning(
            f'----Duration to fetch Objects by TID {duration:.2f}s')
    for record in results:
        # skip the root, the trash and the container object itself
        if record['zoid'] in (ROOT_ID, TRASHED_ID, self.container._p_oid):
            continue
        records.append(record)
        self.last_tid = record['tid']
        self.last_zoid = record['zoid']
    if len(records) == 0:
        if len(self.last_result_set) > 0:
            # the previous page was the last one with results: bump the
            # tid once so we do not re-query the same page forever
            self.last_tid += 1
            logger.warning('Incremented last tid by one')
    self.last_result_set = records
    return records

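# The SQL constants used above are defined elsewhere in the module. A
# plausible shape for GET_OBS_BY_TID, assuming the standard guillotina
# objects table, is sketched below; the column list, ordering and page
# size are assumptions, not the module's actual query.
GET_OBS_BY_TID = """
SELECT zoid, tid
FROM objects
WHERE tid > $1
ORDER BY tid ASC, zoid ASC
LIMIT 1000
"""
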
async def process_object(self, ob):
    '''
    - check if doc does not exist
        - record it
        - do complete index
    - if doc exists
        - if diff mapping exists
            - update fields in diff on doc
        - else, do nothing
    - remove from list of existing doc ids
    '''
    clear_conn_statement_cache(await ob._p_jar.get_connection())
    full = False
    if ob.uuid not in self.existing:
        self.missing.append(ob.uuid)
        full = True
    else:
        self.existing.remove(ob.uuid)
    await self.index_object(ob, full=full)
    self.processed += 1
    if IFolder.providedBy(ob):
        await self.process_folder(ob)
    if not IContainer.providedBy(ob):
        # free memory as we go; containers stay alive for the walk
        del ob.__annotations__
    del ob

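# clear_conn_statement_cache is called before every raw fetch in this
# module. A minimal sketch, assuming an asyncpg connection whose private
# prepared-statement cache should be dropped (the attribute names are
# asyncpg internals and may change between versions):
def clear_conn_statement_cache(conn):
    try:
        conn._con._stmt_cache.clear()  # pooled connection proxy
    except AttributeError:
        try:
            conn._stmt_cache.clear()  # bare asyncpg connection
        except AttributeError:
            pass
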
async def get_db_page_of_keys(self, oids, page=1, page_size=PAGE_SIZE):
    conn = await self.txn.get_connection()
    clear_conn_statement_cache(conn)
    async with self.txn._lock:
        return await conn.fetch(
            BATCHED_GET_CHILDREN_BY_PARENT, oids, page_size,
            (page - 1) * page_size, timeout=TIMEOUT_PERIOD)

async def get_db_page_of_keys(self, oids, page=1, page_size=PAGE_SIZE):
    conn = await self.txn.get_connection()
    clear_conn_statement_cache(conn)
    keys = []
    async with self.txn._lock:
        for record in await conn.fetch(
                BATCHED_GET_CHILDREN_BY_PARENT, oids, page_size,
                (page - 1) * page_size):
            keys.append(record['zoid'])
    return keys

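# Both variants page through the children of a batch of parents with
# LIMIT/OFFSET. A plausible shape for BATCHED_GET_CHILDREN_BY_PARENT,
# assuming the guillotina objects table and an array parameter of parent
# oids (again an assumption, not the module's actual query):
BATCHED_GET_CHILDREN_BY_PARENT = """
SELECT zoid
FROM objects
WHERE parent_id = ANY($1)
ORDER BY zoid
LIMIT $2 OFFSET $3
"""
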
async def check_orphans(self):
    logger.warning(
        f'Checking orphans on container {self.container.id}',
        extra={'account': self.container.id})
    conn = await self.txn.get_connection()
    checked = 0
    async for es_batch, index_name in self.iter_batched_es_keys():
        checked += len(es_batch)
        clear_conn_statement_cache(conn)
        async with self.txn._lock:
            records = await conn.fetch(SELECT_BY_KEYS, es_batch)
        db_batch = set()
        for record in records:
            db_batch.add(record['zoid'])
        orphaned = list(set(es_batch) - db_batch)
        if checked % 10000 == 0:
            logger.warning(f'Checked orphans: {checked}')
        if orphaned:
            # these are keys that are in ES but not in the DB,
            # so we should remove them
            self.orphaned |= set(orphaned)
            logger.warning(f'deleting orphaned {len(orphaned)}')
            conn_es = await self.utility.conn.transport.get_connection()
            # delete by query for the orphaned keys
            async with conn_es.session.post(
                    join(conn_es.base_url.human_repr(),
                         index_name, '_delete_by_query'),
                    headers={'Content-Type': 'application/json'},
                    data=json.dumps({'query': {
                        'terms': {
                            '_id': orphaned
                        }
                    }})) as resp:
                try:
                    data = await resp.json()
                    if data['deleted'] != len(orphaned):
                        logger.warning(
                            f'Was only able to clean up {data["deleted"]} '
                            f'instead of {len(orphaned)}')
                except Exception:
                    logger.warning(
                        'Could not parse delete by query response. '
                        'Vacuuming might not be working')

async def check_orphans(self):
    logger.warning(
        f'Checking orphans on container {self.container.id}',
        extra={'account': self.container.id})
    conn = await self.txn.get_connection()
    checked = 0
    async for es_batch in self.iter_batched_es_keys():
        checked += len(es_batch)
        clear_conn_statement_cache(conn)
        async with self.txn._lock:
            records = await conn.fetch(SELECT_BY_KEYS, es_batch,
                                       timeout=TIMEOUT_PERIOD)
        db_batch = set()
        for record in records:
            db_batch.add(record['zoid'])
        orphaned = list(set(es_batch) - db_batch)
        if checked % 10000 == 0:
            logger.warning(f'Checked orphans: {checked}')
        if orphaned:
            # these are keys that are in ES but not in the DB,
            # so we should remove them
            self.orphaned |= set(orphaned)
            logger.warning(f'deleting orphaned {len(orphaned)}')
            conn_es = await self.utility.conn.transport.get_connection()
            # delete by query for the orphaned keys
            async with conn_es._session.post(
                    '{}{}/_delete_by_query'.format(
                        conn_es._base_url.human_repr(), self.index_name),
                    data=json.dumps({'query': {
                        'terms': {
                            '_id': orphaned
                        }
                    }})) as resp:
                try:
                    data = await resp.json()
                    if data['deleted'] != len(orphaned):
                        logger.warning(
                            f'Was only able to clean up {data["deleted"]} '
                            f'instead of {len(orphaned)}')
                except Exception:
                    logger.warning(
                        'Could not parse delete by query response. '
                        'Vacuuming might not be working')

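# iter_batched_es_keys supplies the ES side of the comparison above. A
# minimal sketch, assuming an elasticsearch-py style async client on
# self.utility.conn and a scroll over document ids only; the batch size
# and scroll window are assumptions, and the variant that unpacks
# (es_batch, index_name) would yield pairs instead:
async def iter_batched_es_keys(self):
    result = await self.utility.conn.search(
        index=self.index_name, scroll='15m', size=1000, _source=False)
    while result['hits']['hits']:
        yield [hit['_id'] for hit in result['hits']['hits']]
        result = await self.utility.conn.scroll(
            scroll_id=result['_scroll_id'], scroll='15m')
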
async def get_page_from_tid(self):
    conn = await self.txn.get_connection()
    clear_conn_statement_cache(conn)
    keys = []
    queried_tid = self.last_tid
    async with self.txn._lock:
        records = await conn.fetch(GET_OBS_BY_TID, queried_tid)
    for record in records:
        if record['zoid'] in (ROOT_ID, TRASHED_ID, self.container._p_oid):
            continue
        keys.append(record['zoid'])
        self.last_tid = record['tid']
        self.last_zoid = record['zoid']
    if len(keys) == 0:
        if len(self.last_result_set) > 0:
            # now we have zero, increment, but only once
            self.last_tid = self.last_tid + 1
    self.last_result_set = keys
    return keys

async def iter_paged_db_keys(self, oids):
    if self.use_tid_query:
        queried_tid = self.last_tid
        records = await self.get_page_from_tid()
        while len(records) > 0:
            yield records
            if self.last_tid == queried_tid:
                conn = await self.txn.get_connection()
                logger.warning(
                    f'Getting all keys from tid {self.last_tid}')
                # we're stuck on the same tid: get all rows for this
                # tid and then move on...
                clear_conn_statement_cache(conn)
                results = await conn.fetch(GET_ALL_FOR_TID, self.last_tid,
                                           self.last_zoid)
                while len(results) > 0:
                    records = []
                    for record in results:
                        if record['zoid'] in (ROOT_ID, TRASHED_ID,
                                              self.container._p_oid):
                            continue
                        records.append(record)
                        self.last_zoid = record['zoid']
                    yield records
                    clear_conn_statement_cache(conn)
                    results = await conn.fetch(GET_ALL_FOR_TID,
                                               self.last_tid,
                                               self.last_zoid)
                self.last_tid = self.last_tid + 1
            queried_tid = self.last_tid
            records = await self.get_page_from_tid()
    else:
        page_num = 1
        page = await self.get_db_page_of_keys(oids, page_num)
        while page:
            yield page
            async for sub_page in self.iter_paged_db_keys(
                    [r['zoid'] for r in page]):
                yield sub_page
            page_num += 1
            page = await self.get_db_page_of_keys(oids, page_num)

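# GET_ALL_FOR_TID drains a single transaction id with keyset pagination
# on zoid, so a tid holding more rows than one page can return does not
# stall the loop. A plausible shape, under the same table assumptions as
# the other sketched queries:
GET_ALL_FOR_TID = """
SELECT zoid
FROM objects
WHERE tid = $1 AND zoid > $2
ORDER BY zoid ASC
LIMIT 1000
"""
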
async def get_object(self, oid):
    if oid in self.cache:
        return self.cache[oid]
    # walk the caches from hottest to coldest before hitting storage
    try:
        result = self.txn._manager._hard_cache.get(oid, None)
    except AttributeError:
        from guillotina.db.transaction import HARD_CACHE  # pylint: disable=E0611
        result = HARD_CACHE.get(oid, None)
    if result is None:
        clear_conn_statement_cache(await self.txn.get_connection())
        result = await self.txn._cache.get(oid=oid)
    if result is None:
        result = await self.tm._storage.load(self.txn, oid)
    obj = reader(result)
    obj._p_jar = self.txn
    # recursively rebuild the parent chain from the stored parent_id
    if result['parent_id']:
        obj.__parent__ = await self.get_object(result['parent_id'])
    return obj

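# A minimal sketch of how a caller might lean on get_object to resolve a
# full content path, assuming a hypothetical path_of helper (not part of
# the module) and guillotina's __name__/__parent__ conventions:
async def path_of(self, oid):
    parts = []
    ob = await self.get_object(oid)
    while ob is not None and getattr(ob, '__name__', None):
        parts.append(ob.__name__)
        ob = getattr(ob, '__parent__', None)
    return '/' + '/'.join(reversed(parts))
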
async def check_orphans(self):
    logger.warning(
        f'Checking orphans on container {self.container.id}',
        extra={'account': self.container.id})
    conn = await self.txn.get_connection()
    checked = 0
    async for es_batch in self.iter_batched_es_keys():
        checked += len(es_batch)
        clear_conn_statement_cache(conn)
        async with self.txn._lock:
            records = await conn.fetch(SELECT_BY_KEYS, es_batch)
        db_batch = set()
        for record in records:
            db_batch.add(record['zoid'])
        orphaned = list(set(es_batch) - db_batch)
        if checked % 10000 == 0:
            logger.warning(f'Checked orphans: {checked}')
        if orphaned:
            # these are keys that are in ES but not in the DB,
            # so we should remove them
            self.orphaned.extend(orphaned)
            logger.warning(f'deleting orphaned {len(orphaned)}')
            conn_es = await self.utility.conn.transport.get_connection()
            # delete by query for the orphaned keys; this variant fires
            # and forgets without inspecting the response
            async with conn_es.session.post(
                    join(conn_es.base_url.human_repr(),
                         self.index_name, '_delete_by_query'),
                    headers={'Content-Type': 'application/json'},
                    data=json.dumps({'query': {
                        'terms': {
                            'uuid': orphaned
                        }
                    }})):
                pass

async def iter_paged_db_keys(self, oids):
    if self.use_tid_query:
        queried_tid = self.last_tid
        records = await self.get_page_from_tid()
        retry = False
        while len(records) > 0 or retry:
            try:
                yield records
                retry = False
                if self.last_tid == queried_tid:
                    conn = await self.txn.get_connection()
                    logger.warning(
                        f'Getting all keys for tid {self.last_tid}')
                    # we're stuck on the same tid: get all rows for
                    # this tid and then move on...
                    clear_conn_statement_cache(conn)
                    start = timeit.default_timer()
                    results = await conn.fetch(GET_ALL_FOR_TID,
                                               self.last_tid,
                                               self.last_zoid,
                                               timeout=TIMEOUT_PERIOD)
                    duration = timeit.default_timer() - start
                    logger.warning(f'Got all for TID in {duration:.2f}s')
                    while len(results) > 0:
                        records = []
                        for record in results:
                            if record['zoid'] in (ROOT_ID, TRASHED_ID,
                                                  self.container._p_oid):
                                continue
                            records.append(record)
                            self.last_zoid = record['zoid']
                        yield records
                        clear_conn_statement_cache(conn)
                        start = timeit.default_timer()
                        results = await conn.fetch(GET_ALL_FOR_TID,
                                                   self.last_tid,
                                                   self.last_zoid,
                                                   timeout=TIMEOUT_PERIOD)
                        duration = timeit.default_timer() - start
                        logger.warning(
                            f'More results - Got all for TID in '
                            f'{duration:.2f}s')
                    self.last_tid = self.last_tid + 1
                queried_tid = self.last_tid
                records = await self.get_page_from_tid()
            except Exception:
                logger.error('Could not get keys for tid, retrying...',
                             exc_info=True)
                retry = True
    else:
        page_num = 1
        page = await self.get_db_page_of_keys(oids, page_num)
        while page:
            yield page
            async for sub_page in self.iter_paged_db_keys(
                    [r['zoid'] for r in page]):
                yield sub_page
            page_num += 1
            page = await self.get_db_page_of_keys(oids, page_num)

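# A minimal driver sketch for the generator above, assuming a
# hypothetical reindex loop (reindex_all is not part of the module) and
# the record-returning variants of the pagination helpers:
async def reindex_all(self):
    async for page in self.iter_paged_db_keys([self.container._p_oid]):
        for record in page:
            ob = await self.get_object(record['zoid'])
            await self.process_object(ob)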