def test_delete_stale_objects_on_save(self):
    # A row we know is stale (we saw a newer TID for the OID but no
    # longer hold any state for it) must be deleted from the sqlite
    # database when the cache is written out.
    from relstorage.cache.local_database import Database
    from relstorage.cache.persistence import sqlite_connect

    cache = self._makeOne(cache_local_dir=":memory:")
    connection = sqlite_connect(cache.options, 'ignored-pfx', close_async=False)
    self.addCleanup(connection.close)

    database = Database.from_connection(connection)
    database.store_temp([(0, 0, b'state', 0)])
    database.move_from_temp()
    self.assertEqual(dict(database.oid_to_tid), {0: 0})
    connection.commit()

    # Pretend we loaded this from the db
    cache[(0, 0)] = (b'state', 0)
    cache._min_allowed_writeback[0] = 0

    # Pass a newer version through
    cache[(0, 1)] = (b'state', 1)
    self.assertEqual(cache._min_allowed_writeback[0], 1)

    # Evict it so we don't have it to write.
    del cache._bucket0[(0, 1)]

    # But it gets removed based on having seen it and knowing
    # it's there.
    connection._rs_has_closed = True  # Prevent closing by the write method
    try:
        cache.write_to_sqlite(connection)
    finally:
        connection._rs_has_closed = False
    self.assertEmpty(database.oid_to_tid)
def _makeOne(self):
    """
    Build the Database under test, patching ``store_temp`` so that
    4-tuple rows gain a ``frozen`` column (always 0) before being
    handed to the real implementation.
    """
    database = Database.from_connection(
        self.connection,
        use_upsert=self.USE_UPSERT,
    )
    original_store_temp = database.store_temp

    def store_temp(rows):
        # Splice a zero "frozen" marker in as the third column.
        widened = [(row[0], row[1], 0, row[2], row[3]) for row in rows]
        return original_store_temp(widened)

    database.real_store_temp = original_store_temp
    database.store_temp = store_temp
    return database
def remove_invalid_persistent_oids(self, bad_oids):
    """
    Remove data from the persistent cache for the given oids.
    """
    options = self.options
    if not options.cache_local_dir:
        # No persistent cache configured; nothing to do.
        return

    removed = 0
    # ``source`` is what we log at the end: either the live sqlite
    # connection, or a placeholder string when there was no work.
    source = '(no oids to remove)'
    if bad_oids:
        source = sqlite_connect(options, self.prefix, close_async=False)
        with closing(source):
            database = Database.from_connection(source)
            removed = database.remove_invalid_persistent_oids(bad_oids)
    logger.debug("Removed %d invalid OIDs from %s", removed, source)
def read_from_sqlite(self, connection):
    """
    Load rows from the sqlite database behind *connection* into this
    cache, stopping once the configured byte limit is reached.
    Returns the stored checkpoints.
    """
    import gc
    gc.collect()  # Free memory, we're about to make big allocations.
    memory_at_start = get_memory_usage()

    db = Database.from_connection(connection)
    checkpoints = db.checkpoints

    @_log_timed
    def fetch_and_filter_rows():
        # Done in a closure so the row list isn't pinned in a local
        # variable of the outer frame when we measure the memory delta.
        #
        # In large benchmarks, this function accounts for 57% of the
        # total load time: 26% fetching rows from sqlite and 18%
        # allocating storage for the blob state. We make one call into
        # sqlite and let it handle the iterating.
        byte_budget = self.limit
        bytes_taken = 0
        kept = []
        row_iter = db.fetch_rows_by_priority()
        for oid, frozen, state, actual_tid, frequency in row_iter:
            bytes_taken += len(state)
            if bytes_taken > byte_budget:
                # The row that overflows the budget is NOT kept.
                break
            kept.append((oid, (state, actual_tid, frozen, frequency)))
        # Exhaust whatever the iterator still holds. We always freeze
        # all rows, so a key_tid of -1 (previously frozen) vs. the
        # actual tid makes no difference to us.
        consume(row_iter)
        # Rows came to us MRU to LRU, but we need to feed them the
        # other way.
        kept.reverse()
        return kept

    # In the large benchmark, this is 25% of the total time;
    # 18% of the total time is preallocating the entry nodes.
    self._bulk_update(fetch_and_filter_rows(),
                      source=connection,
                      mem_usage_before=memory_at_start)
    return checkpoints
def _makeOne(self, conn=None):
    """Create a Database, defaulting to the fixture's connection."""
    # Note: falsy values of *conn* also fall back to self.connection.
    target = conn or self.connection
    return Database.from_connection(target)
def _makeOne(self):
    """
    Create the Database under test from the fixture connection,
    honoring the class-level upsert setting.
    """
    kwargs = {'use_upsert': self.USE_UPSERT}
    return Database.from_connection(self.connection, **kwargs)
def write_to_sqlite(self, connection):
    """
    Write this cache's current contents to the sqlite database behind
    *connection*, trim the database to ``self.limit``, and close both
    the Database wrapper and the connection.

    Returns the number of items written.
    """
    # pylint:disable=too-many-locals
    mem_before = get_memory_usage()

    cur = connection.cursor()

    db = Database.from_connection(connection)
    begin = time.time()

    # In a large benchmark, store_temp() accounts for 32%
    # of the total time, while move_from_temp accounts for 49%.

    # The batch size depends on how many params a stored proc can
    # have; if we go too big we get OperationalError: too many SQL
    # variables. The default is 999.
    # Note that the multiple-value syntax was added in
    # 3.7.11, 2012-03-20.

    # Phase 1: snapshot the currently-stored (oid -> tid) map and
    # batch-insert our candidate rows into the temp table.
    with _timer() as batch_timer:
        cur.execute('BEGIN')
        stored_oid_tid = db.oid_to_tid
        fetch_current = time.time()
        count_written, _ = db.store_temp(
            self._items_to_write(stored_oid_tid))
        cur.execute("COMMIT")

    # Take out (reserved write) locks now; if we don't, we can get
    # 'OperationalError: database is locked' (the SQLITE_BUSY
    # error code; the SQLITE_LOCKED error code is about table
    # locks and has the string 'database table is locked') But
    # beginning immediate lets us stand in line.
    #
    # Note that the SQLITE_BUSY error on COMMIT happens "if an
    # another thread or process has a shared lock on the database
    # that prevented the database from being updated.", But "When
    # COMMIT fails in this way, the transaction remains active and
    # the COMMIT can be retried later after the reader has had a
    # chance to clear." So we could retry the commit a couple
    # times and give up if can't get there. It looks like one
    # production database takes about 2.5s to execute this entire function;
    # it seems like most instances that are shutting down get to this place
    # at roughly the same time and stack up waiting:
    #
    # Instance | Save Time | write_to_sqlite time
    #     1    |   2.36s   |   2.35s
    #     2    |   5.31s   |   5.30s
    #     3    |   6.51s   |   6.30s
    #     4    |   7.91s   |   7.90s
    #
    # That either suggests that all waiting happens just to acquire this lock and
    # commit this transaction (and that subsequent operations are inconsequential
    # in terms of time) OR that the exclusive lock isn't truly getting dropped,
    # OR that some function in subsequent processes is taking longer and longer.
    # And indeed, our logging showed that a gc.collect() at the end of this function was
    # taking more and more time:
    #
    # Instance | GC Time  | Reported memory usage after GC
    #    1     |   2.00s  |     34.4MB
    #    2     |   3.50s  |     83.2MB
    #    3     |   4.94s  |    125.9MB
    #    4     |   6.44   |    202.1MB
    #
    # The exclusive lock time did vary, but not as much; trim time
    # didn't really vary:
    #
    # Instance |  Exclusive Write Time | Batch Insert Time | Fetch Current
    #    1     |   0.09s               |   0.12s           |   0.008s
    #    2     |   1.19s               |   0.44s           |   0.008s
    #    3     |   0.91s               |   0.43s           |   0.026s
    #    4     |   0.92s               |   0.34s           |   0.026s

    # Phase 2: under an immediate (reserved-write) transaction, move
    # the temp rows into object_state and persist checkpoints.
    with _timer() as exclusive_timer:
        cur.execute('BEGIN IMMEDIATE')
        # During the time it took for us to commit, and the time that we
        # got the lock, it's possible that someone else committed and
        # changed the data in object_state
        # Things might have changed in the database since our
        # snapshot where we put temp objects in, so we can't rely on
        # just assuming that's the truth anymore.
        rows_inserted = db.move_from_temp()
        if self.checkpoints:
            db.update_checkpoints(*self.checkpoints)
        cur.execute('COMMIT')

    # TODO: Maybe use BTrees.family.intersection to get the common keys?
    # Or maybe use a SQLite Python function?
    # 'DELETE from object_state where is_stale(zoid, tid)'
    # What's fastest? The custom function version would require scanning the entire
    # table; unfortunately this sqlite interface doesn't allow defining virtual tables.

    # Phase 3: trim the database back down to the size limit, forcing
    # out rows we know are stale.
    with _timer() as trim_timer:
        # Delete anything we still know is stale, because we
        # saw a new TID for it, but we had nothing to replace it with.
        min_allowed_writeback = OID_TID_MAP_TYPE()
        for k, v in self._min_allowed_writeback.items():
            if stored_oid_tid.get(k, MAX_TID) < v:
                min_allowed_writeback[k] = v
        db.trim_to_size(self.limit, min_allowed_writeback)
        del min_allowed_writeback

    # Typically we write when we're closing and don't expect to do
    # anything else, so there's no need to keep tracking this info.
    self._min_allowed_writeback = OID_TID_MAP_TYPE()

    # Cleanups
    cur.close()
    del cur
    db.close()  # closes the connection.
    del db
    del stored_oid_tid

    # We're probably shutting down, don't perform a GC; we see
    # that can get quite lengthy.

    end = time.time()
    stats = self.stats()
    mem_after = get_memory_usage()
    logger.info(
        "Wrote %d items to %s in %s "
        "(%s to fetch current, %s to insert batch (%d rows), %s to write, %s to trim). "
        "Used %s memory. (Storage: %s) "
        "Total hits %s; misses %s; ratio %s (stores: %s)",
        count_written, connection, end - begin,
        fetch_current - begin,
        batch_timer.duration, rows_inserted,
        exclusive_timer.duration,
        trim_timer.duration,
        byte_display(mem_after - mem_before),
        self._bucket0,
        stats['hits'], stats['misses'], stats['ratio'], stats['sets'])

    return count_written
def read_from_sqlite(self, connection, row_filter=None):
    """
    Populate this cache from the sqlite database behind *connection*.

    If *row_filter* is given, it is called with ``(checkpoints, rows)``
    and must produce the sequence ``((oid, key_tid), (state, actual_tid))``.
    """
    import gc
    gc.collect()  # Free memory, we're about to make big allocations.
    memory_at_start = get_memory_usage()

    db = Database.from_connection(connection)
    checkpoints = db.checkpoints
    if checkpoints:
        self.store_checkpoints(*checkpoints)

    self._min_allowed_writeback = db.oid_to_tid

    # XXX: The cache_ring consumes every item we hand it, even those it
    # doesn't actually add, and preallocates a large array sized by the
    # `total_count` of items. To keep the delta maps small and avoid
    # that overhead we must give it an accurate count, and the easiest
    # way to do that is to materialize the list of rows rather than
    # stream a generator that stops at the limit (which would leave the
    # oversized preallocated array hanging around). In practice the
    # limit rarely changes between runs, so the database has already
    # been trimmed to fit and essentially all its rows belong in our
    # cache.
    @_log_timed
    def fetch_and_filter_rows():
        # Done in a closure so the row list isn't pinned in a local of
        # the outer frame before we measure the memory delta.
        #
        # In large benchmarks this accounts for 57% of the total load
        # time: 26% fetching rows from sqlite, 18% allocating blob
        # state. One call into sqlite; it handles the iterating.
        # Items are (oid, key_tid, state, actual_tid); currently
        # key_tid == actual_tid.
        raw_rows = db.list_rows_by_priority()
        if row_filter is None:
            # Split each row into ((oid, key_tid), (state, actual_tid)).
            pairs = [(row[:2], row[2:]) for row in raw_rows]
        else:
            # row_filter produces the sequence
            # ((oid, key_tid) (state, actual_tid))
            pairs = list(row_filter(checkpoints, raw_rows))
        # Rows are most-to-least recent; insert them the other way.
        pairs.reverse()
        return pairs

    # Tag the timed function's reported name with the filter in use.
    inner = fetch_and_filter_rows.__wrapped__
    label = '<no filter>' if row_filter is None else str(row_filter)
    inner.__name__ = inner.__name__ + ':' + label

    # In the large benchmark, this is 25% of the total time;
    # 18% of the total time is preallocating the entry nodes.
    self.__bucket.bulk_update(fetch_and_filter_rows(),
                              source=connection,
                              log_count=len(self._min_allowed_writeback),
                              mem_usage_before=memory_at_start)