def database_update_cycle(database_filename):
    """Perform a single cycle of the database update loop."""
    with start_timer() as timer:

        # Check OEIS server for highest entry ID.
        highest_oeis_id = find_highest_oeis_id()

        with close_when_done(sqlite3.connect(database_filename)) as dbconn:
            ensure_database_schema_created(dbconn)
            make_database_complete(dbconn, highest_oeis_id)                      # Make sure we have all entries (full fetch on first run).
            update_database_entries_randomly(dbconn, highest_oeis_id // 1000)    # Refresh 0.1 % of entries randomly.
            update_database_entries_by_priority(dbconn, highest_oeis_id // 200)  # Refresh 0.5 % of entries by priority.
            update_database_entries_for_nonzero_time_window(dbconn)              # Make sure we have t1 != t2 for all entries (full fetch on first run).

        consolidate_database_monthly(database_filename, remove_stale_files_flag=False)

        logger.info("Full database update cycle took {}.".format(timer.duration_string()))
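# The helper context managers used throughout this section (close_when_done() and
# start_timer()) are defined elsewhere in the project. As a minimal sketch, assuming
# close_when_done() merely closes its argument on exit and start_timer() yields an
# object exposing a duration_string() method (the bodies below are illustrative
# guesses, not the project's actual implementation):

import contextlib
import time

@contextlib.contextmanager
def close_when_done(resource):
    # Yield a closable resource (e.g., a sqlite3 connection or cursor) and close it on exit.
    try:
        yield resource
    finally:
        resource.close()

class _ElapsedTimer:
    # Hypothetical helper class; only the duration_string() method is assumed by the callers above.
    def __init__(self):
        self._t0 = time.monotonic()
    def duration_string(self):
        return "{:.3f} seconds".format(time.monotonic() - self._t0)

@contextlib.contextmanager
def start_timer():
    yield _ElapsedTimer()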
def update_database_entries_by_priority(dbconn, howmany):
    """Re-fetch entries that are old, relative to their stability.

    For each entry, a priority is determined, as follows:

    The _age_ is defined as the number of seconds ago that the entry was last fetched in its current state.
    The _stability_ is defined as the number of seconds between the first and last fetches in the current state.
    The _priority_ is the _age_ divided by the _stability_.

    A high priority indicates that the entry is old and/or unstable. Such entries are fetched
    in preference to entries that are recent and/or stable (and have a lower priority).
    """
    t_current = time.time()

    with close_when_done(dbconn.cursor()) as dbcursor:
        query = "SELECT oeis_id FROM oeis_entries ORDER BY (? - t2) / max(t2 - t1, 1e-6) DESC LIMIT ?;"
        dbcursor.execute(query, (t_current, howmany))
        highest_priority_entries = dbcursor.fetchall()

    highest_priority_entries = [oeis_id for (oeis_id, ) in highest_priority_entries]

    logger.info("Highest-priority entries in local database selected for refresh: {}.".format(len(highest_priority_entries)))

    fetch_entries_into_database(dbconn, highest_priority_entries)
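# For reference, the priority used in the ORDER BY clause above is just the formula from
# the docstring: age divided by stability, with a 1e-6 floor on the stability to avoid
# division by zero. A plain-Python restatement (illustrative only, not used by the
# update cycle itself):

def entry_priority(t1, t2, t_current):
    age = t_current - t2             # seconds since the last fetch of the current content.
    stability = max(t2 - t1, 1e-6)   # seconds between the first and last fetch of that content.
    return age / stability

# Example: an entry first fetched 10 days ago and last confirmed a day ago has priority
# (1 day) / (9 days) ~= 0.11, whereas an entry whose two fetches were an hour apart but
# whose last fetch was a day ago has priority (24 hours) / (1 hour) = 24, so the latter
# is refreshed first.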
def process_responses(dbconn, responses):
    """Process a batch of responses by updating the local SQLite database.

    A logging message is produced that summarizes how the batch of responses was processed.

    This function returns a set of OEIS IDs that have been successfully processed.
    """
    countFailures         = 0
    countNewEntries       = 0
    countIdenticalEntries = 0
    countUpdatedEntries   = 0

    processed_entries = set()

    with close_when_done(dbconn.cursor()) as dbcursor:
        for response in responses:

            if response is None:
                # Skip entries that are not okay.
                # Do not record the failures in the processed_entries set.
                countFailures += 1
                continue

            query = "SELECT main_content, bfile_content FROM oeis_entries WHERE oeis_id = ?;"
            dbcursor.execute(query, (response.oeis_id, ))

            previous_content = dbcursor.fetchall()

            assert len(previous_content) <= 1
            previous_content = None if len(previous_content) == 0 else previous_content[0]

            if previous_content is None:
                # The oeis_id does not occur in the database yet.
                # We will insert it as a new entry.
                query = "INSERT INTO oeis_entries(oeis_id, t1, t2, main_content, bfile_content) VALUES (?, ?, ?, ?, ?);"
                dbcursor.execute(query, (response.oeis_id, response.timestamp, response.timestamp, response.main_content, response.bfile_content))
                countNewEntries += 1
            elif previous_content != (response.main_content, response.bfile_content):
                # The database content is stale.
                # Update t1, t2, and content.
                query = "UPDATE oeis_entries SET t1 = ?, t2 = ?, main_content = ?, bfile_content = ? WHERE oeis_id = ?;"
                dbcursor.execute(query, (response.timestamp, response.timestamp, response.main_content, response.bfile_content, response.oeis_id))
                countUpdatedEntries += 1
            else:
                # The database content is identical to the freshly fetched content.
                # We will just update the t2 field, indicating the fresh fetch.
                query = "UPDATE oeis_entries SET t2 = ? WHERE oeis_id = ?;"
                dbcursor.execute(query, (response.timestamp, response.oeis_id))
                countIdenticalEntries += 1

            processed_entries.add(response.oeis_id)

    dbconn.commit()

    logger.info("Processed {} responses (failures: {}, new: {}, identical: {}, updated: {}).".format(len(responses), countFailures, countNewEntries, countIdenticalEntries, countUpdatedEntries))

    return processed_entries
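# ensure_database_schema_created() (called from database_update_cycle()) is not shown in
# this section. Judging from the queries above, the 'oeis_entries' table stores the entry
# ID, the first and most recent fetch times of the current content (t1, t2), and the raw
# main/b-file content. A sketch that is consistent with those queries; the real schema's
# column types, constraints, and indexes may differ:

def ensure_database_schema_created(dbconn):
    schema = """
        CREATE TABLE IF NOT EXISTS oeis_entries (
            oeis_id       INTEGER PRIMARY KEY, -- OEIS sequence number (the A-number without the 'A').
            t1            REAL NOT NULL,       -- first fetch time of the current content (Unix time).
            t2            REAL NOT NULL,       -- most recent fetch time of the current content (Unix time).
            main_content  TEXT NOT NULL,       -- entry text as fetched from the OEIS server.
            bfile_content TEXT NOT NULL        -- b-file text as fetched from the OEIS server.
        );
    """
    dbconn.execute(schema)
    dbconn.commit()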
def process_database_entries(database_filename_in, dirname_out):

    if not os.path.exists(database_filename_in):
        logger.critical("Database file '{}' not found! Unable to continue.".format(database_filename_in))
        return

    (root, ext) = os.path.splitext(database_filename_in)

    if os.path.exists(dirname_out):
        logger.info("Removing stale directory '{}' ...".format(dirname_out))
        shutil.rmtree(dirname_out)

    # ========== fetch and process database entries, ordered by oeis_id.

    BATCH_SIZE = 1000

    with start_timer() as timer:
        with close_when_done(sqlite3.connect(database_filename_in)) as dbconn_in, close_when_done(dbconn_in.cursor()) as dbcursor_in:

            dbcursor_in.execute("SELECT oeis_id, t1, t2, main_content, bfile_content FROM oeis_entries ORDER BY oeis_id;")

            while True:

                oeis_entries = dbcursor_in.fetchmany(BATCH_SIZE)
                if len(oeis_entries) == 0:
                    break

                logger.log(logging.PROGRESS, "Processing OEIS entries A{:06} to A{:06} ...".format(oeis_entries[0][0], oeis_entries[-1][0]))

                for (oeis_id, t1, t2, main_content, bfile_content) in oeis_entries:

                    directory = os.path.join(dirname_out, "A{:03d}xxx".format(oeis_id // 1000), "A{:06d}".format(oeis_id))
                    os.makedirs(directory)

                    with open(os.path.join(directory, "metadata.json"), "w") as f:
                        metadata = [oeis_id, t1, t2]
                        json.dump(metadata, f)

                    with open(os.path.join(directory, "main_content.txt"), "w") as f:
                        f.write(main_content)

                    with open(os.path.join(directory, "bfile_content.txt"), "w") as f:
                        f.write(bfile_content)

        logger.info("Processed all database entries in {}.".format(timer.duration_string()))
def process_database_entries(database_filename_in):

    if not os.path.exists(database_filename_in):
        logger.critical("Database file '{}' not found! Unable to continue.".format(database_filename_in))
        return

    (root, ext) = os.path.splitext(database_filename_in)

    database_filename_out = root + "_parsed" + ext

    if os.path.exists(database_filename_out):
        logger.info("Removing stale file '{}' ...".format(database_filename_out))
        os.remove(database_filename_out)

    # ========== fetch and process database entries, ordered by oeis_id.

    BATCH_SIZE = 1000

    with start_timer() as timer:
        with close_when_done(sqlite3.connect(database_filename_in)) as dbconn_in, close_when_done(dbconn_in.cursor()) as dbcursor_in:
            with close_when_done(sqlite3.connect(database_filename_out)) as dbconn_out, close_when_done(dbconn_out.cursor()) as dbcursor_out:

                create_database_schema(dbconn_out)

                with concurrent.futures.ProcessPoolExecutor() as pool:

                    dbcursor_in.execute("SELECT oeis_id, main_content, bfile_content FROM oeis_entries ORDER BY oeis_id;")

                    while True:

                        oeis_entries = dbcursor_in.fetchmany(BATCH_SIZE)
                        if len(oeis_entries) == 0:
                            break

                        logger.log(logging.PROGRESS, "Processing OEIS entries A{:06} to A{:06} ...".format(oeis_entries[0][0], oeis_entries[-1][0]))

                        query = "INSERT INTO oeis_entries(oeis_id, identification, value_list, name, comments, detailed_references, links, formulas, examples, maple_programs, mathematica_programs, other_programs, cross_references, keywords, offset_a, offset_b, author, extensions_and_errors) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"

                        dbcursor_out.executemany(query, pool.map(process_oeis_entry, oeis_entries))
                        dbconn_out.commit()

        logger.info("Processed all database entries in {}.".format(timer.duration_string()))
def solve_linear_recurrences(database_filename_in, terms, exclude_entries = None):

    if not os.path.exists(database_filename_in):
        logger.critical("Database file '{}' not found! Unable to continue.".format(database_filename_in))
        return

    if exclude_entries is None:
        exclude_entries = frozenset()

    # ========== fetch and process database entries, ordered by oeis_id.

    BATCH_SIZE = 1000

    with start_timer() as timer:
        with close_when_done(sqlite3.connect(database_filename_in)) as dbconn_in, close_when_done(dbconn_in.cursor()) as dbcursor_in:
            with concurrent.futures.ProcessPoolExecutor() as pool:

                dbcursor_in.execute("SELECT oeis_id, main_content, bfile_content FROM oeis_entries ORDER BY oeis_id;")

                while True:

                    oeis_entries = dbcursor_in.fetchmany(BATCH_SIZE)
                    if len(oeis_entries) == 0:
                        break

                    logger.log(logging.PROGRESS, "Processing OEIS entries A{:06} to A{:06} ...".format(oeis_entries[0][0], oeis_entries[-1][0]))

                    work = [(oeis_id, main_content, bfile_content, terms) for (oeis_id, main_content, bfile_content) in oeis_entries if "A{:06d}".format(oeis_id) not in exclude_entries]

                    for (oeis_entry, solution) in pool.map(process_oeis_entry, work):
                        if solution is not None:
                            yield (str(oeis_entry), solution)

        logger.info("Processed all database entries in {}.".format(timer.duration_string()))
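# solve_linear_recurrences() is a generator, so results stream out while the database is
# scanned. A hypothetical driver (the function name, the 'terms' value, and the way
# results are consumed are illustrative, not taken from the project):

def example_scan_for_recurrences(database_filename):
    for (entry_name, solution) in solve_linear_recurrences(database_filename, terms=12):
        print("{}: {}".format(entry_name, solution))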
def update_database_entries_randomly(dbconn, howmany):
    """Re-fetch (update) a random subset of entries that are already present in the local SQLite database."""

    with close_when_done(dbconn.cursor()) as dbcursor:
        dbcursor.execute("SELECT oeis_id FROM oeis_entries;")
        present_entries = dbcursor.fetchall()

    present_entries = [oeis_id for (oeis_id, ) in present_entries]

    random_entries_count = min(howmany, len(present_entries))
    random_entries = random.sample(present_entries, random_entries_count)

    logger.info("Random entries in local database selected for refresh: {}.".format(len(random_entries)))

    fetch_entries_into_database(dbconn, random_entries)
def make_database_complete(dbconn, highest_oeis_id):
    """Fetch all entries from the remote OEIS database that are not yet present in the local SQLite database."""

    with close_when_done(dbconn.cursor()) as dbcursor:
        dbcursor.execute("SELECT oeis_id FROM oeis_entries;")
        present_entries = dbcursor.fetchall()

    present_entries = [oeis_id for (oeis_id, ) in present_entries]
    logger.info("Entries present in local database: {}.".format(len(present_entries)))

    all_entries = range(1, highest_oeis_id + 1)

    missing_entries = set(all_entries) - set(present_entries)
    logger.info("Missing entries to be fetched: {}.".format(len(missing_entries)))

    fetch_entries_into_database(dbconn, missing_entries)
def consolidate_database_monthly(database_filename, remove_stale_files_flag):
    """Make a consolidated version of the database, once per month.

    The consolidated version will have a standardized filename 'oeis_vYYYYMMDD.sqlite3.xz'.

    If this filename already exists, we return immediately.

    If not, we vacuum the database, and compress its file. This process takes ~ 2 hours on a fast desktop PC.

    When the compressed database is written, and remove_stale_files_flag is set, we remove all 'stale'
    consolidated files, i.e., all files that are called 'oeis_vYYYYMMDD.sqlite3.xz' except the one we just wrote.
    """
    now = datetime.datetime.now()

    if now.day != 1:
        return

    xz_filename = now.strftime("oeis_v%Y%m%d.sqlite3.xz")
    if os.path.exists(xz_filename):
        return  # file already exists.

    with start_timer() as timer:

        logger.info("Consolidating database to '{}' ...".format(xz_filename))

        # Vacuum the database.
        with close_when_done(sqlite3.connect(database_filename)) as dbconn:
            vacuum_database(dbconn)

        # Create the xz file.
        compress_file(database_filename, xz_filename)

        # Remove stale files.
        if remove_stale_files_flag:
            stale_files = [filename for filename in glob.glob("oeis_v????????.sqlite3.xz") if filename != xz_filename]
            for filename in stale_files:
                logger.info("Removing stale consolidated file '{}' ...".format(filename))
                os.remove(filename)

        logger.info("Consolidating data took {}.".format(timer.duration_string()))
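# compress_file() is likewise defined elsewhere in the project. Assuming it simply writes
# an xz-compressed copy of the database file, a minimal stand-in using the standard-library
# lzma module could look like this (streamed via shutil so the database file is never read
# into memory at once):

import lzma
import shutil

def compress_file(filename_in, filename_out):
    with open(filename_in, "rb") as f_in, lzma.open(filename_out, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)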
def update_database_entries_for_nonzero_time_window(dbconn):
    """Re-fetch entries in the database that have a 0-second time window.

    These are entries that have been fetched only once.
    """
    # Keep looping until no entries with a zero-second time window remain; entries whose
    # re-fetch fails in a given round show up again in the next round.
    while True:

        with close_when_done(dbconn.cursor()) as dbcursor:
            dbcursor.execute("SELECT oeis_id FROM oeis_entries WHERE t1 = t2;")
            zero_timewindow_entries = dbcursor.fetchall()

        if len(zero_timewindow_entries) == 0:
            break  # no zero-timewindow entries.

        zero_timewindow_entries = [oeis_id for (oeis_id, ) in zero_timewindow_entries]

        logger.info("Entries with zero time window in local database selected for refresh: {}.".format(len(zero_timewindow_entries)))

        fetch_entries_into_database(dbconn, zero_timewindow_entries)
def show_entries(database_filename):

    if not os.path.exists(database_filename):
        logger.critical("Database file '{}' not found! Unable to continue.".format(database_filename))
        return

    t_current = time.time()

    with close_when_done(sqlite3.connect(database_filename)) as dbconn, close_when_done(dbconn.cursor()) as dbcursor:
        query = "SELECT oeis_id, t1, t2 FROM oeis_entries;"
        dbcursor.execute(query)
        data = dbcursor.fetchall()

    data_dtype = np.dtype([
        ("oeis_id", np.int64),
        ("t1",      np.float64),
        ("t2",      np.float64)
    ])

    data = np.array(data, dtype = data_dtype)

    t1      = data["t1"]
    t2      = data["t2"]
    oeis_id = data["oeis_id"]

    age       = t_current - t2
    stability = t2 - t1
    priority  = age / np.maximum(stability, 1e-6)

    plt.subplot(331)
    plt.xlabel("oeis id")
    plt.ylabel("t1 [h]")
    plt.plot(oeis_id, (t1 - t_current) / 3600.0, '.', markersize = 0.5)

    plt.subplot(332)
    plt.xlabel("oeis id")
    plt.ylabel("t2 [h]")
    plt.plot(oeis_id, (t2 - t_current) / 3600.0, '.', markersize = 0.5)

    plt.subplot(334)
    plt.hist(age / 3600.0, bins = 200, log = True)
    plt.xlabel("age [h]")

    plt.subplot(335)
    plt.xlabel("stability [h]")
    plt.ylabel("age [h]")
    plt.plot(stability / 3600.0, age / 3600.0, '.', markersize = 0.5)

    plt.subplot(336)
    #bins = np.logspace(-10.0, +20.0, 200)
    #plt.hist(np.log10(priority), bins = bins, log = True)
    RANGE_MAX = 0.15
    plt.hist(priority, range = (0, RANGE_MAX), bins = 200, log = True)
    plt.xlabel("priority ({} values > {:.3f})".format(np.sum(priority > RANGE_MAX), RANGE_MAX))

    plt.subplot(338)
    plt.hist(stability / 3600.0, bins = 200, log = True)
    plt.xlabel("stability [h]")

    plt.show()