def clean():
    expiry_intervals = {
        'weather': {
            'forecast': '3 hours',
            'current': '48 hours',
        },
        'synop': {
            'synop': '30 hours',
        },
    }
    parsed_files_expiry_intervals = {
        '%/Z__C_EDZW_%': '1 week',
    }
    logger.info('Deleting expired weather records: %s', expiry_intervals)
    with get_connection() as conn:
        with conn.cursor() as cur:
            for table, table_expires in expiry_intervals.items():
                for observation_type, interval in table_expires.items():
                    cur.execute(
                        f"""
                        DELETE FROM {table} WHERE
                            source_id IN (
                                SELECT id FROM sources
                                WHERE observation_type = %s) AND
                            timestamp < current_timestamp - %s::interval;
                        """,
                        (observation_type, interval),
                    )
                    conn.commit()
                    if cur.rowcount:
                        logger.info(
                            'Deleted %d outdated %s weather records from %s',
                            cur.rowcount, observation_type, table)
                # Re-derive each source's record range now that expired rows
                # are gone from this table.
                cur.execute(f"""
                    UPDATE sources SET
                        first_record = record_range.first_record,
                        last_record = record_range.last_record
                    FROM (
                        SELECT
                            source_id,
                            MIN(timestamp) AS first_record,
                            MAX(timestamp) AS last_record
                        FROM {table}
                        GROUP BY source_id
                    ) AS record_range
                    WHERE sources.id = record_range.source_id;
                """)
                conn.commit()
            for filename, interval in parsed_files_expiry_intervals.items():
                cur.execute(
                    """
                    DELETE FROM parsed_files WHERE
                        url LIKE %s AND
                        parsed_at < current_timestamp - %s::interval;
                    """,
                    (filename, interval))
                conn.commit()
                if cur.rowcount:
                    logger.info(
                        'Deleted %d outdated parsed files for pattern "%s"',
                        cur.rowcount, filename)
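# `get_connection` and `logger` are used throughout this section but defined
# elsewhere. Below is a minimal sketch of what they might look like, NOT the
# actual implementation: it assumes psycopg2 and a dict-style row factory
# (later snippets access rows both as row[0] and as row['column'], which
# DictCursor rows support), and it reads the DSN from an environment
# variable as a stand-in for the project's settings module.
import logging
import os

import psycopg2
from psycopg2.extras import DictCursor

logger = logging.getLogger(__name__)


def get_connection():
    # Hypothetical helper: connect to the weather database and make
    # DictCursor the default so rows behave like both tuples and dicts.
    return psycopg2.connect(
        os.environ.get('DATABASE_URL', 'postgres://localhost/weather'),
        cursor_factory=DictCursor)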
def export(self, records, fingerprint=None):
    records = self.prepare_records(records)
    sources = self.prepare_sources(records)
    with get_connection() as conn:
        source_map = self.update_sources(conn, sources)
        self.update_weather(conn, source_map, records)
        if fingerprint:
            self.update_parsed_files(conn, fingerprint)
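# A hedged usage sketch for export(). The record and fingerprint shapes
# below are assumptions inferred from poll() further down (which compares
# url/last_modified/file_size triples), not a documented interface, and
# `DBExporter` is a hypothetical name for the class owning export().
from datetime import datetime, timezone

exporter = DBExporter()
exporter.export(
    [{
        'observation_type': 'current',
        'lat': 52.52,
        'lon': 13.41,
        'timestamp': datetime(2020, 2, 14, 12, tzinfo=timezone.utc),
        'temperature': 278.15,
    }],
    fingerprint={
        'url': 'https://example.com/stations/01766.csv',
        'last_modified': datetime(2020, 2, 14, 11, tzinfo=timezone.utc),
        'file_size': 12345,
    },
)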
def poll(self):
    with get_connection() as conn:
        with conn.cursor() as cur:
            cur.execute('SELECT * FROM parsed_files')
            parsed_files = {
                row['url']: (row['last_modified'], row['file_size'])
                for row in cur.fetchall()
            }
    for url in self.urls:
        for file_info in self.poll_url(url):
            # Skip files whose (last_modified, file_size) fingerprint has
            # not changed since we last parsed them.
            fingerprint = (
                file_info['last_modified'], file_info['file_size'])
            if parsed_files.get(file_info['url']) != fingerprint:
                yield file_info
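# poll() only yields metadata for new or changed files, so a consumer can
# feed everything it yields straight back into export(). A sketch of that
# loop, assuming the shapes above; `poller`, `parse`, and `exporter` are
# stand-in names, not part of this section:
for file_info in poller.poll():
    records = parse(file_info['url'])  # hypothetical parser
    # Passing file_info as the fingerprint records the parse in
    # parsed_files, so the file is skipped on the next poll.
    exporter.export(records, fingerprint=file_info)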
def db_size():
    m = re.search(r'/(\w+)$', settings.DATABASE_URL)
    db_name = m.group(1) if m else 'postgres'
    with db.get_connection() as conn:
        with conn.cursor() as cur:
            cur.execute('SELECT pg_database_size(%s)', (db_name,))
            db_size = cur.fetchone()
            table_sizes = {}
            for table in ['weather', 'synop', 'sources']:
                cur.execute('SELECT pg_total_relation_size(%s)', (table,))
                table_sizes[table] = cur.fetchone()[0]
    click.echo('Total database size:\n%6d MB' % (db_size[0] / 1024 / 1024))
    click.echo(
        'Table sizes:\n' + '\n'.join(
            '%6d MB %s' % (size / 1024 / 1024, table)
            for table, size in table_sizes.items()))
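# db_size() reads like a click command with its decorator stripped. A sketch
# of how it might be wired up; the group name `cli` and the command name
# 'db-size' are assumptions, not taken from this section:
import click


@click.group()
def cli():
    pass


# Register the function above as `db-size`; running `cli()` then exposes it
# as a subcommand on the command line.
cli.command(name='db-size')(db_size)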
def query_():
    # Generate 100 random locations within Germany's bounding box. Locations
    # and sources will be the same across different runs since we hard-code
    # the PRNG seed.
    random.seed(1)
    location_kwargs = [
        {
            'lat': random.uniform(47.30, 54.98),
            'lon': random.uniform(5.99, 15.02),
        }
        for _ in range(100)]
    with db.get_connection() as conn:
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT dwd_station_id, id
                FROM sources
                WHERE observation_type = %s
                """,
                ('recent',))
            rows = random.choices(cur.fetchall(), k=100)
            station_kwargs = [
                {'dwd_station_id': row['dwd_station_id']} for row in rows]
            source_kwargs = [{'source_id': row['id']} for row in rows]
            cur.execute(
                """
                SELECT MAX(last_record)
                FROM sources
                WHERE observation_type = 'current'
                """)
            today = cur.fetchone()['max'].date().isoformat()
    date = '2020-02-14'
    last_date = '2020-02-21'

    def _test_with_kwargs(kwargs_list):
        with _time(' 100 one-day queries, sequential', precision=2):
            _query_sequential('/weather', kwargs_list, date=date)
        with _time(' 100 one-day queries, parallel: ', precision=2):
            _query_parallel('/weather', kwargs_list, date=date)
        with _time(' 100 one-week queries, sequential', precision=2):
            _query_sequential(
                '/weather', kwargs_list, date=date, last_date=last_date)
        with _time(' 100 one-week queries, parallel: ', precision=2):
            _query_parallel(
                '/weather', kwargs_list, date=date, last_date=last_date)

    click.echo('Sources by lat/lon:')
    with _time(' 100 queries, sequential: ', precision=2):
        _query_sequential('/sources', location_kwargs)
    with _time(' 100 queries, parallel: ', precision=2):
        _query_parallel('/sources', location_kwargs)
    click.echo('\nSources by station:')
    with _time(' 100 queries, sequential: ', precision=2):
        _query_sequential('/sources', station_kwargs)
    with _time(' 100 queries, parallel: ', precision=2):
        _query_parallel('/sources', station_kwargs)
    click.echo('\nSources by source:')
    with _time(' 100 queries, sequential: ', precision=2):
        _query_sequential('/sources', source_kwargs)
    with _time(' 100 queries, parallel: ', precision=2):
        _query_parallel('/sources', source_kwargs)
    click.echo('\nWeather by lat/lon:')
    _test_with_kwargs(location_kwargs)
    click.echo('\nWeather by lat/lon, today:')
    with _time(' 100 one-day queries, sequential', precision=2):
        _query_sequential('/weather', location_kwargs, date=today)
    with _time(' 100 one-day queries, parallel: ', precision=2):
        _query_parallel('/weather', location_kwargs, date=today)
    click.echo('\nWeather by station:')
    _test_with_kwargs(station_kwargs)
    click.echo('\nWeather by source:')
    _test_with_kwargs(source_kwargs)
    click.echo('\nCurrent weather by lat/lon:')
    with _time(' 100 queries, sequential: ', precision=2):
        _query_sequential('/current_weather', location_kwargs)
    with _time(' 100 queries, parallel: ', precision=2):
        _query_parallel('/current_weather', location_kwargs)
    click.echo('\nCurrent weather by station:')
    with _time(' 100 queries, sequential: ', precision=2):
        _query_sequential('/current_weather', station_kwargs)
    with _time(' 100 queries, parallel: ', precision=2):
        _query_parallel('/current_weather', station_kwargs)
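# query_() leans on three benchmark helpers that are not shown in this
# section. A minimal sketch of plausible implementations, assuming the API
# is served locally over HTTP and the `requests` library is available; the
# base URL and all three names are assumptions:
import time
from concurrent.futures import ThreadPoolExecutor
from contextlib import contextmanager

import requests

API_BASE = 'http://localhost:5000'


@contextmanager
def _time(label, precision=2):
    # Print the elapsed wall-clock time for the enclosed block.
    start = time.perf_counter()
    yield
    print('%s %.*f s' % (label, precision, time.perf_counter() - start))


def _query_sequential(path, kwargs_list, **params):
    # Issue one request per kwargs dict, one after the other.
    for kwargs in kwargs_list:
        requests.get(API_BASE + path, params={**kwargs, **params})


def _query_parallel(path, kwargs_list, **params):
    # Issue the same requests concurrently from a thread pool.
    with ThreadPoolExecutor() as executor:
        for kwargs in kwargs_list:
            executor.submit(
                requests.get, API_BASE + path, params={**kwargs, **params})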
def clean():
    expiry_intervals = {
        'weather': {
            'forecast': '3 hours',
            'current': '48 hours',
        },
        'synop': {
            'synop': '30 hours',
        },
    }
    parsed_files_expiry_intervals = {
        '%/Z__C_EDZW_%': '1 week',
    }
    with get_connection() as conn:
        with conn.cursor() as cur:
            # XXX: This assumes that the DWD will upload 'historical'
            #      records for ALL weather parameters at the same time. If
            #      this turns out to be false we should merge the
            #      'historical' and 'recent' observation types into a
            #      single one, otherwise we will have periods where
            #      'historical' records miss certain weather parameters,
            #      and we cannot fall back to the 'recent' records as they
            #      are gone.
            logger.info("Deleting obsolete 'recent' weather records")
            cur.execute("""
                SELECT s_recent.id, s_historical.last_record AS threshold
                FROM sources s_recent
                JOIN sources s_historical ON (
                    s_recent.wmo_station_id = s_historical.wmo_station_id
                    AND s_recent.dwd_station_id =
                        s_historical.dwd_station_id)
                WHERE
                    s_recent.observation_type = 'recent'
                    AND s_historical.observation_type = 'historical'
                    AND s_recent.first_record < s_historical.last_record
            """)
            for row in cur.fetchall():
                logger.debug(
                    "Deleting records for source %d prior to %s",
                    row['id'], row['threshold'])
                cur.execute(
                    """
                    DELETE FROM weather
                    WHERE source_id = %s AND timestamp < %s
                    """,
                    (row['id'], row['threshold']))
            conn.commit()
            logger.info(
                'Deleting expired weather records: %s', expiry_intervals)
            for table, table_expires in expiry_intervals.items():
                for observation_type, interval in table_expires.items():
                    cur.execute(
                        f"""
                        DELETE FROM {table} WHERE
                            source_id IN (
                                SELECT id FROM sources
                                WHERE observation_type = %s) AND
                            timestamp < current_timestamp - %s::interval;
                        """,
                        (observation_type, interval),
                    )
                    conn.commit()
                    if cur.rowcount:
                        logger.info(
                            'Deleted %d outdated %s weather records from %s',
                            cur.rowcount, observation_type, table)
                # Re-derive each source's record range now that expired rows
                # are gone from this table.
                cur.execute(f"""
                    UPDATE sources SET
                        first_record = record_range.first_record,
                        last_record = record_range.last_record
                    FROM (
                        SELECT
                            source_id,
                            MIN(timestamp) AS first_record,
                            MAX(timestamp) AS last_record
                        FROM {table}
                        GROUP BY source_id
                    ) AS record_range
                    WHERE sources.id = record_range.source_id;
                """)
                conn.commit()
            logger.info(
                'Deleting expired parsed files: %s',
                parsed_files_expiry_intervals)
            for filename, interval in parsed_files_expiry_intervals.items():
                cur.execute(
                    """
                    DELETE FROM parsed_files WHERE
                        url LIKE %s AND
                        parsed_at < current_timestamp - %s::interval;
                    """,
                    (filename, interval))
                conn.commit()
                if cur.rowcount:
                    logger.info(
                        'Deleted %d outdated parsed files for pattern "%s"',
                        cur.rowcount, filename)
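# Deleting already-expired rows twice is harmless, so clean() can simply be
# run on a timer. A sketch of a trivial scheduler loop; the one-hour cadence
# and the function name are assumptions, not prescribed by the code above:
import time


def run_clean_forever(interval=3600):
    while True:
        try:
            clean()
        except Exception:
            # Log and keep going; a transient DB error should not stop
            # future cleanup runs.
            logger.exception('clean() failed')
        time.sleep(interval)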