Пример #1
0
def clean():
    """Delete expired weather/synop records and stale parsed-file entries.

    Expiry intervals are PostgreSQL interval strings keyed by table and
    observation type.  After pruning each table, the per-source record
    range (``first_record``/``last_record``) is recomputed so the
    ``sources`` table stays consistent with the remaining rows.
    """
    # Retention per table and observation type (PostgreSQL interval syntax).
    expiry_intervals = {
        'weather': {
            'forecast': '3 hours',
            'current': '48 hours',
        },
        'synop': {
            'synop': '30 hours',
        },
    }
    # SQL LIKE pattern on parsed_files.url -> retention interval.
    parsed_files_expiry_intervals = {
        '%/Z__C_EDZW_%': '1 week',
    }
    logger.info('Deleting expired weather records: %s', expiry_intervals)
    with get_connection() as conn:
        with conn.cursor() as cur:
            for table, table_expires in expiry_intervals.items():
                for observation_type, interval in table_expires.items():
                    # Identifiers cannot be bound as SQL parameters, hence
                    # the f-string for {table}; it comes from the hard-coded
                    # dict above, not from user input.
                    cur.execute(
                        f"""
                        DELETE FROM {table} WHERE
                            source_id IN (
                                SELECT id FROM sources
                                WHERE observation_type = %s) AND
                            timestamp < current_timestamp - %s::interval;
                        """,
                        (observation_type, interval),
                    )
                    # Commit after each batch so an error later on does not
                    # roll back deletions that already succeeded.
                    conn.commit()
                    if cur.rowcount:
                        logger.info(
                            'Deleted %d outdated %s weather records from %s',
                            cur.rowcount, observation_type, table)
                # Refresh first/last record timestamps for every source that
                # still has rows in this table.
                # NOTE(review): sources whose rows were all deleted keep
                # their old range -- confirm this is intended.
                cur.execute(f"""
                    UPDATE sources SET
                      first_record = record_range.first_record,
                      last_record = record_range.last_record
                    FROM (
                      SELECT
                        source_id,
                        MIN(timestamp) AS first_record,
                        MAX(timestamp) AS last_record
                      FROM {table}
                      GROUP BY source_id
                    ) AS record_range
                    WHERE sources.id = record_range.source_id;
                    """)
                conn.commit()
            # Drop parsed-file bookkeeping rows older than their retention,
            # matched by URL pattern.
            for filename, interval in parsed_files_expiry_intervals.items():
                cur.execute(
                    """
                    DELETE FROM parsed_files WHERE
                        url LIKE %s AND
                        parsed_at < current_timestamp - %s::interval;
                    """, (filename, interval))
                conn.commit()
                if cur.rowcount:
                    logger.info(
                        'Deleted %d outdated parsed files for pattern "%s"',
                        cur.rowcount, filename)
Пример #2
0
 def export(self, records, fingerprint=None):
     """Write *records* (and their sources) to the database.

     Records are normalized first, then sources and weather rows are
     upserted within a single connection.  When *fingerprint* is given,
     the parsed-files bookkeeping is updated as well.
     """
     prepared = self.prepare_records(records)
     prepared_sources = self.prepare_sources(prepared)
     with get_connection() as connection:
         id_map = self.update_sources(connection, prepared_sources)
         self.update_weather(connection, id_map, prepared)
         if fingerprint:
             self.update_parsed_files(connection, fingerprint)
Пример #3
0
 def poll(self):
     """Yield file info for files that are new or changed upstream.

     A file counts as changed when its (last_modified, file_size)
     fingerprint differs from the one recorded in ``parsed_files``.
     """
     # Snapshot the previously parsed files before polling, so the
     # connection is released while the (potentially slow) polling runs.
     with get_connection() as connection, connection.cursor() as cursor:
         cursor.execute('SELECT * FROM parsed_files')
         known = {}
         for record in cursor.fetchall():
             known[record['url']] = (
                 record['last_modified'], record['file_size'])
     for endpoint in self.urls:
         for info in self.poll_url(endpoint):
             current = (info['last_modified'], info['file_size'])
             # Unknown URLs yield None from .get(), which never equals a
             # fingerprint tuple, so new files are always emitted.
             if known.get(info['url']) != current:
                 yield info
Пример #4
0
def db_size():
    """Print the total database size and the sizes of the main tables."""
    # Database name is the last path component of the URL; fall back to
    # 'postgres' when the URL has no recognizable name.
    match = re.search(r'/(\w+)$', settings.DATABASE_URL)
    name = 'postgres' if match is None else match.group(1)
    with db.get_connection() as conn:
        with conn.cursor() as cur:
            cur.execute('SELECT pg_database_size(%s)', (name, ))
            total_row = cur.fetchone()
            table_sizes = {}
            for relation in ('weather', 'synop', 'sources'):
                cur.execute('SELECT pg_total_relation_size(%s)', (relation, ))
                table_sizes[relation] = cur.fetchone()[0]
    # Sizes are reported in whole megabytes.
    click.echo('Total database size:\n%6d MB' % (total_row[0] / 1024 / 1024))
    click.echo('Table sizes:\n' +
               '\n'.join('%6d MB  %s' % (size / 1024 / 1024, table)
                         for table, size in table_sizes.items()))
Пример #5
0
def query_():
    """Benchmark API endpoint latency for a fixed set of random queries.

    Runs 100 sequential and parallel requests against the /sources,
    /weather and /current_weather endpoints, addressed by lat/lon,
    station id and source id, and prints timing via ``_time``.
    """
    # Generate 100 random locations within Germany's bounding box. Locations
    # and sources will be the same across different runs since we hard-code
    # the PRNG seed.
    random.seed(1)
    location_kwargs = [
        {
            'lat': random.uniform(47.30, 54.98),
            'lon': random.uniform(5.99, 15.02),
        }
        for _ in range(100)]
    with db.get_connection() as conn:
        with conn.cursor() as cur:
            # Sample 100 'recent' sources (with replacement) to build the
            # by-station and by-source query parameter sets.
            # NOTE(review): rows are indexed like dicts (row['id']) --
            # assumes a dict-style cursor; verify cursor factory.
            cur.execute(
                """
                SELECT dwd_station_id, id
                FROM sources
                WHERE observation_type = %s
                """,
                ('recent',))
            rows = random.choices(cur.fetchall(), k=100)
            station_kwargs = [
                {'dwd_station_id': row['dwd_station_id']} for row in rows]
            source_kwargs = [{'source_id': row['id']} for row in rows]
            # "Today" is derived from the freshest 'current' record so the
            # benchmark hits a date that actually has data.
            cur.execute(
                """
                SELECT MAX(last_record)
                FROM sources
                WHERE observation_type = 'current'
                """)
            today = cur.fetchone()['max'].date().isoformat()
    # Fixed historical one-day and one-week query windows.
    date = '2020-02-14'
    last_date = '2020-02-21'

    def _test_with_kwargs(kwargs_list):
        # Shared benchmark battery for /weather: one-day and one-week
        # windows, each sequential and parallel.
        with _time('  100  one-day queries, sequential', precision=2):
            _query_sequential('/weather', kwargs_list, date=date)
        with _time('  100  one-day queries, parallel:  ', precision=2):
            _query_parallel('/weather', kwargs_list, date=date)
        with _time('  100 one-week queries, sequential', precision=2):
            _query_sequential(
                '/weather', kwargs_list, date=date, last_date=last_date)
        with _time('  100 one-week queries, parallel:  ', precision=2):
            _query_parallel(
                '/weather', kwargs_list, date=date, last_date=last_date)

    click.echo('Sources by lat/lon:')
    with _time('  100  queries, sequential:        ', precision=2):
        _query_sequential('/sources', location_kwargs)
    with _time('  100  queries, parallel:          ', precision=2):
        _query_parallel('/sources', location_kwargs)
    click.echo('\nSources by station:')
    with _time('  100  queries, sequential:        ', precision=2):
        _query_sequential('/sources', station_kwargs)
    with _time('  100  queries, parallel:          ', precision=2):
        _query_parallel('/sources', station_kwargs)
    click.echo('\nSources by source:')
    with _time('  100  queries, sequential:        ', precision=2):
        _query_sequential('/sources', source_kwargs)
    with _time('  100  queries, parallel:          ', precision=2):
        _query_parallel('/sources', source_kwargs)

    click.echo('\nWeather by lat/lon:')
    _test_with_kwargs(location_kwargs)
    click.echo('\nWeather by lat/lon, today:')
    with _time('  100  one-day queries, sequential', precision=2):
        _query_sequential('/weather', location_kwargs, date=today)
    with _time('  100  one-day queries, parallel:  ', precision=2):
        _query_parallel('/weather', location_kwargs, date=today)
    click.echo('\nWeather by station:')
    _test_with_kwargs(station_kwargs)
    click.echo('\nWeather by source:')
    _test_with_kwargs(source_kwargs)

    click.echo('\nCurrent weather by lat/lon:')
    with _time('  100  queries, sequential:        ', precision=2):
        _query_sequential('/current_weather', location_kwargs)
    with _time('  100  queries, parallel:          ', precision=2):
        _query_parallel('/current_weather', location_kwargs)
    click.echo('\nCurrent weather by station:')
    with _time('  100  queries, sequential:        ', precision=2):
        _query_sequential('/current_weather', station_kwargs)
    with _time('  100  queries, parallel:          ', precision=2):
        _query_parallel('/current_weather', station_kwargs)
Пример #6
0
def clean():
    """Purge obsolete and expired weather data.

    Three passes: (1) delete 'recent' records already covered by
    'historical' records for the same station, (2) delete records older
    than their per-table/per-observation-type expiry interval and
    refresh each source's record range, (3) delete stale parsed-file
    bookkeeping rows.
    """
    # Retention per table and observation type (PostgreSQL interval syntax).
    expiry_intervals = {
        'weather': {
            'forecast': '3 hours',
            'current': '48 hours',
        },
        'synop': {
            'synop': '30 hours',
        },
    }
    # SQL LIKE pattern on parsed_files.url -> retention interval.
    parsed_files_expiry_intervals = {
        '%/Z__C_EDZW_%': '1 week',
    }
    with get_connection() as conn:
        with conn.cursor() as cur:
            # XXX: This assumes that the DWD will upload 'historical' records
            #      for ALL weather parameters at the same time. If this turns
            #      out to be false we should merge the 'historical' and
            #      'recent' observation types into a single one, otherwise we
            #      will have periods where 'historical' records miss certain
            #      weather parameters, and we cannot fall back to the 'recent'
            #      records as they are gone.
            logger.info("Deleting obsolete 'recent' weather records")
            # Find 'recent' sources whose station also has a 'historical'
            # source with newer coverage; everything before the historical
            # source's last record is redundant.
            cur.execute("""
                SELECT
                    s_recent.id,
                    s_historical.last_record AS threshold
                FROM sources s_recent
                JOIN sources s_historical ON (
                    s_recent.wmo_station_id = s_historical.wmo_station_id AND
                    s_recent.dwd_station_id = s_historical.dwd_station_id)
                WHERE
                    s_recent.observation_type = 'recent' AND
                    s_historical.observation_type = 'historical' AND
                    s_recent.first_record < s_historical.last_record
                """)
            for row in cur.fetchall():
                logger.debug("Deleting records for source %d prior to %s",
                             row['id'], row['threshold'])
                cur.execute(
                    """
                    DELETE FROM weather
                    WHERE source_id = %s AND timestamp < %s
                    """, (row['id'], row['threshold']))
            conn.commit()
            logger.info('Deleting expired weather records: %s',
                        expiry_intervals)
            for table, table_expires in expiry_intervals.items():
                for observation_type, interval in table_expires.items():
                    # Identifiers cannot be bound as SQL parameters, hence
                    # the f-string for {table}; it comes from the hard-coded
                    # dict above, not from user input.
                    cur.execute(
                        f"""
                        DELETE FROM {table} WHERE
                            source_id IN (
                                SELECT id FROM sources
                                WHERE observation_type = %s) AND
                            timestamp < current_timestamp - %s::interval;
                        """,
                        (observation_type, interval),
                    )
                    # Commit after each batch so an error later on does not
                    # roll back deletions that already succeeded.
                    conn.commit()
                    if cur.rowcount:
                        logger.info(
                            'Deleted %d outdated %s weather records from %s',
                            cur.rowcount, observation_type, table)
                # Refresh first/last record timestamps for every source that
                # still has rows in this table.
                cur.execute(f"""
                    UPDATE sources SET
                      first_record = record_range.first_record,
                      last_record = record_range.last_record
                    FROM (
                      SELECT
                        source_id,
                        MIN(timestamp) AS first_record,
                        MAX(timestamp) AS last_record
                      FROM {table}
                      GROUP BY source_id
                    ) AS record_range
                    WHERE sources.id = record_range.source_id;
                    """)
                conn.commit()
            logger.info('Deleting expired parsed files: %s',
                        parsed_files_expiry_intervals)
            # Drop parsed-file bookkeeping rows older than their retention,
            # matched by URL pattern.
            for filename, interval in parsed_files_expiry_intervals.items():
                cur.execute(
                    """
                    DELETE FROM parsed_files WHERE
                        url LIKE %s AND
                        parsed_at < current_timestamp - %s::interval;
                    """, (filename, interval))
                conn.commit()
                if cur.rowcount:
                    logger.info(
                        'Deleted %d outdated parsed files for pattern "%s"',
                        cur.rowcount, filename)