def update_years(first_year: int = 1700, last_year: int = 2050):
    """
    (Re)build the helper table ``years`` holding one row per calendar year.

    The table compensates for gaps in the DWD data, where no readings are
    available: yearly aggregates from the reading tables can be left outer
    joined to it, so that years without any reading still show up.

    Each row carries the number of days of that year. The current year is
    only counted up to and including today; every other year is counted in
    full.

    :param first_year: first year to write (inclusive)
    :param last_year: last year to write (inclusive)
    :return: None
    """
    # One snapshot of "today": the year and the day count derived from it can
    # no longer disagree when the function runs across midnight on Dec 31st.
    today = date.today()

    def days(year):
        last = today if year == today.year else date(year, 12, 31)
        return (last - date(year, 1, 1)).days + 1

    with johanna.Connection(text="create? table years") as c:
        c.cur.executescript(
            " CREATE TABLE IF NOT EXISTS years ("
            " year INTEGER,"
            " days INTEGER,"
            " PRIMARY KEY (year)"
            " ); "
        )

    # TODO years interval could be retrieved from the stations table
    # TODO could be optimized a little bit to not insert when the first year
    #      in range is already there and the last one is ok
    years = [(y, days(y)) for y in range(first_year, last_year + 1)]
    with johanna.Connection(text=f"insert? {len(years)} years") as c:
        c.cur.executemany("INSERT OR REPLACE INTO years VALUES (?, ?)", years)
        c.commit()
def _upsert(rows: List[tuple]) -> None:
    """
    Write station rows to table ``stations``, creating the table first.

    Rows are written with ``INSERT OR REPLACE``, see
    https://database.guide/how-on-conflict-works-in-sqlite/

    :param rows: one tuple per station with 14 values each (the statement
        names no columns, so values are taken in table column order)
    """
    insert_sql = (
        " INSERT OR REPLACE INTO stations"
        " VALUES (?,?,?,?,?,?,?,?, ?,?,?,?,?,?) "
    )
    with johanna.Timer() as stopwatch:
        # the database itself is supplied by johanna
        with johanna.Connection(text="create? table stations") as ddl:
            ddl.cur.executescript(SQL_CREATE_STATIONS)
        with johanna.Connection("insert stations") as dml:
            dml.cur.executemany(insert_sql, rows)
            dml.commit()
    message = f"Upserted {len(rows)} stations to the database {stopwatch.read()}"
    logging.info(message)
def __init__(self, station):
    """
    Load the master data of one station plus its "latest reading" timestamps.

    After construction ``self.populated`` tells whether the station was found
    in table ``stationen``. For an unknown station all data attributes are
    None and nothing is logged.

    :param station: station key as stored in column ``stationen.station``
    :raises AssertionError: when the timestamp kept in table ``recent``
        differs from the latest timestamp found in table ``readings``
    """
    # select < 0.6 millis :)
    # Because of max() this is an aggregate query without GROUP BY: SQLite
    # answers with exactly one row even when no station matches, with all
    # plain columns NULL. s.station is therefore selected as well and used
    # below to tell "station found" from "station unknown".
    sql = (
        "select name, land, yyyymmdd_von, yyyymmdd_bis,"
        " ifnull(rc.yyyymmddhh, '1700010100'),"
        " ifnull(max(rd.dwdts), '1700010100'),"
        " s.station"
        " from stationen s"
        " left outer join recent rc on s.station = rc.station"
        " left join readings rd on s.station = rd.station"
        " where s.station = ?"
    )
    self.station = station
    # Defaults, so that every attribute exists even for an unknown station.
    self.name = None
    self.land = None
    self.isodate_von = None
    self.isodate_bis = None
    self.dwdts_recent = None  # from table recent
    self.dwdts_readings = None  # from the data itself
    self.description = None
    self.populated = False

    with johanna.Connection(f"Station.__init__({station})") as c:
        c.cur.execute(sql, (station, ))
        row = c.cur.fetchone()
    if row is None or row[6] is None:
        return  # unknown station

    (self.name, self.land, self.isodate_von, self.isodate_bis,
     self.dwdts_recent, self.dwdts_readings) = row[:6]
    # Consistency check between the bookkeeping table and the actual data.
    assert self.dwdts_recent == self.dwdts_readings, \
        f"recent: {self.dwdts_recent} vs. Daten: {self.dwdts_readings}"
    self.populated = True
    self.description = f"{self.station}, {self.name} ({LAND_MAP[self.land]})"
    logging.info(
        f"{self.description}: {self.isodate_von}..{self.isodate_bis} "
        f"rc={self.dwdts_recent} rd={self.dwdts_readings}")
def __init__(self, ftp: FTP, fnam: str, verbose: bool = False):
    """
    Download one data file via FTP and store its readings in the database.

    The station number is taken from the file name. The file is only
    downloaded when is_data_expected() says new data can be expected; it is
    fetched into a temporary directory, extracted, parsed, and the readings
    are inserted together with the updated "recent" marker in one commit.

    :param ftp: open FTP connection with the correct working directory
    :param fnam: name of the file to download
    :param verbose: console output as progress info -- DO NOT USE IN PRODUCTION
    """
    self._verbose = verbose
    # NOTE(review): not set to True anywhere in this method -- presumably one
    # of the helpers called below updates it; confirm.
    self.did_download = False
    logging.info(f'DataFile(_,"{fnam}")')
    # Station number = third "_"-separated token of the name before the first ".".
    station_nr = int(fnam.split(".")[0].split("_")[2])  # conveniently works for both historical and recent files
    self.station = Station(station_nr)
    logging.info(f"Station {self.station.description} (Daten bis {self.station.dwdts_recent} bereits vorhanden)")
    if is_data_expected(fnam, self.station):
        with johanna.Timer() as t:
            with TemporaryDirectory() as temp_dir:
                temp_dir = Path(temp_dir)
                logging.info(f"Temporäres Verzeichnis: {temp_dir}")
                # NOTE(review): verbose=True is hard-coded here; the `verbose`
                # argument of this constructor is not passed on -- confirm intended.
                zipfile_path = ftplight.ftp_retrbinary(ftp, from_fnam=fnam, to_path=temp_dir / fnam, verbose=True)
                if not zipfile_path:
                    # Download failed: flag the run and give up on this file.
                    johanna.flag_as_error()
                    logging.error(f"Kann die Daten der Station {self.station.description} nicht herunterladen.")
                    return
                produkt_path = self._extract(zipfile_path, temp_dir)
                readings = self._parse(produkt_path)
                if readings:
                    # TODO guard the connection with a retry
                    with johanna.Connection("insert readings") as c:
                        self._insert_readings(readings, c)
                        last_date = self._update_recent(readings, c)
                        c.commit()  # one shared commit for readings and marker makes sense
                    logging.info(f"Werte für Station {self.station.description} bis {last_date} verarbeitet {t.read()}")
                else:
                    logging.info(f"Keine Werte für Station {self.station.description} nach {self.station.dwdts_recent} gefunden {t.read()}")
            # Checked after the TemporaryDirectory block has ended: the
            # directory should be gone by now.
            if temp_dir.exists():
                johanna.flag_as_error()
                logging.error(f"Temporäres Verzeichnis {temp_dir} wurde NICHT entfernt")
    else:
        logging.info(f"File {fnam} wird nicht heruntergeladen, da keine neuen Daten zu erwarten sind.")
def _upsert(self):
    """
    Write ``self.rows`` to table ``stationen`` and log ``self.cnt`` plus the
    elapsed time.

    Rows are written with ``INSERT OR REPLACE`` (8 values per row), see
    https://database.guide/how-on-conflict-works-in-sqlite/
    """
    statement = " INSERT OR REPLACE INTO stationen VALUES (?,?,?,?,?,?,?,?) "
    with johanna.Timer() as stopwatch, johanna.Connection("insert stationen") as conn:
        conn.cur.executemany(statement, self.rows)
        conn.commit()
    message = f"{self.cnt} Stationen in die Datenbank geschrieben {stopwatch.read()}"
    logging.info(message)
def get_columns(tabnam: str = "readings") -> List[tuple]:
    """
    Return the column list of a table in the current johanna database.

    Results are kept in ``get_columns.buffer``, so the database is queried
    only on the first request per table. The buffered list itself is handed
    out, not a copy, so callers must not modify it.

    :param tabnam: table name in current johanna database
    :return: list of tuples (colnam: str, type: str, primary_key: int)
    """
    cache = get_columns.buffer
    if tabnam in cache:
        return cache[tabnam]
    with johanna.Connection(f"columns of {tabnam}") as conn:
        cursor = conn.cur.execute(SQL_COLUMNS, (tabnam, ))
        cache[tabnam] = cursor.fetchall()
    return cache[tabnam]
def get_two(station: int, dwdts: str, tabname: str = "readings", fields: List[str] = None):
    """
    Fetch at most two readings of a station, starting at a given timestamp.

    :param station: station key to filter on
    :param dwdts: lower bound (inclusive) for column dwdts; any "-" is
        removed before use
    :param tabname: table to select from
    :param fields: columns to select after dwdts; when empty or None the
        list is taken from get_data_fields(tabname)
    :return: up to two rows (dwdts, *fields), ordered by dwdts
    """
    start = dwdts.replace("-", "")
    columns = fields or get_data_fields(tabname)
    # Table and column names are spliced into the statement text; only the
    # station and the timestamp are bound as parameters.
    column_list = ", ".join(columns)
    sql = f"select dwdts, {column_list} from {tabname} where station = ? and dwdts >= ? order by dwdts limit 2"
    # logging.info(sql)
    with johanna.Connection(f"from dwdts = {start}", quiet=True) as conn:
        cursor = conn.cur.execute(sql, (station, start))
        found = cursor.fetchall()
    return found
def get_missingdays(station: int, tabname: str = "readings") -> Tuple[list, list]:
    """
    Run the select built by generate_missingdays_select() for one station.

    :param station: the station to assess
    :param tabname: name of the table where the readings are stored,
        defaults to "readings"
    :return: tuple of (result rows of the select, list of data fields as
        returned by generate_missingdays_select)
    """
    sql, fields = generate_missingdays_select(tabname)
    params = (station, station)  # the station is bound twice
    with johanna.Connection("missing days") as conn:
        cursor = conn.cur.execute(sql, params)
        rows = cursor.fetchall()
    return rows, fields
def overview(station: int, tabname: str = "readings", fields: List[str] = None, with_rows: bool = False) -> List[Timeframe]:
    """
    Condense the readings of a station into a list of consecutive time frames.

    The select built by get_indicator_select() yields one row per point in
    time: the timestamp first, then one indicator string per field. Rows
    that follow each other without a gap and carry the same indicators are
    merged into one Timeframe. A gap between two rows becomes a Timeframe
    of its own, labelled "no data".

    :param station: station key
    :param tabname: table to read from
    :param fields: data fields to look at; when empty or None they are
        taken from get_data_fields(tabname)
    :param with_rows: when True, each Timeframe additionally gets ``rows``
        from get_two(), starting at the time frame's last timestamp
    :return: list of Timeframe objects in the order of the rows; empty
        when the station has no rows at all
    """
    assert isinstance(station, int)
    assert isinstance(tabname, str)
    if not fields:
        fields = get_data_fields(tabname=tabname)
    assert isinstance(fields, list)
    assert isinstance(with_rows, bool)

    sql = get_indicator_select(tabname=tabname, fields=fields)
    with johanna.Connection(f"select from {tabname}") as c:
        rows = c.cur.execute(sql, (station, )).fetchall()
    if not rows:
        # nothing stored for this station -> nothing to summarize
        return []

    ts0 = PointInTime(rows[0][0])
    srow0 = "".join(rows[0][1:])  # indicator string
    tf = Timeframe(ts0, None, srow0, None, None)
    tfs = [tf]
    for row in rows[1:]:
        ts = PointInTime(row[0])
        srow = "".join(row[1:])  # indicator string
        if ts - ts0 > 1:  # not next day
            # we passed an occurrence of '---------' ('-' only)
            # -> insert n/a interval: [x, _, old] -> [x, ts0, old], [ts0+1, ts-1, n/a], [ts, _, new]
            tf.ts_to = ts0
            tfs.append(Timeframe(ts0.next(), ts.prev(), "no data", None, None))
            tf = Timeframe(ts, None, srow, None, None)
            tfs.append(tf)
        elif srow != srow0:
            # indicators changed -> close the current frame, open a new one
            tf.ts_to = ts0
            tf = Timeframe(ts, None, srow, None, None)
            tfs.append(tf)
        ts0 = ts
        srow0 = srow
    # ts0 is the timestamp of the last row seen, also when there was only
    # one row and the loop body never ran.
    tf.ts_to = ts0

    for frame in tfs:
        frame.days = frame.ts_to - frame.ts_from + 1
    if with_rows:
        for frame in tfs:
            frame.rows = get_two(station, frame.ts_to.dwdts(), tabname=tabname, fields=fields)
    return tfs
def __init__(self, station: Union[int, str]):
    """
    Load name, land, validity interval and description of one station.

    ``self.populated`` is True when a row was found in table ``stations``;
    otherwise it is False and the five data attributes are None.

    :param station: station key; a string is converted with int()
    """
    station = int(station) if isinstance(station, str) else station
    self.station = station
    query = "select name, land_short, isodate_from, isodate_to, description from stations where station = ?"
    with johanna.Timer() as stopwatch, johanna.Connection(f"Station.__init__({station})") as conn:
        conn.cur.execute(query, (station,))
        found = conn.cur.fetchone()
    values = found if found else (None,) * 5
    self.name, self.land_short, self.isodate_from, self.isodate_to, self.description = values
    self.populated = bool(found)
    summary = f"got {self.description}: {self.isodate_from}..{self.isodate_to} {stopwatch.read()}"
    logging.info(summary)
#!/usr/bin/env python
# coding: utf-8
"""
Simple test program for johanna in interactive mode.

Creates and uses a ~/.johanna folder which can be disposed at will.

Created: 06.09.20
"""

import johanna

if __name__ == "__main__":
    johanna.interactive(dbname="hurz.sqlite")
    # johanna.apply_schema("./schema.sql")
    # need to run charlotte.py (before this select can work)
    with johanna.Connection("Charlotte") as conn:
        conn.cur.execute("select * from kvpairs")
        for kvpair in conn.cur.fetchall():
            print(kvpair)
    johanna.flag_as_error()
def main():
    """
    Apply ./schema.sql, insert the pair (1, 'eins') into kvpairs unless the
    insert is ignored, and finally flag the run as an error.
    """
    johanna.apply_schema("./schema.sql")
    statement = "insert or ignore into kvpairs(k, v) values (1, 'eins')"
    with johanna.Connection("Charlotte") as conn:
        conn.cur.execute(statement)
        conn.commit()
    johanna.flag_as_error()
def plot(plt, station: int = const.MANNHEIM, monat: int = 6, stunde: int = 12, von: int = 0, bis: int = 3000) -> None:
    """
    Plot the yearly mean temperature of a station for one month and hour of
    day, together with a linear trend line.

    When the selection yields no data at all, a warning is logged and
    nothing is plotted.

    :param plt: supplied by the jupyter notebook:
                    from matplotlib import pyplot as plt
                    %matplotlib inline  # <- that's why!
    :param station: numeric station key
    :param monat: month, 1 = January, ...
    :param stunde: hour, 0..23
    :param von: first year (4 digits), inclusive
    :param bis: last year (4 digits), inclusive
    """
    select_sql = (
        " select year, avg(temp) val"
        " from readings"
        " where station = ? and month = ? and hour = ? and year between ? and ?"
        " group by year"
        " order by year asc "
    )
    with johanna.Timer() as overall:
        name = _get_station_name(station)
        with johanna.Timer() as timer:
            with johanna.Connection("select readings") as c:
                c.cur.execute(select_sql, (station, monat, stunde, von, bis))
                rows = c.cur.fetchall()
            if not rows:
                # Without data there are no end points for the trend line and
                # nothing to fit the regression to.
                logging.warning(
                    f"No readings for station {station}, month {monat}, hour {stunde}, "
                    f"years {von}..{bis} - nothing to plot")
                return
            x_db, y_db = _transpose(rows)
        logging.info(f"Select: {timer.read()}")

        # https://realpython.com/linear-regression-in-python/#simple-linear-regression-with-scikit-learn
        x = np.array(x_db).reshape((-1, 1))
        y = np.array(y_db)

        with johanna.Timer() as timer:
            model = LinearRegression().fit(x, y)
            logging.info(f"dT p.a.: {model.coef_}")
            x_pred = np.array([x_db[0], x_db[-1]]).reshape((-1, 1))  # only the two ends
            y_pred = model.predict(x_pred)
        logging.info(f"LinearRegression: {timer.read()}")

        # https://towardsdatascience.com/linear-regression-using-python-b136c91bf0a2
        plt.rc('figure', figsize=(20.0, 10.0))
        plt.scatter(x, y, s=10, color='green', label="Einzelwerte")
        plt.xlabel('Jahr')
        plt.ylabel('Mitteltemperatur %d Uhr (UTC), %s, %s' % (stunde, const.monat_as_string(monat), name))
        plt.plot(x_pred, y_pred, color='red', label="Trend")
        # https://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.legend
        plt.legend(loc=4)
        plt.show()
    logging.info(f"Overall: {overall.read()}")