Пример #1
0
    def _parse(self, produkt_path: Path) -> list:
        """
        Parse the data file into rows for the table ``readings``.

        Rows whose timestamp is not newer than ``self.station.dwdts_recent`` are
        suppressed because they might be in the database already.

        :param produkt_path: path of the data file (``;``-separated, one header line)
        :return: list of tuples ``(station, dwdts, year, month, day, hour, q, temp, humid)``;
            ``temp`` and ``humid`` are ``None`` where the file holds the marker -999
        """
        missing = -999.0  # marker used in the file for "no value"

        def ymdh(yymmddhh: str) -> tuple:
            """
            Split the DWD time stamp into numeric time units.

            :param yymmddhh: hour in DWD format
            :return: tuple ``(year, month, day, hour)`` of ints
            """
            return (
                int(yymmddhh[:4]),
                int(yymmddhh[4:6]),
                int(yymmddhh[6:8]),
                int(yymmddhh[-2:]),
            )

        def measured(raw: str):
            """Convert a value column to float; the missing marker becomes None."""
            number = float(raw)
            # numeric comparison, so "-999" and "-999.0" are both recognised
            return None if number == missing else number

        with johanna.Timer() as t:
            readings = []
            recent = self.station.dwdts_recent
            with open(produkt_path, newline='') as csvfile:
                reader = csv.reader(csvfile, delimiter=';')
                next(reader, None)  # skip header line
                suppressed = 0  # data rows dropped so far
                reported = False  # has the number of dropped rows been logged?
                shown = 0
                for row in reader:
                    if not row:  # csv.reader yields [] for a blank line
                        continue
                    # suppress data that might be in DB already
                    if row[1] <= recent:
                        suppressed += 1
                        continue
                    if not reported:  # first row that is taken
                        reported = True
                        skipped = suppressed
                        logging.info(
                            f"{skipped} Messwerte vor dem {self.station.dwdts_recent} wurden übersprungen"
                        )
                    if shown < 2:  # show first 2 rows taken
                        shown += 1
                        logging.info(f"{row[0]}, {row[1]}")
                    y, m, d, h = ymdh(row[1])
                    readings.append((
                        int(row[0]),  # station
                        row[1],  # time stamp as delivered
                        y,
                        m,
                        d,
                        h,
                        int(row[2]),  # q
                        measured(row[3]),  # temp
                        measured(row[4]),  # humid
                    ))
        logging.info(
            f"{len(readings)} neue Messwerte für Station {self.station.description} gefunden {t.read()}"
        )
        return readings
Пример #2
0
 def download() -> list:
     """
     Retrieve the file names matching ``station_match`` via FTP ``NLST``.

     ``ftp``, ``station_match`` and the callback ``collect`` are names defined
     outside this function. The result list lives on ``collect.zips``, which is
     reset here before the listing starts.

     :return: the list ``collect.zips`` after the listing
     """
     collect.zips = []
     with johanna.Timer() as timer:
         reply = ftp.retrlines(f"NLST {station_match}", callback=collect)
     # the server reply looks like "226 Directory send OK."
     logging.info(reply)
     file_names = collect.zips
     logging.info(f"Retrieved {len(file_names)} filenames {timer.read()}")
     johanna.collect_stat("ftp_download_time_sec", timer.read(raw=True))
     johanna.collect_stat("ftp_download_file_cnt", 1)
     return file_names
Пример #3
0
def dwd(folder):
    """
    Open an anonymous FTP connection to the DWD open data server.

    :param folder: directory on the server that becomes the working directory
    :return: the connected ``FTP`` object; the caller is responsible for closing it
    :raises: whatever ``FTP()``, ``login()`` or ``cwd()`` raise; the connection
        is closed before the exception is passed on
    """
    # TODO make this a Context Handler
    SERVER = "opendata.dwd.de"
    with johanna.Timer() as t:
        ftp = FTP(SERVER, timeout=15)
        try:
            ftp.login()  # anonymous
            ftp.cwd(folder)
        except Exception:
            # do not leak the socket when login or directory change fails
            ftp.close()
            raise
    logging.info(f"Connected to ftp://{SERVER}/{folder} {t.read()}")
    return ftp
Пример #4
0
    def __init__(self, ftp: FTP, fnam: str, verbose: bool = False):
        """
        Download one data file and work its new readings into the database.

        The file is only downloaded when ``is_data_expected`` says so. It is
        fetched into a temporary directory, extracted and parsed, and the
        readings are inserted together with the "recent" marker in one commit.

        :param ftp: open FTP connection with the right working directory
        :param fnam: name of the file to download
        :param verbose: progress info on the console -- DO NOT USE IN PRODUCTION
        """
        self._verbose = verbose
        self.did_download = False
        logging.info(f'DataFile(_,"{fnam}")')

        # The station number is the third "_"-separated part of the base name;
        # happily this works for historical and recent file names alike.
        station_nr = int(fnam.split(".")[0].split("_")[2])
        self.station = Station(station_nr)
        logging.info(
            f"Station {self.station.description} (Daten bis {self.station.dwdts_recent} bereits vorhanden)"
        )
        if not is_data_expected(fnam, self.station):
            logging.info(
                f"File {fnam} wird nicht heruntergeladen, da keine neuen Daten zu erwarten sind."
            )
            return

        with johanna.Timer() as t:
            with TemporaryDirectory() as temp_dir:
                temp_dir = Path(temp_dir)
                logging.info(f"Temporäres Verzeichnis: {temp_dir}")
                # honour the caller's verbose flag instead of forcing console output
                zipfile_path = ftplight.ftp_retrbinary(ftp,
                                                       from_fnam=fnam,
                                                       to_path=temp_dir / fnam,
                                                       verbose=self._verbose)
                if not zipfile_path:
                    johanna.flag_as_error()
                    logging.error(
                        f"Kann die Daten der Station {self.station.description} nicht herunterladen."
                    )
                    return
                self.did_download = True
                produkt_path = self._extract(zipfile_path, temp_dir)
                readings = self._parse(produkt_path)
                if readings:
                    # TODO secure the connection with a retry
                    with johanna.Connection("insert readings") as c:
                        self._insert_readings(readings, c)
                        last_date = self._update_recent(readings, c)
                        c.commit()  # one commit for both statements makes sense
                    logging.info(
                        f"Werte für Station {self.station.description} bis {last_date} verarbeitet {t.read()}"
                    )
                else:
                    logging.info(
                        f"Keine Werte für Station {self.station.description} nach {self.station.dwdts_recent} gefunden {t.read()}"
                    )
            # the context manager above should have removed the directory
            if temp_dir.exists():
                johanna.flag_as_error()
                logging.error(
                    f"Temporäres Verzeichnis {temp_dir} wurde NICHT entfernt")
Пример #5
0
 def _upsert(self):
     """
     Write ``self.rows`` to the table ``stationen`` and commit.

     Uses ``INSERT OR REPLACE`` with eight values per row and afterwards logs
     ``self.cnt`` as the number of stations written.
     """
     # https://database.guide/how-on-conflict-works-in-sqlite/
     statement = """
                 INSERT OR REPLACE INTO stationen
                 VALUES (?,?,?,?,?,?,?,?)
             """
     with johanna.Timer() as timer:
         with johanna.Connection("insert stationen") as conn:
             conn.cur.executemany(statement, self.rows)
             conn.commit()
     message = f"{self.cnt} Stationen in die Datenbank geschrieben {timer.read()}"
     logging.info(message)
Пример #6
0
 def _insert_readings(self, readings: list, c: johanna.Connection) -> None:
     """
     Insert the readings into the table ``readings`` using ``INSERT OR IGNORE``.

     No commit happens here; committing is left to the caller.

     :param readings: list of tuples with nine values each
     :param c: open database connection whose cursor ``c.cur`` is used
     """
     statement = """
             INSERT OR IGNORE INTO readings
             VALUES (?, ?,?,?,?,?, ?, ?,?)
         """
     row_count = len(readings)
     with johanna.Timer() as timer:
         c.cur.executemany(statement, readings)
     message = f"{row_count} Zeilen in die Datenbank eingearbeitet {timer.read()}"
     logging.info(message)
     johanna.collect_stat("db_readings_inserted", row_count)
Пример #7
0
def _download(ds: str) -> List[str]:
    """
    Download a station list from DWD.

    The FTP connection is released in every case, also when the retrieval
    fails. If the polite ``quit()`` fails, the connection is closed
    unilaterally.

    :param ds: one of the shorthands defined in DATASOURCES
    :return: lines from the datasource
    """
    assert ds in DATASOURCES, f"no such shorthand: {ds}"
    with johanna.Timer() as t:
        ftp = ftplight.dwd(DATASOURCES[ds]["path"])
        try:
            lines = ftplight.ftp_retrlines(ftp, from_fnam=DATASOURCES[ds]["fnam"], verbose=True)
        finally:
            try:
                ftp.quit()
            except Exception as exc:
                # quit() does not close the socket when the QUIT command fails
                logging.warning(f"FTP quit failed ({exc!r}), closing connection")
                ftp.close()
            logging.info(f"Closed FTP connection to DWD  {t.read()}")
    return lines
Пример #8
0
def _upsert(rows: List[tuple]) -> None:
    """
    Run the script ``SQL_CREATE_STATIONS``, then upsert the rows into ``stations``.

    Two separate connections are used: one for the script, one for the
    ``INSERT OR REPLACE`` of all rows followed by a commit.

    :param rows: list of tuples with fourteen values each
    """
    # https://database.guide/how-on-conflict-works-in-sqlite/
    statement = """
                INSERT OR REPLACE INTO stations
                VALUES (?,?,?,?,?,?,?,?, ?,?,?,?,?,?)
            """
    with johanna.Timer() as timer:
        # database supplied by johanna
        with johanna.Connection(text="create? table stations") as ddl:
            ddl.cur.executescript(SQL_CREATE_STATIONS)
        with johanna.Connection("insert stations") as dml:
            dml.cur.executemany(statement, rows)
            dml.commit()
    message = f"Upserted {len(rows)} stations to the database  {timer.read()}"
    logging.info(message)
Пример #9
0
 def download() -> Path:
     """
     Fetch ``from_fnam`` via FTP ``RETR`` into the file ``to_path``.

     ``ftp``, ``from_fnam``, ``to_path``, ``verbose`` and the callback
     ``collect`` are names defined outside this function. The counters
     ``collect.cnt`` and ``collect.volume`` are reset here, and the open target
     file is handed to the callback as ``collect.open_file``.

     :return: ``to_path``
     """
     collect.cnt = collect.volume = 0
     with johanna.Timer() as timer:
         with open(to_path, 'wb') as collect.open_file:
             reply = ftp.retrbinary("RETR " + from_fnam, collect)
         if verbose:
             # awkward: emits a bare newline on the console
             print()
     logging.info(reply)
     summary = f"Downloaded {collect.volume:,} bytes in {collect.cnt} blocks {timer.read()}"
     logging.info(summary)
     johanna.collect_stat("ftp_download_bytes_cnt", collect.volume)
     johanna.collect_stat("ftp_download_time_sec", timer.read(raw=True))
     johanna.collect_stat("ftp_download_file_cnt", 1)
     return to_path
Пример #10
0
def _parse(lines: List[str]) -> List[tuple]:
    """
    Parse the station list into tuples for the database.

    Header lines (starting with ``Stations_id`` or ``-----------``) and blank
    lines are skipped. Every other line is split on whitespace: the first six
    parts are fixed columns, the last part is the state, and everything in
    between is the station name.

    :param lines: lines of the station list
    :return: list of tuples suitable for insert into table like described in SQL_CREATE_STATIONS
    """
    with johanna.Timer() as t:
        rows = []
        for line in lines:
            # Format is the same for all files so far...
            if line.startswith(("Stations_id", "-----------")):
                continue
            parts = line.split()
            if not parts:  # a blank line carries no station
                continue
            # Sample line:
            # ....,....1....,....2....,....3....,....4....,....5....,....6....,....7....,....8....,....9....,....0....,....1....,....2....,....3
            # 04692 20080301 20181130            229     50.8534    7.9966 Siegen (Kläranlage)                      Nordrhein-Westfalen
            station = int(parts[0])
            name = " ".join(parts[6:-1])
            land_short = toolbox.dwdland2short(parts[-1])
            description = f"{station}: {name} [{land_short}]"  # 5717: Wuppertal-Buchenhofen [NRW]
            isodate_from = toolbox.dwdts2iso(parts[1])
            isodate_to = toolbox.dwdts2iso(parts[2])
            rows.append((
                # --- at2h - stationen
                station,            # station integer,
                isodate_from,       # yymmdd_von text,
                isodate_to,         # yymmdd_bis text,
                int(parts[3]),      # hoehe integer,
                float(parts[4]),    # breite real,
                float(parts[5]),    # laenge real,
                name,               # name text,
                parts[-1],          # (bundes)land text
                # --- new fields
                parts[1],           # dwddate_from TEXT,
                parts[2],           # dwddate_to TEXT,
                isodate_from,       # isodate_from TEXT,
                isodate_to,         # isodate_to TEXT,
                description,        # description TEXT,
                land_short,         # land_short TEXT
            ))
        logging.info(f"Found {len(rows)} stations  {t.read()}")
        return rows
Пример #11
0
 def _update_recent(self, readings: list, c: johanna.Connection) -> str:
     """
     Note the time stamp of the last reading in the table ``recent``.

     The station is taken from the first tuple (assuming it is the same in all
     tuples) and the time stamp from the last one. No commit happens here;
     committing is left to the caller.

     :param readings: non-empty list of tuples ``(station, time stamp, ...)``
     :param c: open database connection whose cursor ``c.cur`` is used
     :return: the time stamp that was written
     """
     # alternative for finding the maximum: https://stackoverflow.com/a/4800441/3991164
     station, yyyymmddhh = readings[0][0], readings[-1][1]
     # cf. https://stackoverflow.com/a/4330694/3991164
     statement = """
             INSERT OR REPLACE
             INTO recent (station, yyyymmddhh)
             VALUES (?, ?)
         """
     with johanna.Timer() as timer:
         c.cur.execute(statement, (station, yyyymmddhh))
     message = f"Neuester Messwert {yyyymmddhh} in der Datenbank vermerkt {timer.read()}"
     logging.info(message)
     return yyyymmddhh
Пример #12
0
 def __init__(self, station: Union[int, str]):
     """
     Load one station from the table ``stations``.

     When a row is found, ``name``, ``land_short``, ``isodate_from``,
     ``isodate_to`` and ``description`` are taken from it and ``populated`` is
     True. Otherwise all five are None and ``populated`` is False.

     :param station: station key; a string is converted with ``int()``
     """
     if isinstance(station, str):
         station = int(station)
     self.station = station
     query = (
         "select \n"
         "             name, land_short, isodate_from, isodate_to, description\n"
         "         from stations\n"
         "         where station = ?"
     )
     with johanna.Timer() as timer:
         with johanna.Connection(f"Station.__init__({station})") as conn:
             conn.cur.execute(query, (station,))
             found = conn.cur.fetchone()
             values = found if found else (None,) * 5
             (self.name, self.land_short, self.isodate_from,
              self.isodate_to, self.description) = values
             self.populated = True if found else False
     logging.info(f"got {self.description}: {self.isodate_from}..{self.isodate_to}  {timer.read()}")
Пример #13
0
 def _download(self) -> None:
     """
     Download the station list from DWD and parse it.

     Sets ``self.lines`` (raw lines), ``self.rows`` (tuples for the table
     stationen) and ``self.cnt`` (number of tuples). Header lines and blank
     lines are skipped. The FTP connection is released in every case, also
     when retrieval or parsing fails.
     """
     with johanna.Timer() as t:
         ftp = ftplight.dwd(
             "climate_environment/CDC/observations_germany/climate/hourly/air_temperature/historical"
         )
         try:
             fnam = "TU_Stundenwerte_Beschreibung_Stationen.txt"
             self.lines = ftplight.ftp_retrlines(ftp,
                                                 from_fnam=fnam,
                                                 verbose=True)
             self.rows = []
             self.cnt = 0
             for line in self.lines:
                 if line.startswith(("Stations_id", "-----------")):
                     continue
                 parts = line.split()
                 if not parts:  # a blank line carries no station
                     continue
                 # Format:
                 #          1         2         3         4         5         6         7         8         9        10
                 # ....,....|....,....|....,....|....,....|....,....|....,....|....,....|....,....|....,....|....,....|....,....|....,....|....,....|
                 # 04692 20080301 20181130            229     50.8534    7.9966 Siegen (Kläranlage)                      Nordrhein-Westfalen
                 self.rows.append((
                     # table stationen
                     int(parts[0]),  # station integer,
                     iso_date(parts[1]),  # yymmdd_von text,
                     iso_date(parts[2]),  # yymmdd_bis text,
                     int(parts[3]),  # hoehe integer,
                     float(parts[4]),  # breite real,
                     float(parts[5]),  # laenge real,
                     " ".join(parts[6:-1]),  # name text,
                     parts[-1],  # (bundes)land text
                 ))
                 self.cnt += 1
             logging.info(
                 f"{self.cnt} Stationen gelesen und geparst {t.read()}")
         finally:
             try:
                 ftp.quit()
             except Exception as exc:
                 # quit() does not close the socket when the QUIT command fails
                 logging.warning(f"FTP quit failed ({exc!r}), closing connection")
                 ftp.close()
     logging.info(f"Verbindung zum DWD geschlossen {t.read()}")
Пример #14
0
def plot(plt,
         station: int = const.MANNHEIM,
         monat: int = 6,
         stunde: int = 12,
         von: int = 0,
         bis: int = 3000) -> None:
    """
    Plot the yearly mean temperature of one month and hour with a linear trend.

    The means are selected from the table ``readings``. Years without any
    temperature value (mean is NULL) are left out. When nothing remains, a
    warning is logged and nothing is plotted.

    :param plt: supplied by the jupyter notebook:
            from matplotlib import pyplot as plt
            %matplotlib inline # <- that is why!
    :param station: numeric station key
    :param monat: 1 = January,...
    :param stunde: 0..23
    :param von: first year (4 digits)
    :param bis: last year (4 digits)
    """
    with johanna.Timer() as overall:
        name = _get_station_name(station)

        with johanna.Timer() as timer:
            with johanna.Connection("select readings") as c:
                c.cur.execute(
                    '''
                    select year, avg(temp) val
                        from readings
                        where station = ?
                          and month = ?
                          and hour = ?
                          and year between ? and ?
                        group by year
                        order by year asc
                ''', (station, monat, stunde, von, bis))
                # avg() yields NULL for a year that has only missing temperatures
                rows = [row for row in c.cur if row[1] is not None]
            if rows:
                x_db, y_db = _transpose(rows)
        logging.info(f"Select: {timer.read()}")

        if not rows:
            # without data neither the regression nor the plot can be done
            logging.warning(
                f"No readings for station {station}, month {monat}, hour {stunde}, "
                f"years {von}..{bis} -- nothing to plot")
            return

        # https://realpython.com/linear-regression-in-python/#simple-linear-regression-with-scikit-learn
        x = np.array(x_db).reshape((-1, 1))
        y = np.array(y_db)

        with johanna.Timer() as timer:
            model = LinearRegression().fit(x, y)
            logging.info(f"dT p.a.: {model.coef_}")
            # only the two ends are needed to draw the trend line
            x_pred = np.array([x_db[0], x_db[-1]]).reshape((-1, 1))
            y_pred = model.predict(x_pred)
        logging.info(f"LinearRegression: {timer.read()}")

        # https://towardsdatascience.com/linear-regression-using-python-b136c91bf0a2
        plt.rc('figure', figsize=(20.0, 10.0))
        plt.scatter(x, y, s=10, color='green', label="Einzelwerte")
        plt.xlabel('Jahr')
        plt.ylabel('Mitteltemperatur %d Uhr (UTC), %s, %s' %
                   (stunde, const.monat_as_string(monat), name))
        plt.plot(x_pred, y_pred, color='red', label="Trend")
        # https://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.legend
        plt.legend(loc=4)
        plt.show()

    logging.info(f"Overall: {overall.read()}")