Example #1
def _download_dwd_data(download_specification: Tuple[Union[str, Path],
                                                     Union[str, Path]]):
    """
    This function downloads the station data for which the link is provided
    by the 'select_dwd' function. It checks the shortened filepath (just the
    zip file) for its parameters, creates the full filepath and downloads the
    file(s) into the specified folder.

    Args:
        download_specification: contains the path to the file that should be
            downloaded and the path to the folder where the files are stored

    Returns:
        None. The data is stored on the local file system.

    """
    remote_file, folder = download_specification

    create_folder(subfolder=SUB_FOLDER_STATIONDATA, folder=folder)

    file_server = create_remote_file_name(remote_file)
    file_local = create_local_file_name(remote_file, folder)

    try:
        # Open connection with ftp server
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            ftp_file_download(ftp, Path(file_server), Path(file_local))

    except Exception as e:
        # Raise an error naming the file that couldn't be downloaded.
        raise NameError(
            f"The file\n {file_local} \n couldn't be downloaded!") from e
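The leading underscore and the single-tuple signature suggest the function is meant to be mapped over many download specifications. A minimal usage sketch under that assumption; the remote file names and the target folder below are placeholders:

from multiprocessing import Pool

# Placeholder download specifications: (remote zip file, local target folder).
download_specs = [
    ("daily/kl/historical/tageswerte_KL_00001_hist.zip", "dwd_data"),
    ("daily/kl/historical/tageswerte_KL_00044_hist.zip", "dwd_data"),
]

# Map the helper over all specifications; each call opens its own FTP session.
with Pool() as pool:
    pool.map(_download_dwd_data, download_specs)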
Example #2
def create_metaindex(parameter: Parameter, time_resolution: TimeResolution,
                     period_type: PeriodType) -> pd.DataFrame:
    """ The function is used to create a simple metadata DataFrame parsed from the text files that are located in each
    data section of the station data directory of the weather service.

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
    Returns:
        DataFrame with parsed columns of the corresponding text file. Columns
        are translated into English and data is not yet complete as file
        existence is not checked.

    """
    server_path = PurePosixPath(DWD_PATH, time_resolution.value,
                                parameter.value, period_type.value)

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(remote_path=str(server_path),
                                          also_subfolders=False)

    except ftplib.all_errors as e:
        # ftplib.all_errors is a tuple of exception classes and cannot be
        # raised itself, so raise the ftplib base Error instead.
        raise ftplib.Error(
            "Error: couldn't retrieve filelist from server.\n"
            f"{str(e)}") from e

    metafile_server = [
        file for file in files_server
        if find_all_matchstrings_in_string(file.lower(), METADATA_MATCHSTRINGS)
    ].pop(0)

    # Remove the DWD_PATH prefix explicitly; str.lstrip strips a character
    # set rather than the prefix.
    metafile_server = create_remote_file_name(
        metafile_server.replace(DWD_PATH, '', 1).lstrip('/'))

    try:
        with urllib.request.urlopen(metafile_server) as request:
            file = BytesIO(request.read())

    except urllib.error.URLError as e:
        raise urllib.error.URLError("Error: reading metadata file failed.\n"
                                    f"{str(e)}")

    metaindex = pd.read_fwf(filepath_or_buffer=file,
                            colspecs=METADATA_FIXED_COLUMN_WIDTH,
                            skiprows=[1],
                            dtype=str,
                            encoding="ISO-8859-1")

    # Fix column names, as header is not aligned to fixed column widths
    metaindex.columns = "".join([
        column for column in metaindex.columns
        if "unnamed" not in column.lower()
    ]).split(" ")

    metaindex = metaindex.rename(columns=str.upper).rename(
        columns=GERMAN_TO_ENGLISH_COLUMNS_MAPPING)

    return metaindex.astype(METADATA_DTYPE_MAPPING)
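A minimal usage sketch, assuming the enums expose members such as Parameter.CLIMATE_SUMMARY, TimeResolution.DAILY and PeriodType.HISTORICAL (member names inferred from context, not confirmed by the code above):

# Build the station metadata index for daily historical climate data
# (enum member names are assumptions).
metaindex = create_metaindex(parameter=Parameter.CLIMATE_SUMMARY,
                             time_resolution=TimeResolution.DAILY,
                             period_type=PeriodType.HISTORICAL)

# Columns are already translated to English; inspect the parsed result.
print(metaindex.head())
print(metaindex.dtypes)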
Example #3
def create_file_index_for_dwd_server(parameter: Parameter,
                                     time_resolution: TimeResolution,
                                     period_type: PeriodType) -> pd.DataFrame:
    """
    Function (cached) to create a file index of the DWD station data. The file index
    is created for an individual set of parameters.
    Args:
        parameter: parameter of Parameter enumeration
        time_resolution: time resolution of TimeResolution enumeration
        period_type: period type of PeriodType enumeration
    Returns:
        file index in a pandas.DataFrame with sets of parameters and station id
    """
    server_path = PurePosixPath(DWD_PATH) / time_resolution.value / \
        parameter.value / period_type.value

    # todo: replace with global requests.Session creating the index
    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(remote_path=str(server_path),
                                          also_subfolders=True)

    except ftplib.all_errors as e:
        # The caught exception instance is not callable; chain a new error
        # with the intended message instead.
        raise ftplib.Error(
            "Creating file index currently not possible.") from e

    files_server = pd.DataFrame(files_server,
                                columns=[DWDMetaColumns.FILENAME.value],
                                dtype='str')

    # Filter for .zip files
    files_server = files_server[files_server.FILENAME.str.endswith(
        ARCHIVE_FORMAT)]

    files_server.loc[:, DWDMetaColumns.FILENAME.value] = files_server.loc[:, DWDMetaColumns.FILENAME.value].\
        str.replace(DWD_PATH + '/', '')

    file_names = files_server.loc[:, DWDMetaColumns.FILENAME.value].str.split(
        "/").apply(lambda strings: strings[-1])

    files_server.loc[:, DWDMetaColumns.STATION_ID.value] = file_names.apply(
        lambda x: re.findall(STATID_REGEX, x).pop(0))

    files_server.loc[:, DWDMetaColumns.STATION_ID.value] = files_server.loc[:, DWDMetaColumns.STATION_ID.value].\
        astype(int)

    files_server = files_server.sort_values(
        by=[DWDMetaColumns.STATION_ID.value, DWDMetaColumns.FILENAME.value])

    return files_server.loc[:, [
        DWDMetaColumns.STATION_ID.value, DWDMetaColumns.FILENAME.value
    ]]
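A short sketch selecting the archives of a single station from the returned file index. The Parameter and PeriodType member names are assumptions; TimeResolution.MINUTE_1 appears in Example #4 below, and the station id is a placeholder:

# Build the file index (Parameter/PeriodType member names are assumptions).
file_index = create_file_index_for_dwd_server(
    parameter=Parameter.PRECIPITATION,
    time_resolution=TimeResolution.MINUTE_1,
    period_type=PeriodType.HISTORICAL)

# Select all archive filenames that belong to one station id (placeholder).
station_files = file_index.loc[
    file_index[DWDMetaColumns.STATION_ID.value] == 44,
    DWDMetaColumns.FILENAME.value]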
Example #4
def metaindex_for_1minute_data(
        parameter: Parameter, time_resolution: TimeResolution) -> pd.DataFrame:
    """
    A helper function to create a raw metadata index for the stations of the
    given set of parameters. This raw metadata is then used by other
    functions. This second/alternative function must be used for high
    resolution data, where the metadata is not available as a single file
    but is instead stored in separate files per station.
    - especially for precipitation/1_minute/historical!

    """

    assert time_resolution == TimeResolution.MINUTE_1, \
        "Wrong TimeResolution, only 1 minute is valid "

    metadata_path = PurePosixPath(DWD_PATH, time_resolution.value,
                                  parameter.value, FTP_METADATA_NAME)

    with FTP(DWD_SERVER) as ftp:
        ftp.login()

        metadata_filepaths = ftp.list_files(remote_path=str(metadata_path),
                                            also_subfolders=False)

    # Remove the DWD_PATH prefix explicitly; str.lstrip strips a character
    # set rather than the prefix.
    metadata_filepaths = [
        create_remote_file_name(file.replace(DWD_PATH, '', 1).lstrip('/'))
        for file in metadata_filepaths
    ]

    statids = [
        re.findall(STATID_REGEX, file).pop(0) for file in metadata_filepaths
    ]

    metaindex_df = pd.DataFrame(None, columns=METADATA_COLUMNS)

    # Reuse a single worker pool for both mapping steps and close it cleanly.
    with Pool() as pool:
        metadata_files = pool.map(download_metadata_file_for_1minute_data,
                                  metadata_filepaths)

        metadata_dfs = pool.map(combine_geo_and_par_file_to_metadata_df,
                                zip(metadata_files, statids))

    metaindex_df = metaindex_df.append(other=metadata_dfs, ignore_index=True)

    metaindex_df = metaindex_df.astype(METADATA_DTYPE_MAPPING)

    return metaindex_df.sort_values(
        DWDColumns.STATION_ID.value).reset_index(drop=True)
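A minimal call sketch; the assert above restricts the time resolution to one minute, and the Parameter member name is an assumption:

# 1-minute metadata lives in per-station archives, so only
# TimeResolution.MINUTE_1 is accepted (Parameter member name assumed).
metaindex_1min = metaindex_for_1minute_data(
    parameter=Parameter.PRECIPITATION,
    time_resolution=TimeResolution.MINUTE_1)

print(metaindex_1min.head())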
Example #5
def create_fileindex(parameter: Parameter,
                     time_resolution: TimeResolution,
                     period_type: PeriodType,
                     folder: str = DWD_FOLDER_MAIN) -> None:
    """
    A function to receive the current files on the server as a list,
    excluding description files and containing only those files that hold
    measurement data.

    """
    # Check for folder and create if necessary
    create_folder(subfolder=DWD_FOLDER_METADATA, folder=folder)

    filelist_local_path = Path(
        folder, DWD_FOLDER_METADATA, f"{FILELIST_NAME}_{parameter.value}_"
        f"{time_resolution.value}_"
        f"{period_type.value}{DATA_FORMAT}")

    server_path = PurePosixPath(DWD_PATH, time_resolution.value,
                                parameter.value, period_type.value)

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(remote_path=str(server_path),
                                          also_subfolders=True)

    except ftplib.all_errors as e:
        # ftplib.all_errors is a tuple of exception classes and cannot be
        # raised itself, so raise the ftplib base Error instead.
        raise ftplib.Error(
            "Error: creating a filelist currently not possible.\n"
            f"{str(e)}") from e

    files_server = pd.DataFrame(files_server,
                                columns=[DWDColumns.FILENAME.value],
                                dtype='str')

    # Strip the leading DWD_PATH prefix from each filename (str.lstrip would
    # remove a character set rather than the prefix).
    files_server.loc[:, DWDColumns.FILENAME.value] = files_server.loc[
        :, DWDColumns.FILENAME.value].str.replace(DWD_PATH + '/', '')

    files_server = files_server[files_server.FILENAME.str.contains(
        ARCHIVE_FORMAT)]

    files_server.loc[:, DWDColumns.FILEID.value] = files_server.index

    file_names = files_server.iloc[:, 0].str.split("/").apply(
        lambda string: string[-1])

    files_server.loc[:, DWDColumns.STATION_ID.value] = file_names.apply(
        lambda x: re.findall(STATID_REGEX, x).pop(0))

    files_server = files_server.iloc[:, [1, 2, 0]]

    files_server.iloc[:, 1] = files_server.iloc[:, 1].astype(int)

    files_server = files_server.sort_values(by=[DWDColumns.STATION_ID.value])

    remove_old_file(file_type=FILELIST_NAME,
                    parameter=parameter,
                    time_resolution=time_resolution,
                    period_type=period_type,
                    file_postfix=DATA_FORMAT,
                    folder=folder,
                    subfolder=DWD_FOLDER_METADATA)

    files_server.to_csv(path_or_buf=filelist_local_path,
                        header=True,
                        index=False)
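The function stores the file index as CSV rather than returning it. A sketch of reading the result back, mirroring the path construction above; the enum member names are assumptions:

import pandas as pd
from pathlib import Path

create_fileindex(parameter=Parameter.CLIMATE_SUMMARY,
                 time_resolution=TimeResolution.DAILY,
                 period_type=PeriodType.HISTORICAL)

# Rebuild the local path exactly as create_fileindex does and load the CSV.
filelist_local_path = Path(
    DWD_FOLDER_MAIN, DWD_FOLDER_METADATA,
    f"{FILELIST_NAME}_{Parameter.CLIMATE_SUMMARY.value}_"
    f"{TimeResolution.DAILY.value}_"
    f"{PeriodType.HISTORICAL.value}{DATA_FORMAT}")
filelist = pd.read_csv(filepath_or_buffer=filelist_local_path)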
Example #6
def metaindex_for_1minute_data(parameter: Parameter,
                               time_resolution: TimeResolution, folder):
    """
    A helper function to create a raw metadata index for the stations of the
    given set of parameters. This raw metadata is then used by other
    functions. This second/alternative function must be used for high
    resolution data, where the metadata is not available as a single file
    but is instead stored in separate files per station.
    - especially for precipitation/1_minute/historical!

    """

    assert time_resolution == TimeResolution.MINUTE_1, \
        "Wrong TimeResolution, only 1 minute is valid "

    metadata_path = PurePosixPath(DWD_PATH, time_resolution.value,
                                  parameter.value, FTP_METADATA_NAME)

    metadata_path = str(metadata_path)

    with FTP(DWD_SERVER) as ftp:
        ftp.login()

        metadata_server = ftp.nlst(metadata_path)

    metadata_local = [
        str(Path(folder, SUB_FOLDER_METADATA,
                 metadata_file.split("/")[-1]))
        for metadata_file in metadata_server
    ]

    metadata_df = pd.DataFrame(None, columns=METADATA_1MIN_COLUMNS)

    for metafile_server, metafile_local in tqdm(zip(metadata_server,
                                                    metadata_local),
                                                total=len(metadata_server)):
        with FTP(DWD_SERVER) as ftp:
            ftp.login()

            ftp.download(metafile_server, metafile_local)

        with ZipFile(metafile_local) as zip_file:
            zip_file_files = zip_file.infolist()

            zip_file_files = [
                zip_file_file.filename for zip_file_file in zip_file_files
            ]

            file_geo = [
                zip_file_file for zip_file_file in zip_file_files if all([
                    matchstring in zip_file_file.lower()
                    for matchstring in METADATA_1MIN_GEO_MATCHSTRINGS
                ])
            ].pop(0)

            file_par = [
                zip_file_file for zip_file_file in zip_file_files if all([
                    matchstring in zip_file_file.lower()
                    for matchstring in METADATA_1MIN_PAR_MATCHSTRINGS
                ])
            ].pop(0)

            with zip_file.open(file_geo) as file_opened:
                try:
                    geo_file = parse_zipped_data_into_df(file_opened)
                except UnicodeDecodeError:
                    geo_file = parse_zipped_data_into_df(file_opened,
                                                         engine='python')

            with zip_file.open(file_par) as file_opened:
                try:
                    par_file = parse_zipped_data_into_df(file_opened)
                except UnicodeDecodeError:
                    par_file = parse_zipped_data_into_df(file_opened,
                                                         engine='python')

        Path(metafile_local).unlink()

        geo_file.columns = [
            GERMAN_TO_ENGLISH_COLUMNS_MAPPING.get(name.strip().upper(),
                                                  name.strip().upper())
            for name in geo_file.columns
        ]

        par_file.columns = [
            GERMAN_TO_ENGLISH_COLUMNS_MAPPING.get(name.strip().upper(),
                                                  name.strip().upper())
            for name in par_file.columns
        ]

        geo_file = geo_file.iloc[[-1], :]
        par_file = par_file.loc[:, [FROM_DATE_NAME, TO_DATE_NAME]].dropna()

        geo_file[FROM_DATE_NAME] = par_file[FROM_DATE_NAME].min()
        geo_file[TO_DATE_NAME] = par_file[TO_DATE_NAME].max()

        geo_file = geo_file.loc[:, METADATA_1MIN_COLUMNS]

        metadata_df = metadata_df.append(geo_file, ignore_index=True)

    columns = metadata_df.columns
    META_INDEX_DTYPES = {
        columns[0]: int,
        columns[1]: datetime64,
        columns[2]: datetime64,
        columns[3]: float,
        columns[4]: float,
        columns[5]: float,
        columns[6]: str
    }
    metadata_df = metadata_df.astype(META_INDEX_DTYPES)

    metadata_df = metadata_df.sort_values(STATION_ID_NAME).reset_index(
        drop=True)

    return metadata_df
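The dtype mapping near the end of the function is keyed by column position on metadata_df.columns, which silently breaks if the column order ever changes. A small alternative sketch of that block, keyed on METADATA_1MIN_COLUMNS directly (the column order and dtypes are taken from the code above; the explicit 'datetime64[ns]' unit is an assumption that avoids unit-less casts):

# Same dtypes in the same order, but keyed on the canonical column list
# instead of positional lookups into metadata_df.columns.
META_INDEX_DTYPES = dict(
    zip(METADATA_1MIN_COLUMNS,
        [int, "datetime64[ns]", "datetime64[ns]", float, float, float, str]))

metadata_df = metadata_df.astype(META_INDEX_DTYPES)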
Example #7
def create_metaindex(parameter: Parameter, time_resolution: TimeResolution,
                     period_type: PeriodType) -> pd.DataFrame:
    """
    Creates a simple metadata DataFrame parsed from the text files that are
    located in each data section of the station data directory of the
    weather service.

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files

    Returns:
        DataFrame with parsed columns of the corresponding text file. Columns
        are translated into English.

    """
    server_path = PurePosixPath(DWD_PATH, time_resolution.value,
                                parameter.value, period_type.value)

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(path=str(server_path))

    except Exception:
        raise NameError("Couldn't retrieve filelist from server")

    metafile_server = [
        file for file in files_server if all([
            matchstring in file.lower()
            for matchstring in METADATA_MATCHSTRINGS
        ])
    ].pop(0)

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            file = ftp.read_file_to_bytes(metafile_server)

    except Exception:
        raise NameError(
            "Reading metadata file currently is not possible. Try again!")

    metaindex = pd.read_fwf(filepath_or_buffer=file,
                            colspecs=METADATA_FIXED_COLUMN_WIDTH,
                            skiprows=[1],
                            dtype=str)

    metaindex_colnames = [
        colname for colname in metaindex.columns
        if "unnamed" not in colname.lower()
    ]
    metaindex_colnames_fixed = "".join(metaindex_colnames).split(" ")
    metaindex.columns = [
        GERMAN_TO_ENGLISH_COLUMNS_MAPPING.get(name.upper(), name.upper())
        for name in metaindex_colnames_fixed
    ]

    columns = metaindex.columns
    META_INDEX_DTYPES = {
        columns[0]: int,
        columns[1]: datetime64,
        columns[2]: datetime64,
        columns[3]: float,
        columns[4]: float,
        columns[5]: float,
        columns[6]: str,
        columns[7]: str
    }
    metaindex = metaindex.astype(META_INDEX_DTYPES)

    return metaindex
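One difference to Example #2 is that the metadata file is parsed here without an explicit encoding. A hedged sketch of the same read_fwf call inside the function with the Latin-1 encoding used there, since DWD station names contain umlauts:

# Same parse as above, but with the encoding that Example #2 passes
# explicitly; without it, umlauts may be decoded incorrectly.
metaindex = pd.read_fwf(filepath_or_buffer=file,
                        colspecs=METADATA_FIXED_COLUMN_WIDTH,
                        skiprows=[1],
                        dtype=str,
                        encoding="ISO-8859-1")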
Example #8
def create_fileindex(parameter: Parameter,
                     time_resolution: TimeResolution,
                     period_type: PeriodType,
                     folder: str = MAIN_FOLDER):
    """
    A function to receive the current files on the server as a list,
    excluding description files and containing only those files that hold
    measurement data.

    """
    # Check for folder and create if necessary
    create_folder(subfolder=SUB_FOLDER_METADATA, folder=folder)

    filelist_local_path = Path(
        folder, SUB_FOLDER_METADATA, f"{FILELIST_NAME}_{parameter.value}_"
        f"{time_resolution.value}_"
        f"{period_type.value}{DATA_FORMAT}")

    filelist_local_path = str(filelist_local_path).replace('\\', '/')

    server_path = Path(DWD_PATH, time_resolution.value, parameter.value,
                       period_type.value)

    server_path = f"{server_path}{os.sep}".replace('\\', '/')

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(path=server_path)

    except Exception:
        raise NameError(
            "Download of fileslist file currently not possible. Try again!")

    files_server = pd.DataFrame(files_server)

    files_server.columns = [FILENAME_NAME]

    files_server.loc[:, FILENAME_NAME] = files_server.loc[:, FILENAME_NAME] \
        .apply(str)

    # Strip the leading DWD_PATH prefix from each filename (str.lstrip would
    # remove a character set rather than the prefix).
    files_server.loc[:, FILENAME_NAME] = files_server.loc[:, FILENAME_NAME] \
        .str.replace(DWD_PATH + '/', '')

    files_server = files_server[files_server.FILENAME.str.contains(
        ARCHIVE_FORMAT)]

    files_server \
        .insert(loc=1,
                column=FILEID_NAME,
                value=files_server.index)

    files_server \
        .insert(loc=2,
                column=STATION_ID_NAME,
                value=files_server.iloc[:, 0].str.split('_')
                .apply(lambda string: string[STRING_STATID_COL.get(period_type, None)]))

    files_server = files_server.iloc[:, [1, 2, 0]]

    files_server.iloc[:, 1] = files_server.iloc[:, 1].apply(int)

    files_server = files_server.sort_values(by=[STATION_ID_NAME])

    remove_old_file(file_type=FILELIST_NAME,
                    parameter=parameter,
                    time_resolution=time_resolution,
                    period_type=period_type,
                    file_postfix=DATA_FORMAT,
                    folder=folder,
                    subfolder=SUB_FOLDER_METADATA)

    files_server.to_csv(path_or_buf=filelist_local_path,
                        header=True,
                        index=False)

    return None
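The newer variant in Example #5 builds the server path with PurePosixPath, which always renders forward slashes, so the manual backslash replacement above is not needed. A small sketch of that construction inside the function, using the same arguments:

from pathlib import PurePosixPath

# PurePosixPath keeps '/' separators on every platform, so no
# str.replace('\\', '/') is required (this mirrors Example #5).
server_path = PurePosixPath(DWD_PATH, time_resolution.value,
                            parameter.value, period_type.value)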