def metadata_for_dwd_data(parameter: Parameter,
                          time_resolution: TimeResolution,
                          period_type: PeriodType,
                          folder: str = MAIN_FOLDER,
                          write_file: bool = True,
                          create_new_filelist: bool = False):
    """
    Retrieve metadata for a set of parameters and create a corresponding csv.

    STATE information is added to metadata for cases where there's no such
    named column (e.g. STATE) in the dataframe. For this purpose we use daily
    precipitation data. That has two reasons:
    - daily precipitation data has a STATE information combined with a city
    - daily precipitation data is the most common data served by the DWD

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local file system folder where files should be stored
        write_file: writes the meta data file to the local file system
        create_new_filelist: if true: a new file_list for metadata will
            be created

    Returns:
        pandas.DataFrame with the metadata

    Raises:
        TypeError: if any argument has an unexpected type
    """
    # Explicit type checks instead of `assert`: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if not isinstance(parameter, Parameter):
        raise TypeError("Error: 'parameter' is not of type Parameter(Enum).")
    if not isinstance(time_resolution, TimeResolution):
        raise TypeError(
            "Error: 'time_resolution' is not of type TimeResolution(Enum).")
    if not isinstance(period_type, PeriodType):
        raise TypeError(
            "Error: 'period_type' is not of type PeriodType(Enum).")
    if not isinstance(folder, str):
        raise TypeError("Error: 'folder' is not a string.")
    if not isinstance(write_file, bool):
        raise TypeError("Error: 'write_file' is not a bool.")
    if not isinstance(create_new_filelist, bool):
        raise TypeError("Error: 'create_new_filelist' is not a bool.")

    check_parameters(parameter=parameter,
                     time_resolution=time_resolution,
                     period_type=period_type)

    file_path = create_metainfo_fpath(folder,
                                      parameter,
                                      period_type,
                                      time_resolution)

    # Reuse a previously stored metadata file unless a refresh was requested.
    if check_file_exist(file_path) and not create_new_filelist:
        metainfo = pd.read_csv(filepath_or_buffer=file_path)
        return metainfo

    if time_resolution != TimeResolution.MINUTE_1:
        metainfo = create_metaindex(parameter=parameter,
                                    time_resolution=time_resolution,
                                    period_type=period_type)
    else:
        # 1-minute data needs a dedicated path to assemble its meta index.
        metainfo = metaindex_for_1minute_data(parameter=parameter,
                                              time_resolution=time_resolution,
                                              folder=folder)

    if STATE_NAME not in metainfo.columns:
        # Fall back to daily historical precipitation metadata, which carries
        # the STATE column, and join it in by station name.
        mdp = metadata_for_dwd_data(Parameter.PRECIPITATION_MORE,
                                    TimeResolution.DAILY,
                                    PeriodType.HISTORICAL,
                                    folder=folder,
                                    write_file=False,
                                    create_new_filelist=False)

        metainfo = metainfo.merge(
            mdp.loc[:, [STATIONNAME_NAME, STATE_NAME]],
            on=STATIONNAME_NAME).reset_index(drop=True)

    metainfo = add_filepresence(metainfo=metainfo,
                                parameter=parameter,
                                time_resolution=time_resolution,
                                period_type=period_type,
                                folder=folder,
                                create_new_filelist=create_new_filelist)

    if write_file and not check_file_exist(file_path) and not \
            create_new_filelist:
        # Remove a stale metadata file before writing the fresh one.
        remove_old_file(file_type=METADATA_NAME,
                        file_postfix=DATA_FORMAT,
                        parameter=parameter,
                        time_resolution=time_resolution,
                        period_type=period_type,
                        folder=folder,
                        subfolder=SUB_FOLDER_METADATA)

        metainfo.to_csv(path_or_buf=file_path, header=True, index=False)

    return metainfo
def create_fileindex(parameter: Parameter,
                     time_resolution: TimeResolution,
                     period_type: PeriodType,
                     folder: str = DWD_FOLDER_MAIN) -> None:
    """
    Receive current files on server as list excluding description files and
    only containing those files that have measuring data.

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local file system folder where the file list is stored

    Raises:
        ftplib.Error: if listing the files on the FTP server fails
    """
    # Check for folder and create if necessary
    create_folder(subfolder=DWD_FOLDER_METADATA, folder=folder)

    filelist_local_path = Path(folder,
                               DWD_FOLDER_METADATA,
                               f"{FILELIST_NAME}_{parameter.value}_"
                               f"{time_resolution.value}_"
                               f"{period_type.value}{DATA_FORMAT}")

    server_path = PurePosixPath(DWD_PATH,
                                time_resolution.value,
                                parameter.value,
                                period_type.value)

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(remote_path=str(server_path),
                                          also_subfolders=True)
    except ftplib.all_errors as e:
        # BUGFIX: `ftplib.all_errors` is a *tuple* of exception classes;
        # calling it raised a TypeError instead of the intended error.
        # Raise a concrete ftplib.Error (member of that tuple) and chain
        # the original cause for debuggability.
        raise ftplib.Error(
            "Error: creating a filelist currently not possible.\n"
            f"{str(e)}") from e

    files_server = pd.DataFrame(files_server,
                                columns=[DWDColumns.FILENAME.value],
                                dtype='str')

    # NOTE(review): str.lstrip strips a leading *character set*, not a
    # prefix, so this may over-strip if a filename starts with characters
    # from DWD_PATH — confirm before changing (str.removeprefix would be
    # the exact-prefix alternative).
    files_server.loc[:, DWDColumns.FILENAME.value] = files_server.loc[
        :, DWDColumns.FILENAME.value].apply(
            lambda filename: filename.lstrip(DWD_PATH + '/'))

    # Keep only archive files (i.e. those that contain measuring data).
    files_server = files_server[files_server.FILENAME.str.contains(
        ARCHIVE_FORMAT)]

    files_server.loc[:, DWDColumns.FILEID.value] = files_server.index

    # Station id is extracted from the basename of each remote file path.
    file_names = files_server.iloc[:, 0].str.split("/").apply(
        lambda string: string[-1])

    files_server.loc[:, DWDColumns.STATION_ID.value] = file_names.apply(
        lambda x: re.findall(STATID_REGEX, x).pop(0))

    # Reorder to FILEID, STATION_ID, FILENAME and sort by station id.
    files_server = files_server.iloc[:, [1, 2, 0]]

    files_server.iloc[:, 1] = files_server.iloc[:, 1].astype(int)

    files_server = files_server.sort_values(by=[DWDColumns.STATION_ID.value])

    # Remove a stale file list before writing the fresh one.
    remove_old_file(file_type=FILELIST_NAME,
                    parameter=parameter,
                    time_resolution=time_resolution,
                    period_type=period_type,
                    file_postfix=DATA_FORMAT,
                    folder=folder,
                    subfolder=DWD_FOLDER_METADATA)

    files_server.to_csv(path_or_buf=filelist_local_path,
                        header=True,
                        index=False)
def metadata_for_dwd_data(parameter: Parameter,
                          time_resolution: TimeResolution,
                          period_type: PeriodType,
                          folder: str = DWD_FOLDER_MAIN,
                          write_file: bool = True,
                          create_new_filelist: bool = False) -> pd.DataFrame:
    """
    Retrieve metadata for a set of parameters and create a corresponding csv.

    STATE information is added to metadata for cases where there's no such
    named column (e.g. STATE) in the dataframe. For this purpose we use daily
    precipitation data. That has two reasons:
    - daily precipitation data has a STATE information combined with a city
    - daily precipitation data is the most common data served by the DWD

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local file system folder where files should be stored
        write_file: writes the meta data file to the local file system
        create_new_filelist: if true: a new file_list for metadata will
            be created

    Returns:
        pandas.DataFrame with the metadata

    Raises:
        TypeError: if any argument has an unexpected type
    """
    if not isinstance(parameter, Parameter):
        raise TypeError("Error: 'parameter' is not of type Parameter(Enum).")
    if not isinstance(time_resolution, TimeResolution):
        raise TypeError(
            "Error: 'time_resolution' is not of type TimeResolution(Enum).")
    if not isinstance(period_type, PeriodType):
        raise TypeError(
            "Error: 'period_type' is not of type PeriodType(Enum).")
    if not isinstance(folder, str):
        raise TypeError("Error: 'folder' is not a string.")
    if not isinstance(write_file, bool):
        raise TypeError("Error: 'write_file' is not a bool.")
    if not isinstance(create_new_filelist, bool):
        raise TypeError("Error: 'create_new_filelist' is not a bool.")

    check_parameters(parameter=parameter,
                     time_resolution=time_resolution,
                     period_type=period_type)

    file_path = create_metainfo_fpath(folder,
                                      parameter,
                                      period_type,
                                      time_resolution)

    # Reuse a previously stored metadata file unless a refresh was requested.
    if check_file_exist(file_path) and not create_new_filelist:
        metainfo = pd.read_csv(filepath_or_buffer=file_path)
        return metainfo

    if time_resolution == TimeResolution.MINUTE_1:
        # 1-minute data needs a dedicated path to assemble its meta index.
        metainfo = metaindex_for_1minute_data(parameter=parameter,
                                              time_resolution=time_resolution)
    else:
        metainfo = create_metaindex(parameter=parameter,
                                    time_resolution=time_resolution,
                                    period_type=period_type)

    if all(pd.isnull(metainfo[DWDColumns.STATE.value])):
        # @todo avoid calling function in function -> we have to build a
        # function around to manage missing data
        mdp = metadata_for_dwd_data(Parameter.PRECIPITATION_MORE,
                                    TimeResolution.DAILY,
                                    PeriodType.HISTORICAL,
                                    folder=folder,
                                    write_file=False,
                                    create_new_filelist=False)

        # BUGFIX: the left merge key must be selected via the enum *value*
        # (a string column label); indexing with the enum member itself
        # raised KeyError. A one-column frame is used so pd.merge gets a
        # DataFrame on both sides.
        stateinfo = pd.merge(
            metainfo[[DWDColumns.STATION_ID.value]],
            mdp.loc[:, [DWDColumns.STATION_ID.value, DWDColumns.STATE.value]],
            how="left")

        # how="left" preserves row order, so plain column assignment aligns.
        metainfo[DWDColumns.STATE.value] = stateinfo[DWDColumns.STATE.value]

    metainfo = add_filepresence(metainfo=metainfo,
                                parameter=parameter,
                                time_resolution=time_resolution,
                                period_type=period_type,
                                folder=folder,
                                create_new_filelist=create_new_filelist)

    if write_file and not check_file_exist(file_path) and not \
            create_new_filelist:
        # Remove a stale metadata file before writing the fresh one.
        remove_old_file(file_type=METADATA_NAME,
                        file_postfix=DATA_FORMAT,
                        parameter=parameter,
                        time_resolution=time_resolution,
                        period_type=period_type,
                        folder=folder,
                        subfolder=DWD_FOLDER_METADATA)

        metainfo.to_csv(path_or_buf=file_path, header=True, index=False)

    return metainfo
def metadata_for_dwd_data(parameter: Union[Parameter, str],
                          time_resolution: Union[TimeResolution, str],
                          period_type: Union[PeriodType, str],
                          folder: str = DWD_FOLDER_MAIN,
                          write_file: bool = True,
                          create_new_file_index: bool = False) -> pd.DataFrame:
    """
    Retrieve metadata for a set of parameters and create a corresponding csv.

    STATE information is added to metadata for cases where there's no such
    named column (e.g. STATE) in the pandas.DataFrame. For this purpose we
    use daily precipitation data. That has two reasons:
    - daily precipitation data has a STATE information combined with a city
    - daily precipitation data is the most common data served by the DWD

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local file system folder where files should be stored
        write_file: writes the meta data file to the local file system
        create_new_file_index: if true: a new file_list for metadata will
            be created

    Returns:
        pandas.DataFrame with metadata for selected parameters

    Raises:
        ValueError: if a string argument is not a valid enum value
    """
    if create_new_file_index:
        reset_file_index_cache()

    # Coerce string arguments into their enum counterparts; raises
    # ValueError on unknown values.
    parameter = Parameter(parameter)
    time_resolution = TimeResolution(time_resolution)
    period_type = PeriodType(period_type)

    check_parameters(parameter=parameter,
                     time_resolution=time_resolution,
                     period_type=period_type)

    file_path = create_metainfo_fpath(folder,
                                      parameter,
                                      period_type,
                                      time_resolution)

    if time_resolution == TimeResolution.MINUTE_1:
        # 1-minute data needs a dedicated path to assemble its meta index.
        metainfo = metaindex_for_1minute_data(parameter=parameter,
                                              time_resolution=time_resolution)
    else:
        metainfo = create_metaindex(parameter=parameter,
                                    time_resolution=time_resolution,
                                    period_type=period_type)

    if all(pd.isnull(metainfo[DWDMetaColumns.STATE.value])):
        # @todo avoid calling function in function -> we have to build a
        # function around to manage missing data
        # NOTE(review): write_file defaults to True here, so this recursive
        # call may write a metadata file as a side effect — confirm intended.
        mdp = metadata_for_dwd_data(Parameter.PRECIPITATION_MORE,
                                    TimeResolution.DAILY,
                                    PeriodType.HISTORICAL,
                                    create_new_file_index=False)

        # BUGFIX: the left merge key must be selected via the enum *value*
        # (a string column label); indexing with the enum member itself
        # raised KeyError. A one-column frame is used so pd.merge gets a
        # DataFrame on both sides.
        stateinfo = pd.merge(
            metainfo[[DWDMetaColumns.STATION_ID.value]],
            mdp.loc[:, [DWDMetaColumns.STATION_ID.value,
                        DWDMetaColumns.STATE.value]],
            how="left")

        # how="left" preserves row order, so plain column assignment aligns.
        metainfo[DWDMetaColumns.STATE.value] = \
            stateinfo[DWDMetaColumns.STATE.value]

    metainfo = add_filepresence(metainfo=metainfo,
                                parameter=parameter,
                                time_resolution=time_resolution,
                                period_type=period_type)

    # NOTE(review): earlier revisions wrote when the file index was NOT
    # being recreated (`and not create_new_filelist`); here the condition
    # is `and create_new_file_index`, so with default arguments nothing is
    # ever written — confirm this inversion is intended.
    if write_file and not file_path.is_file() and create_new_file_index:
        # Remove a stale metadata file before writing the fresh one.
        remove_old_file(file_type=METADATA_NAME,
                        file_postfix=DATA_FORMAT,
                        parameter=parameter,
                        time_resolution=time_resolution,
                        period_type=period_type,
                        folder=folder,
                        subfolder=DWD_FOLDER_METADATA)

        metainfo.to_csv(path_or_buf=file_path, header=True, index=False)

    return metainfo
def create_fileindex(parameter: Parameter,
                     time_resolution: TimeResolution,
                     period_type: PeriodType,
                     folder: str = MAIN_FOLDER):
    """
    Receive current files on server as list excluding description files and
    only containing those files that have measuring data.

    Args:
        parameter: observation measure
        time_resolution: frequency/granularity of measurement interval
        period_type: recent or historical files
        folder: local file system folder where the file list is stored

    Raises:
        NameError: if listing the files on the FTP server fails
    """
    # Check for folder and create if necessary
    create_folder(subfolder=SUB_FOLDER_METADATA, folder=folder)

    filelist_local_path = Path(folder,
                               SUB_FOLDER_METADATA,
                               f"{FILELIST_NAME}_{parameter.value}_"
                               f"{time_resolution.value}_"
                               f"{period_type.value}{DATA_FORMAT}")

    filelist_local_path = str(filelist_local_path).replace('\\', '/')

    server_path = Path(DWD_PATH,
                       time_resolution.value,
                       parameter.value,
                       period_type.value)

    server_path = f"{server_path}{os.sep}".replace('\\', '/')

    try:
        with FTP(DWD_SERVER) as ftp:
            ftp.login()
            files_server = ftp.list_files(path=server_path)
    # Narrowed from a bare `except Exception`: the try body can only raise
    # FTP/socket errors, all covered by ftplib.all_errors. The cause is
    # chained so the original error is no longer swallowed.
    # NOTE(review): NameError is an odd type for a download failure but is
    # kept so existing callers that catch it keep working.
    except ftplib.all_errors as e:
        raise NameError(
            "Download of fileslist file currently not possible. "
            "Try again!") from e

    files_server = pd.DataFrame(files_server)

    files_server.columns = [FILENAME_NAME]

    files_server.loc[:, FILENAME_NAME] = files_server.loc[:, FILENAME_NAME] \
        .apply(str)

    # NOTE(review): str.lstrip strips a leading *character set*, not a
    # prefix, so this may over-strip if a filename starts with characters
    # from DWD_PATH — confirm before changing.
    files_server.loc[:, FILENAME_NAME] = files_server.loc[:, FILENAME_NAME] \
        .apply(lambda filename: filename.lstrip(DWD_PATH + '/'))

    # Keep only archive files (i.e. those that contain measuring data).
    files_server = files_server[files_server.FILENAME.str.contains(
        ARCHIVE_FORMAT)]

    files_server \
        .insert(loc=1,
                column=FILEID_NAME,
                value=files_server.index)

    # Station id position within the underscore-split filename depends on
    # the period type (STRING_STATID_COL lookup).
    files_server \
        .insert(loc=2,
                column=STATION_ID_NAME,
                value=files_server.iloc[:, 0].str.split('_')
                .apply(lambda string:
                       string[STRING_STATID_COL.get(period_type, None)]))

    # Reorder to FILEID, STATION_ID, FILENAME and sort by station id.
    files_server = files_server.iloc[:, [1, 2, 0]]

    files_server.iloc[:, 1] = files_server.iloc[:, 1].apply(int)

    files_server = files_server.sort_values(by=[STATION_ID_NAME])

    # Remove a stale file list before writing the fresh one.
    remove_old_file(file_type=FILELIST_NAME,
                    parameter=parameter,
                    time_resolution=time_resolution,
                    period_type=period_type,
                    file_postfix=DATA_FORMAT,
                    folder=folder,
                    subfolder=SUB_FOLDER_METADATA)

    files_server.to_csv(path_or_buf=filelist_local_path,
                        header=True,
                        index=False)