def bufr(config, output_file=None, cycle='18'):
    """
    Generates model data from BUFKIT profiles and saves to a file, which can later be retrieved for either training
    data or model run data.

    :param config:
    :param output_file: str: output file path
    :param cycle: str: model cycle (init hour)
    :return:
    """
    bufr_station_id = config['BUFR']['bufr_station_id']
    # Base arguments dictionary. dset and date will be modified iteratively.
    bufarg = {
        'dset': '',
        'date': '',
        'cycle': cycle,
        'stations': bufr_station_id.lower(),
        'noascii': '',
        'nozipit': '',
        'noverbose': '',
        'prepend': ''
    }
    if config['verbose']:
        print('\n')
    bufr_default_dir = '%s/metdat/bufkit' % config['BUFR_ROOT']
    bufr_data_dir = config['BUFR']['bufr_data_dir']
    if not os.path.isdir(bufr_data_dir):
        os.makedirs(bufr_data_dir)
    bufrgruven = config['BUFR']['bufrgruven']
    if config['verbose']:
        print('bufr: using BUFKIT files in %s' % bufr_data_dir)
    bufr_format = '%s/%s%s.%s_%s.buf'
    missing_dates = []
    models = config['BUFR']['bufr_models']
    model_names = config['BUFR']['models']
    start_date = datetime.strptime(config['data_start_date'], '%Y%m%d') - timedelta(days=1)
    end_date = datetime.strptime(config['data_end_date'], '%Y%m%d') - timedelta(days=1)
    dates = generate_dates(config, start_date=start_date, end_date=end_date)
    for date in dates:
        bufarg['date'] = datetime.strftime(date, '%Y%m%d')
        if date.year < 2010:
            if config['verbose']:
                print('bufr: skipping BUFR data for %s; data starts in 2010.' % bufarg['date'])
            continue
        if config['verbose']:
            print('bufr: date: %s' % bufarg['date'])

        for m in range(len(models)):
            if config['verbose']:
                print('bufr: trying to retrieve BUFR data for %s...' % model_names[m])
            bufr_new_name = bufr_format % (bufr_data_dir, bufarg['date'], '%02d' % int(bufarg['cycle']),
                                           model_names[m], bufarg['stations'])
            if os.path.isfile(bufr_new_name):
                if config['verbose']:
                    print('bufr: file %s already exists; skipping!' % bufr_new_name)
                break

            if isinstance(models[m], list):
                # Several dataset aliases may map to one model name; try each in turn until one succeeds.
                for model in models[m]:
                    try:
                        bufarg['dset'] = model
                        bufr_retrieve(bufrgruven, bufarg)
                        bufr_name = bufr_format % (bufr_default_dir, bufarg['date'],
                                                   '%02d' % int(bufarg['cycle']), model, bufarg['stations'])
                        # Opening the file verifies the retrieval produced it; raises IOError otherwise.
                        bufr_file = open(bufr_name)
                        bufr_file.close()
                        os.rename(bufr_name, bufr_new_name)
                        if config['verbose']:
                            print('bufr: BUFR file found for %s at date %s.' % (model, bufarg['date']))
                            print('bufr: writing BUFR file: %s' % bufr_new_name)
                        break
                    except BaseException:
                        if config['verbose']:
                            print('bufr: BUFR file for %s at date %s not retrieved.' % (model, bufarg['date']))
            else:
                try:
                    model = models[m]
                    bufarg['dset'] = model
                    bufr_retrieve(bufrgruven, bufarg)
                    bufr_name = bufr_format % (bufr_default_dir, bufarg['date'],
                                               '%02d' % int(bufarg['cycle']), bufarg['dset'], bufarg['stations'])
                    bufr_file = open(bufr_name)
                    bufr_file.close()
                    os.rename(bufr_name, bufr_new_name)
                    if config['verbose']:
                        print('bufr: BUFR file found for %s at date %s.' % (model, bufarg['date']))
                        print('bufr: writing BUFR file: %s' % bufr_new_name)
                except BaseException:
                    if config['verbose']:
                        print('bufr: BUFR file for %s at date %s not retrieved.' % (model, bufarg['date']))
            if not os.path.isfile(bufr_new_name):
                print('bufr: warning: no BUFR file found for model %s at date %s' % (model_names[m], bufarg['date']))
                missing_dates.append((date, model_names[m]))

    # Process data
    print('\n')
    bufr_dict = OrderedDict({'PROF': OrderedDict(), 'SFC': OrderedDict(), 'DAY': OrderedDict()})
    for model in model_names:
        bufr_dict['PROF'][model] = OrderedDict()
        bufr_dict['SFC'][model] = OrderedDict()
        bufr_dict['DAY'][model] = OrderedDict()

    for date in dates:
        date_str = datetime.strftime(date, '%Y%m%d')
        verif_date = date + timedelta(days=1)
        start_dt = verif_date + timedelta(hours=config['forecast_hour_start'])
        end_dt = verif_date + timedelta(hours=config['forecast_hour_start'] + 24)
        for model in model_names:
            if (date, model) in missing_dates:
                if config['verbose']:
                    print('bufr: skipping %s data for %s; file missing.' % (model, date_str))
                continue
            if config['verbose']:
                print('bufr: processing %s data for %s' % (model, date_str))
            bufr_name = bufr_format % (bufr_data_dir, date_str, '%02d' % int(bufarg['cycle']),
                                       model, bufarg['stations'])
            if not os.path.isfile(bufr_name):
                if config['verbose']:
                    print('bufr: skipping %s data for %s; file missing.' % (model, date_str))
                continue
            profile = bufkit_parser_time_height(config, bufr_name, 6, start_dt, end_dt)
            sfc, daily = bufkit_parser_surface(bufr_name, 3, start_dt, end_dt)
            # Drop 'PRES' variable, which is useless
            for key, values in profile.items():
                values.pop('PRES', None)
                profile[key] = values
            bufr_dict['PROF'][model][verif_date] = profile
            bufr_dict['SFC'][model][verif_date] = sfc
            bufr_dict['DAY'][model][verif_date] = daily

    # Export data
    if output_file is None:
        output_file = '%s/%s_bufr.pkl' % (config['SITE_ROOT'], config['station_id'])
    if config['verbose']:
        print('bufr: -> exporting to %s' % output_file)
    with open(output_file, 'wb') as handle:
        pickle.dump(bufr_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return
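
# Hedged usage sketch (not part of the original module): shows how the pickle written by bufr() above
# can be read back. The nested layout ('PROF'/'SFC'/'DAY' -> model name -> verification date) mirrors
# bufr_dict as built above; the helper name and path argument are hypothetical.
def _example_load_bufr(pkl_path):
    """Load a bufr() output file and summarize the models and dates it contains."""
    with open(pkl_path, 'rb') as handle:
        bufr_dict = pickle.load(handle)
    for group in ('PROF', 'SFC', 'DAY'):
        for model, by_date in bufr_dict[group].items():
            print('%s/%s: %d dates' % (group, model, len(by_date)))
    return bufr_dict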
def verification(config, output_files=None, csv_files=None, use_cf6=True, use_climo=True,
                 force_rain_quantity=False):
    """
    Generates verification data from MesoWest and saves to a file, which is used to train the model and check test
    results.

    :param config:
    :param output_files: str: output file path if just one station, or list of output file paths if multiple stations
    :param csv_files: str: path to csv file containing observations if just one station, or list of paths to csv
        files if multiple stations
    :param use_cf6: bool: if True, uses wind values from CF6 files
    :param use_climo: bool: if True, uses wind values from NCDC climatology
    :param force_rain_quantity: if True, returns the actual quantity of rain (rather than POP); useful for
        validation files
    :return:
    """
    if config['multi_stations']:
        # Train on multiple stations. There must be exactly one output file and one csv file per station ID.
        station_ids = config['station_id']
        if len(station_ids) != len(output_files):
            raise ValueError("There must be the same number of output files as station IDs")
        if len(station_ids) != len(csv_files):
            raise ValueError("There must be the same number of csv files as station IDs")
    else:
        station_ids = [config['station_id']]
        if output_files is not None:
            output_files = [output_files]
        if csv_files is not None:
            csv_files = [csv_files]

    for i in range(len(station_ids)):
        station_id = station_ids[i]
        if output_files is None:
            output_file = '%s/%s_verif.pkl' % (config['SITE_ROOT'], station_id)
        else:
            output_file = output_files[i]
        if csv_files is None:
            csv_file = '%s/%s_verif.csv' % (config['SITE_ROOT'], station_id)
        else:
            csv_file = csv_files[i]

        dates = generate_dates(config)
        api_dates = generate_dates(config, api=True, api_add_hour=config['forecast_hour_start'] + 24)
        datename = 'date_time_minus_%d' % config['forecast_hour_start']

        if config['verbose']:
            print('verification: obtaining observations from csv file')
        with open('%s/%s_obs_vars_request.txt' % (config['SITE_ROOT'], station_id), 'rb') as fp:
            vars_request = pickle.load(fp)
        all_obspd = pd.read_csv(csv_file)
        # Subset of data used as verification
        obspd = all_obspd[['date_time', 'air_temp', 'precip_accum_one_hour', 'wind_speed',
                           'air_temp_low_6_hour', 'air_temp_high_6_hour', 'precip_accum_six_hour']]
        obspd['date_time'] = np.array([datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                                       for date in obspd['date_time'].values], dtype='datetime64[s]')

        if config['verbose']:
            print('verification: setting time back %d hours for daily statistics' % config['forecast_hour_start'])
        dateobj = pd.to_datetime(obspd['date_time']) - timedelta(hours=config['forecast_hour_start'])
        obspd['date_time'] = dateobj
        obspd = obspd.rename(columns={'date_time': datename})

        # Reformat data into hourly and daily

        # Hourly
        def hour(dates):
            date = dates.iloc[0]
            # If the data come from a csv file, the date is a string rather than a datetime object.
            # Depending on the versions of NumPy and pandas in use, the first or second format will match.
            if type(date) == str:
                try:
                    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                except ValueError:
                    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S+00:00')
            return datetime(date.year, date.month, date.day, date.hour)

        def last(values):
            return values.iloc[-1]

        aggregate = {datename: hour}
        if 'air_temp_high_6_hour' in vars_request and 'air_temp_low_6_hour' in vars_request:
            aggregate['air_temp_high_6_hour'] = np.max
            aggregate['air_temp_low_6_hour'] = np.min
        aggregate['air_temp'] = {'air_temp_max': np.max, 'air_temp_min': np.min}
        if 'precip_accum_six_hour' in vars_request:
            aggregate['precip_accum_six_hour'] = np.max
        aggregate['wind_speed'] = np.max
        aggregate['precip_accum_one_hour'] = np.max

        if config['verbose']:
            print('verification: grouping data by hour for hourly observations')
        # Note that obs in hour H are reported at hour H, not H+1
        obs_hourly = obspd.groupby([
            pd.DatetimeIndex(obspd[datename]).year,
            pd.DatetimeIndex(obspd[datename]).month,
            pd.DatetimeIndex(obspd[datename]).day,
            pd.DatetimeIndex(obspd[datename]).hour
        ]).agg(aggregate)

        # Rename columns
        col_names = obs_hourly.columns.values
        col_names_new = []
        for c in range(len(col_names)):
            if col_names[c][0] == 'air_temp':
                col_names_new.append(col_names[c][1])
            else:
                col_names_new.append(col_names[c][0])
        obs_hourly.columns = col_names_new

        # Daily
        def day(dates):
            date = dates.iloc[0]
            # Same string-vs-datetime handling as in hour() above.
            if type(date) == str:
                try:
                    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                except ValueError:
                    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S+00:00')
            return datetime(date.year, date.month, date.day)

        def min_or_nan(a):
            """
            Returns the minimum of a 1D array if there are at least 4 non-NaN values, and NaN otherwise.
            This ensures NaNs on days with incomplete data when grouping into daily data, rather than
            incorrect values.
            """
            if np.count_nonzero(~np.isnan(a)) < 4:  # incomplete data
                return np.nan
            else:
                return np.min(a)

        def max_or_nan(a):
            """
            Returns the maximum of a 1D array if there are at least 4 non-NaN values, and NaN otherwise.
            This ensures NaNs on days with incomplete data when grouping into daily data, rather than
            incorrect values.
            """
            if np.count_nonzero(~np.isnan(a)) < 4:  # incomplete data
                return np.nan
            else:
                return np.max(a)

        aggregate[datename] = day
        aggregate['air_temp_min'] = np.min
        aggregate['air_temp_max'] = np.max
        aggregate['air_temp_low_6_hour'] = min_or_nan
        aggregate['air_temp_high_6_hour'] = max_or_nan
        aggregate['precip_accum_one_hour'] = np.sum
        aggregate['precip_accum_six_hour'] = np.sum
        try:
            aggregate.pop('air_temp')
        except KeyError:
            pass

        if config['verbose']:
            print('verification: grouping data by day for daily verifications')
        obs_daily = obs_hourly.groupby([
            pd.DatetimeIndex(obs_hourly[datename]).year,
            pd.DatetimeIndex(obs_hourly[datename]).month,
            pd.DatetimeIndex(obs_hourly[datename]).day
        ]).agg(aggregate)

        obs_hourly_copy = obs_hourly.copy()
        obs_hourly_copy.set_index(datename, inplace=True)

        if config['verbose']:
            print('verification: checking matching dates for daily obs and CF6')
        if use_climo:
            try:
                climo_values = _climo(config, station_id, dates)
            except BaseException as e:
                if config['verbose']:
                    print("verification: warning: '%s' while reading climo data" % str(e))
                climo_values = {}
        else:
            if config['verbose']:
                print('verification: not using climo.')
            climo_values = {}
        if use_cf6:
            # Integer division so num_months is an int under Python 3
            num_months = min((datetime.utcnow() - dates[0]).days // 30, 24)
            try:
                get_cf6_files(config, station_id, num_months)
            except BaseException as e:
                if config['verbose']:
                    print("verification: warning: '%s' while getting CF6 files" % str(e))
            try:
                cf6_values = _cf6(config, station_id)
            except BaseException as e:
                if config['verbose']:
                    print("verification: warning: '%s' while reading CF6 files" % str(e))
                cf6_values = {}
        else:
            if config['verbose']:
                print('verification: not using CF6.')
            cf6_values = {}
        climo_values.update(cf6_values)  # CF6 has precedence

        count_rows = 0
        for index, row in obs_daily.iterrows():
            date = row[datename]
            use_cf6_precip = False
            if 'air_temp_high_6_hour' in vars_request:
                max_temp_var = 'air_temp_high_6_hour'
            else:
                max_temp_var = 'air_temp_max'
            if 'air_temp_low_6_hour' in vars_request:
                min_temp_var = 'air_temp_low_6_hour'
            else:
                min_temp_var = 'air_temp_min'
            if 'precip_accum_six_hour' in vars_request:
                precip_var = 'precip_accum_six_hour'
            else:
                precip_var = 'precip_accum_one_hour'
            obs_max_temp = row[max_temp_var]
            obs_min_temp = row[min_temp_var]
            obs_wind = row['wind_speed']
            obs_precip = round(row[precip_var], 2)
            # If the high or low temperature is missing, chances are some precipitation data is missing too
            if np.isnan(obs_max_temp) and np.isnan(obs_min_temp):
                use_cf6_precip = True

            # Check for missing or incorrect 6-hour precipitation amounts. If there are any, use the sum of
            # 1-hour precipitation amounts if none of those are missing.
            skip_date = False
            if 'precip_accum_six_hour' in vars_request:
                # 6-hour precipitation amounts were used
                daily_precip = 0.0
                # Check the 4 times which should have 6-hour precipitation amounts
                for hour in [5, 11, 17, 23]:
                    try:
                        obs_6hr_precip = round(
                            obs_hourly_copy['precip_accum_six_hour'][
                                pd.Timestamp(date.year, date.month, date.day, hour)], 2)
                    except KeyError:  # incomplete data for date
                        skip_date = True
                        use_cf6_precip = True
                        break
                    if np.isnan(obs_6hr_precip):
                        obs_6hr_precip = 0.0
                    sum_hourly_precip = 0.0
                    # Check and sum 1-hour precipitation amounts
                    for hour2 in range(hour - 5, hour + 1):
                        obs_hourly_precip = obs_hourly_copy['precip_accum_one_hour'][
                            pd.Timestamp(date.year, date.month, date.day, hour2)]
                        if np.isnan(obs_hourly_precip):
                            # Missing 1-hour precipitation amount, so use the cf6/climo value instead
                            use_cf6_precip = True
                        else:
                            sum_hourly_precip += round(obs_hourly_precip, 2)
                    if sum_hourly_precip > obs_6hr_precip and not use_cf6_precip:
                        # Missing or incorrect 6-hour precipitation amount, but 1-hour amounts are OK
                        obs_6hr_precip = round(sum_hourly_precip, 2)
                    daily_precip += round(obs_6hr_precip, 2)
                if round(daily_precip, 2) > round(obs_precip, 2) and not use_cf6_precip:
                    print('verification: warning: incorrect obs precip of %0.2f for %s, using summed one hour '
                          'accumulation value of %0.2f' % (obs_precip, date, daily_precip))
                    obs_daily.loc[index, 'precip_accum_six_hour'] = daily_precip
            else:
                # 1-hour precipitation amounts were used
                for hour in range(24):
                    try:
                        obs_hourly_precip = obs_hourly_copy['precip_accum_one_hour'][
                            pd.Timestamp(date.year, date.month, date.day, hour)]
                    except KeyError:  # incomplete data for date
                        skip_date = True
                        break
                    if np.isnan(obs_hourly_precip):
                        use_cf6_precip = True
            if skip_date:
                obs_daily.loc[index, max_temp_var] = np.nan
                obs_daily.loc[index, min_temp_var] = np.nan
                obs_daily.loc[index, 'wind_speed'] = np.nan
                obs_daily.loc[index, precip_var] = np.nan
            if date in climo_values.keys() and not skip_date:
                count_rows += 1
                cf6_max_temp = climo_values[date]['max_temp']
                cf6_min_temp = climo_values[date]['min_temp']
                cf6_wind = climo_values[date]['wind']
                cf6_precip = climo_values[date]['precip']
                if not np.isnan(cf6_max_temp) and cf6_max_temp > -900.0 and np.isnan(obs_max_temp):
                    print('verification: warning: missing obs max temp for %s, using cf6/climo value of %d'
                          % (date, round(cf6_max_temp, 0)))
                    obs_daily.loc[index, max_temp_var] = cf6_max_temp
                if not np.isnan(cf6_min_temp) and cf6_min_temp < 900.0 and np.isnan(obs_min_temp):
                    print('verification: warning: missing obs min temp for %s, using cf6/climo value of %d'
                          % (date, round(cf6_min_temp, 0)))
                    obs_daily.loc[index, min_temp_var] = cf6_min_temp
                if not np.isnan(cf6_wind):
                    if obs_wind > cf6_wind and obs_wind < cf6_wind + 10:
                        print('verification: warning: obs wind for %s larger than wind from cf6/climo; using obs'
                              % date)
                    else:
                        obs_daily.loc[index, 'wind_speed'] = cf6_wind
                else:
                    count_rows -= 1
                if (not np.isnan(cf6_precip) and cf6_precip > -900.0 and use_cf6_precip
                        and round(cf6_precip, 2) > round(obs_precip, 2)):
                    print('verification: warning: incorrect obs precip of %0.2f for %s, using cf6/climo value of '
                          '%0.2f' % (obs_precip, date, cf6_precip))
                    obs_daily.loc[index, precip_var] = cf6_precip
        if config['verbose']:
            print('verification: found %d matching rows.' % count_rows)

        # Round
        round_dict = {'wind_speed': 0}
        if 'air_temp_high_6_hour' in vars_request:
            round_dict['air_temp_high_6_hour'] = 0
        if 'air_temp_low_6_hour' in vars_request:
            round_dict['air_temp_low_6_hour'] = 0
        round_dict['air_temp_max'] = 0
        round_dict['air_temp_min'] = 0
        if 'precip_accum_six_hour' in vars_request:
            round_dict['precip_accum_six_hour'] = 2
        round_dict['precip_accum_one_hour'] = 2
        obs_daily = obs_daily.round(round_dict)

        # Generation of final output data
        if config['verbose']:
            print('verification: generating final verification dictionary...')
        if 'air_temp_high_6_hour' in vars_request:
            obs_daily.rename(columns={'air_temp_high_6_hour': 'Tmax'}, inplace=True)
        else:
            obs_daily.rename(columns={'air_temp_max': 'Tmax'}, inplace=True)
        if 'air_temp_low_6_hour' in vars_request:
            obs_daily.rename(columns={'air_temp_low_6_hour': 'Tmin'}, inplace=True)
        else:
            obs_daily.rename(columns={'air_temp_min': 'Tmin'}, inplace=True)
        if 'precip_accum_six_hour' in vars_request:
            obs_daily.rename(columns={'precip_accum_six_hour': 'Rain'}, inplace=True)
        else:
            obs_daily.rename(columns={'precip_accum_one_hour': 'Rain'}, inplace=True)
        obs_daily.rename(columns={'wind_speed': 'Wind'}, inplace=True)

        # Deal with the rain depending on the type of forecast requested
        obs_daily['Rain'].fillna(0.0, inplace=True)
        if config['Model']['rain_forecast_type'] == 'pop' and not force_rain_quantity:
            obs_daily.loc[:, 'Rain'] = pop_rain(obs_daily['Rain'])
        elif config['Model']['rain_forecast_type'] == 'categorical' and not force_rain_quantity:
            obs_daily.loc[:, 'Rain'] = categorical_rain(obs_daily['Rain'])

        # Set the date time index and retain only desired columns
        obs_daily = obs_daily.rename(columns={datename: 'date_time'})
        obs_daily = obs_daily.set_index('date_time')
        if config['verbose']:
            print('verification: -> exporting to %s' % output_file)
        export_cols = ['Tmax', 'Tmin', 'Wind', 'Rain']
        for col in obs_daily.columns:
            if col not in export_cols:
                obs_daily.drop(col, axis=1, inplace=True)

        # If a time series is desired, then get hourly data from the csv file
        if config['Model']['predict_timeseries']:
            # Subset of data used as verification
            obs_hourly_verify = all_obspd[['date_time', 'air_temp', 'relative_humidity', 'wind_speed',
                                           'precip_accum_one_hour']]

            # Fix rainfall for categorical and time accumulation
            rain_column = 'precip_last_%d_hour' % config['time_series_interval']
            obs_hourly_verify.rename(columns={'precip_accum_one_hour': rain_column}, inplace=True)
            if config['Model']['rain_forecast_type'] == 'pop' and not force_rain_quantity:
                if config['verbose']:
                    print("verification: using 'pop' rain")
                obs_hourly_verify.loc[:, rain_column] = pop_rain(obs_hourly_verify[rain_column])
                use_rain_max = True
            elif config['Model']['rain_forecast_type'] == 'categorical' and not force_rain_quantity:
                if config['verbose']:
                    print("verification: using 'categorical' rain")
                obs_hourly_verify.loc[:, rain_column] = categorical_rain(obs_hourly_verify[rain_column])
                use_rain_max = True
            else:
                use_rain_max = False

        # Export final data
        export_dict = OrderedDict()
        for date in dates:
            try:
                day_dict = obs_daily.loc[date].to_dict(into=OrderedDict)
            except KeyError:
                continue
            if np.any(np.isnan(list(day_dict.values()))):
                if config['verbose']:
                    print('verification: warning: omitting day %s; missing data' % date)
                continue  # No verification can have missing values
            if config['Model']['predict_timeseries']:
                start = pd.Timestamp(date + timedelta(hours=(config['forecast_hour_start'] -
                                                             config['time_series_interval'])))
                end = pd.Timestamp(date + timedelta(hours=config['forecast_hour_start'] + 24))
                try:
                    series = reindex_hourly(obs_hourly_verify, start, end, config['time_series_interval'],
                                            use_rain_max=use_rain_max)
                except KeyError:
                    # No values for the day
                    if config['verbose']:
                        print('verification: warning: omitting day %s; missing data' % date)
                    continue
                if series.isnull().values.any():
                    if config['verbose']:
                        print('verification: warning: omitting day %s; missing data' % date)
                    continue
                series_dict = OrderedDict(series.to_dict(into=OrderedDict))
                day_dict.update(series_dict)
            export_dict[date] = day_dict
        with open(output_file, 'wb') as handle:
            pickle.dump(export_dict, handle, protocol=2)
    return
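
# Hedged sketch (not in the original source): reading the verification pickle written above. Each key is a
# verification date mapping to an OrderedDict with at least the export_cols 'Tmax', 'Tmin', 'Wind' and 'Rain',
# plus time-series entries when predict_timeseries is enabled. The helper name is hypothetical.
def _example_load_verif(pkl_path):
    """Print daily verification values from a verification() output file."""
    with open(pkl_path, 'rb') as handle:
        export_dict = pickle.load(handle)
    for date, day_dict in export_dict.items():
        print(date, day_dict['Tmax'], day_dict['Tmin'], day_dict['Wind'], day_dict['Rain'])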
def obs(config, output_files=None, csv_files=None, num_hours=24, interval=3, use_nan_sounding=False,
        use_existing_sounding=True):
    """
    Generates observation data from MesoWest and UCAR soundings and saves to a file, which can later be retrieved
    for either training data or model run data.

    :param config:
    :param output_files: str: output file path if just one station, or list of output file paths if multiple stations
    :param csv_files: str: path to csv file containing observations if just one station, or list of paths to csv
        files if multiple stations
    :param num_hours: int: number of hours to retrieve obs
    :param interval: int: retrieve obs every 'interval' hours
    :param use_nan_sounding: bool: if True, uses a sounding of NaNs rather than omitting a day if sounding is missing
    :param use_existing_sounding: bool: if True, preferentially uses saved soundings in sounding_data_dir
    :return:
    """
    if config['multi_stations']:
        # Train on multiple stations. There must be exactly one output file and one csv file per station ID.
        station_ids = config['station_id']
        if len(station_ids) != len(output_files):
            raise ValueError("There must be the same number of output files as station IDs")
        if len(station_ids) != len(csv_files):
            raise ValueError("There must be the same number of csv files as station IDs")
    else:
        station_ids = [config['station_id']]
        if output_files is not None:
            output_files = [output_files]
        if csv_files is not None:
            csv_files = [csv_files]

    for i in range(len(station_ids)):
        station_id = station_ids[i]
        if output_files is None:
            output_file = '%s/%s_obs.pkl' % (config['SITE_ROOT'], station_id)
        else:
            output_file = output_files[i]
        if csv_files is None:
            csv_file = '%s/%s_obs.csv' % (config['SITE_ROOT'], station_id)
        else:
            csv_file = csv_files[i]

        start_date = datetime.strptime(config['data_start_date'], '%Y%m%d') - timedelta(hours=num_hours)
        dates = generate_dates(config)
        api_dates = generate_dates(config, api=True, start_date=start_date)

        # Retrieve station data
        if not os.path.exists(csv_file):
            # No observations saved yet. Look for desired variables.
            vars_request = ['air_temp', 'altimeter', 'precip_accum_one_hour', 'relative_humidity',
                            'wind_speed', 'wind_direction']
            vars_option = ['air_temp_low_6_hour', 'air_temp_high_6_hour', 'precip_accum_six_hour']

            m = Meso(token=config['meso_token'])
            if config['verbose']:
                print('obs: MesoPy initialized for station %s' % station_id)
                print('obs: retrieving latest obs and metadata')
            latest = m.latest(stid=station_id)
            obs_list = list(latest['STATION'][0]['SENSOR_VARIABLES'].keys())

            # Add variables to the api request if they exist
            if config['verbose']:
                print('obs: searching for 6-hourly variables...')
            for var in vars_option:
                if var in obs_list:
                    if config['verbose']:
                        print('obs: found variable %s, adding to data' % var)
                    vars_request += [var]

            # Add variables to the api request
            vars_api = ','.join(vars_request)

            # Units
            units = 'temp|f,precip|in,speed|kts'

            all_obs_hourly = get_obs_hourly(config, station_id, api_dates, vars_api, units)
            try:
                all_obs_hourly.to_csv(csv_file)
                if config['verbose']:
                    print('obs: saving observations to csv file succeeded')
                with open('%s/%s_obs_vars_request.txt' % (config['SITE_ROOT'], station_id), 'wb') as fp:
                    pickle.dump(vars_request, fp, protocol=2)
                if config['verbose']:
                    print('obs: saving vars request list to txt file succeeded')
            except BaseException as e:
                if config['verbose']:
                    print("obs: warning: '%s' while saving observations" % str(e))
            # Subset of data used as predictors
            obs_hourly = all_obs_hourly[['air_temp', 'altimeter', 'precip_accum_one_hour', 'relative_humidity',
                                         'wind_speed', 'wind_direction']]
        else:
            if config['verbose']:
                print('obs: obtaining observations from csv file')
            all_obs_hourly = pd.read_csv(csv_file)
            with open('%s/%s_obs_vars_request.txt' % (config['SITE_ROOT'], station_id), 'rb') as fp:
                vars_request = pickle.load(fp)
            # Subset of data used as predictors
            obs_hourly = all_obs_hourly[['date_time', 'air_temp', 'altimeter', 'precip_accum_one_hour',
                                         'relative_humidity', 'wind_speed', 'wind_direction']]

        # Retrieve upper-air sounding data
        soundings = OrderedDict()
        if config['Obs']['use_soundings']:
            if config['verbose']:
                print('obs: retrieving upper-air sounding data')
            # Assumption: the original flattened code referenced an undefined 'sounding_station_id' name here;
            # it is assumed to come from the Obs section of the config.
            sounding_station_id = config['Obs']['sounding_station_id']
            for date in dates:
                soundings[date] = OrderedDict()
                start_date = date - timedelta(days=1)  # get the previous day's soundings
                for hour in [0, 12]:
                    sounding_date = start_date + timedelta(hours=hour)
                    try:
                        sounding = upper_air(config, station_id, sounding_station_id, sounding_date,
                                             use_nan_sounding, use_existing=use_existing_sounding)
                        soundings[date][sounding_date] = sounding
                    except BaseException:
                        print('obs: warning: problem retrieving soundings for %s'
                              % datetime.strftime(date, '%Y%m%d'))
                        soundings.pop(date)
                        break

        # Create dictionary of days
        if config['verbose']:
            print('obs: converting to output dictionary')
        obs_export = OrderedDict({'SFC': OrderedDict(), 'SNDG': OrderedDict()})
        for date in dates:
            if config['Obs']['use_soundings'] and date not in soundings.keys():
                continue
            # Need to ensure we use the right intervals to have 22:5? Z obs
            start = pd.Timestamp((date - timedelta(hours=num_hours)))
            end = pd.Timestamp(date)
            obs_export['SFC'][date] = reindex_hourly(obs_hourly, start, end, interval,
                                                     end_23z=True).to_dict(into=OrderedDict)
            if config['Obs']['use_soundings']:
                obs_export['SNDG'][date] = soundings[date]

        # Export final data
        if config['verbose']:
            print('obs: -> exporting to %s' % output_file)
        with open(output_file, 'wb') as handle:
            pickle.dump(obs_export, handle, protocol=2)
    return
def verification(config, output_file=None, use_cf6=True, use_climo=True, force_rain_quantity=False):
    """
    Generates verification data from MesoWest and saves to a file, which is used to train the model and check test
    results.

    :param config:
    :param output_file: str: path to output file
    :param use_cf6: bool: if True, uses wind values from CF6 files
    :param use_climo: bool: if True, uses wind values from NCDC climatology
    :param force_rain_quantity: if True, returns the actual quantity of rain (rather than POP); useful for
        validation files
    :return:
    """
    if output_file is None:
        output_file = '%s/%s_verif.pkl' % (config['SITE_ROOT'], config['station_id'])
    dates = generate_dates(config)
    api_dates = generate_dates(config, api=True, api_add_hour=config['forecast_hour_start'] + 24)

    # Read new data for daily values
    m = Meso(token=config['meso_token'])
    if config['verbose']:
        print('verification: MesoPy initialized for station %s' % config['station_id'])
        print('verification: retrieving latest obs and metadata')
    latest = m.latest(stid=config['station_id'])
    obs_list = list(latest['STATION'][0]['SENSOR_VARIABLES'].keys())

    # Look for desired variables
    vars_request = ['air_temp', 'wind_speed', 'precip_accum_one_hour']
    vars_option = ['air_temp_low_6_hour', 'air_temp_high_6_hour', 'precip_accum_six_hour']

    # Add variables to the api request if they exist
    if config['verbose']:
        print('verification: searching for 6-hourly variables...')
    for var in vars_option:
        if var in obs_list:
            if config['verbose']:
                print('verification: found variable %s, adding to data' % var)
            vars_request += [var]
    vars_api = ','.join(vars_request)

    # Units
    units = 'temp|f,precip|in,speed|kts'

    # Retrieve data
    obspd = pd.DataFrame()
    for api_date in api_dates:
        if config['verbose']:
            print('verification: retrieving data from %s to %s' % api_date)
        obs = m.timeseries(stid=config['station_id'], start=api_date[0], end=api_date[1],
                           vars=vars_api, units=units)
        obspd = pd.concat((obspd, pd.DataFrame.from_dict(obs['STATION'][0]['OBSERVATIONS'])),
                          ignore_index=True)

    # Rename columns to requested vars
    obs_var_names = obs['STATION'][0]['SENSOR_VARIABLES']
    obs_var_keys = list(obs_var_names.keys())
    col_names = list(map(''.join, obspd.columns.values))
    for c in range(len(col_names)):
        col = col_names[c]
        for k in range(len(obs_var_keys)):
            key = obs_var_keys[k]
            if col == list(obs_var_names[key].keys())[0]:
                col_names[c] = key
    obspd.columns = col_names

    # Make sure we have columns for all requested variables
    for var in vars_request:
        if var not in col_names:
            obspd = obspd.assign(**{var: np.nan})

    # Change datetime column to datetime object, subtract 6 hours to use 6Z days
    if config['verbose']:
        print('verification: setting time back %d hours for daily statistics' % config['forecast_hour_start'])
    dateobj = pd.to_datetime(obspd['date_time']) - timedelta(hours=config['forecast_hour_start'])
    obspd['date_time'] = dateobj
    datename = 'date_time_minus_%d' % config['forecast_hour_start']
    obspd = obspd.rename(columns={'date_time': datename})

    # Reformat data into hourly and daily

    # Hourly
    def hour(dates):
        date = dates.iloc[0]
        return datetime(date.year, date.month, date.day, date.hour)

    def last(values):
        return values.iloc[-1]

    aggregate = {datename: hour}
    if 'air_temp_high_6_hour' in vars_request and 'air_temp_low_6_hour' in vars_request:
        aggregate['air_temp_high_6_hour'] = np.max
        aggregate['air_temp_low_6_hour'] = np.min
    aggregate['air_temp'] = {'air_temp_max': np.max, 'air_temp_min': np.min}
    if 'precip_accum_six_hour' in vars_request:
        aggregate['precip_accum_six_hour'] = np.max
    aggregate['wind_speed'] = np.max
    aggregate['precip_accum_one_hour'] = np.max

    if config['verbose']:
        print('verification: grouping data by hour for hourly observations')
    # Note that obs in hour H are reported at hour H, not H+1
    obs_hourly = obspd.groupby([
        pd.DatetimeIndex(obspd[datename]).year,
        pd.DatetimeIndex(obspd[datename]).month,
        pd.DatetimeIndex(obspd[datename]).day,
        pd.DatetimeIndex(obspd[datename]).hour
    ]).agg(aggregate)

    # Rename columns
    col_names = obs_hourly.columns.values
    col_names_new = []
    for c in range(len(col_names)):
        if col_names[c][0] == 'air_temp':
            col_names_new.append(col_names[c][1])
        else:
            col_names_new.append(col_names[c][0])
    obs_hourly.columns = col_names_new

    # Daily
    def day(dates):
        date = dates.iloc[0]
        return datetime(date.year, date.month, date.day)

    aggregate[datename] = day
    aggregate['air_temp_min'] = np.min
    aggregate['air_temp_max'] = np.max
    aggregate['precip_accum_six_hour'] = np.sum
    try:
        aggregate.pop('air_temp')
    except KeyError:
        pass

    if config['verbose']:
        print('verification: grouping data by day for daily verifications')
    obs_daily = obs_hourly.groupby([
        pd.DatetimeIndex(obs_hourly[datename]).year,
        pd.DatetimeIndex(obs_hourly[datename]).month,
        pd.DatetimeIndex(obs_hourly[datename]).day
    ]).agg(aggregate)

    if config['verbose']:
        print('verification: checking matching dates for daily obs and CF6')
    if use_climo:
        try:
            climo_values = _climo_wind(config, dates)
        except BaseException as e:
            if config['verbose']:
                print("verification: warning: '%s' while reading climo data" % str(e))
            climo_values = {}
    else:
        if config['verbose']:
            print('verification: not using climo.')
        climo_values = {}
    if use_cf6:
        # Integer division so num_months is an int under Python 3
        num_months = min((datetime.utcnow() - dates[0]).days // 30, 24)
        try:
            get_cf6_files(config, num_months)
        except BaseException as e:
            if config['verbose']:
                print("verification: warning: '%s' while getting CF6 files" % str(e))
        try:
            cf6_values = _cf6_wind(config)
        except BaseException as e:
            if config['verbose']:
                print("verification: warning: '%s' while reading CF6 files" % str(e))
            cf6_values = {}
    else:
        if config['verbose']:
            print('verification: not using CF6.')
        cf6_values = {}
    climo_values.update(cf6_values)  # CF6 has precedence

    count_rows = 0
    for index, row in obs_daily.iterrows():
        date = row[datename]
        if date in climo_values.keys():
            count_rows += 1
            obs_wind = row['wind_speed']
            cf6_wind = climo_values[date]['wind']
            if not np.isnan(cf6_wind):
                if obs_wind - cf6_wind >= 5:
                    print('verification: warning: obs wind for %s much larger than wind from cf6/climo; using obs'
                          % date)
                else:
                    obs_daily.loc[index, 'wind_speed'] = cf6_wind
            else:
                count_rows -= 1
    if config['verbose']:
        print('verification: found %d matching rows.' % count_rows)

    # Round
    round_dict = {'wind_speed': 0}
    if 'air_temp_high_6_hour' in vars_request:
        round_dict['air_temp_high_6_hour'] = 0
    if 'air_temp_low_6_hour' in vars_request:
        round_dict['air_temp_low_6_hour'] = 0
    round_dict['air_temp_max'] = 0
    round_dict['air_temp_min'] = 0
    if 'precip_accum_six_hour' in vars_request:
        round_dict['precip_accum_six_hour'] = 2
    round_dict['precip_accum_one_hour'] = 2
    obs_daily = obs_daily.round(round_dict)

    # Generation of final output data
    if config['verbose']:
        print('verification: generating final verification dictionary...')
    if 'air_temp_high_6_hour' in vars_request:
        obs_daily.rename(columns={'air_temp_high_6_hour': 'Tmax'}, inplace=True)
    else:
        obs_daily.rename(columns={'air_temp_max': 'Tmax'}, inplace=True)
    if 'air_temp_low_6_hour' in vars_request:
        obs_daily.rename(columns={'air_temp_low_6_hour': 'Tmin'}, inplace=True)
    else:
        obs_daily.rename(columns={'air_temp_min': 'Tmin'}, inplace=True)
    if 'precip_accum_six_hour' in vars_request:
        obs_daily.rename(columns={'precip_accum_six_hour': 'Rain'}, inplace=True)
    else:
        obs_daily.rename(columns={'precip_accum_one_hour': 'Rain'}, inplace=True)
    obs_daily.rename(columns={'wind_speed': 'Wind'}, inplace=True)

    # Deal with the rain depending on the type of forecast requested
    obs_daily['Rain'].fillna(0.0, inplace=True)
    if config['Model']['rain_forecast_type'] == 'pop' and not force_rain_quantity:
        obs_daily.loc[:, 'Rain'] = pop_rain(obs_daily['Rain'])
    elif config['Model']['rain_forecast_type'] == 'categorical' and not force_rain_quantity:
        obs_daily.loc[:, 'Rain'] = categorical_rain(obs_daily['Rain'])

    # Set the date time index and retain only desired columns
    obs_daily = obs_daily.rename(columns={datename: 'date_time'})
    obs_daily = obs_daily.set_index('date_time')
    if config['verbose']:
        print('verification: -> exporting to %s' % output_file)
    export_cols = ['Tmax', 'Tmin', 'Wind', 'Rain']
    for col in obs_daily.columns:
        if col not in export_cols:
            obs_daily.drop(col, axis=1, inplace=True)

    # If a time series is desired, then get hourly data
    if config['Model']['predict_timeseries']:
        # Look for desired variables
        vars_request = ['air_temp', 'relative_humidity', 'wind_speed', 'precip_accum_one_hour']

        # Add variables to the api request
        vars_api = ','.join(vars_request)

        # Units
        units = 'temp|f,precip|in,speed|kts'

        # Retrieve data
        obs_hourly_verify = get_obs_hourly(config, api_dates, vars_api, units)

        # Fix rainfall for categorical and time accumulation
        rain_column = 'precip_last_%d_hour' % config['time_series_interval']
        obs_hourly_verify.rename(columns={'precip_accum_one_hour': rain_column}, inplace=True)
        if config['Model']['rain_forecast_type'] == 'pop' and not force_rain_quantity:
            if config['verbose']:
                print("verification: using 'pop' rain")
            obs_hourly_verify.loc[:, rain_column] = pop_rain(obs_hourly_verify[rain_column])
            use_rain_max = True
        elif config['Model']['rain_forecast_type'] == 'categorical' and not force_rain_quantity:
            if config['verbose']:
                print("verification: using 'categorical' rain")
            obs_hourly_verify.loc[:, rain_column] = categorical_rain(obs_hourly_verify[rain_column])
            use_rain_max = True
        else:
            use_rain_max = False

    # Export final data
    export_dict = OrderedDict()
    for date in dates:
        try:
            day_dict = obs_daily.loc[date].to_dict(into=OrderedDict)
        except KeyError:
            continue
        if np.any(np.isnan(list(day_dict.values()))):
            if config['verbose']:
                print('verification: warning: omitting day %s; missing data' % date)
            continue  # No verification can have missing values
        if config['Model']['predict_timeseries']:
            start = pd.Timestamp(date + timedelta(hours=(config['forecast_hour_start'] -
                                                         config['time_series_interval'])))
            end = pd.Timestamp(date + timedelta(hours=config['forecast_hour_start'] + 24))
            try:
                series = reindex_hourly(obs_hourly_verify, start, end, config['time_series_interval'],
                                        use_rain_max=use_rain_max)
            except KeyError:
                # No values for the day
                if config['verbose']:
                    print('verification: warning: omitting day %s; missing data' % date)
                continue
            if series.isnull().values.any():
                if config['verbose']:
                    print('verification: warning: omitting day %s; missing data' % date)
                continue
            series_dict = OrderedDict(series.to_dict(into=OrderedDict))
            day_dict.update(series_dict)
        export_dict[date] = day_dict
    with open(output_file, 'wb') as handle:
        pickle.dump(export_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return
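
# Hedged usage sketch: the single-station verification() above writes '<SITE_ROOT>/<station_id>_verif.pkl'
# by default. Per its docstring, force_rain_quantity=True keeps actual rain amounts instead of converting
# them via pop_rain/categorical_rain, which is useful for validation files. Paths are hypothetical.
#
#     verification(config)                                  # rain as POP or categories, per config
#     verification(config, output_file='validate.pkl',
#                  force_rain_quantity=True)                # raw rain quantities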
def obs(config, output_file=None, num_hours=24, interval=3, use_nan_sounding=False, use_existing_sounding=True):
    """
    Generates observation data from MesoWest and UCAR soundings and saves to a file, which can later be retrieved
    for either training data or model run data.

    :param config:
    :param output_file: str: output file path
    :param num_hours: int: number of hours to retrieve obs
    :param interval: int: retrieve obs every 'interval' hours
    :param use_nan_sounding: bool: if True, uses a sounding of NaNs rather than omitting a day if sounding is missing
    :param use_existing_sounding: bool: if True, preferentially uses saved soundings in sounding_data_dir
    :return:
    """
    if output_file is None:
        output_file = '%s/%s_obs.pkl' % (config['SITE_ROOT'], config['station_id'])
    start_date = datetime.strptime(config['data_start_date'], '%Y%m%d') - timedelta(hours=num_hours)
    dates = generate_dates(config)
    api_dates = generate_dates(config, api=True, start_date=start_date)

    # Look for desired variables
    vars_request = ['air_temp', 'altimeter', 'precip_accum_one_hour', 'relative_humidity',
                    'wind_speed', 'wind_direction']

    # Add variables to the api request
    vars_api = ','.join(vars_request)

    # Units
    units = 'temp|f,precip|in,speed|kts'

    # Retrieve station data
    obs_hourly = get_obs_hourly(config, api_dates, vars_api, units)

    # Retrieve upper-air sounding data
    if config['verbose']:
        print('obs: retrieving upper-air sounding data')
    soundings = OrderedDict()
    if config['Obs']['use_soundings']:
        for date in dates:
            soundings[date] = OrderedDict()
            start_date = date - timedelta(days=1)  # get the previous day's soundings
            for hour in [0, 12]:
                sounding_date = start_date + timedelta(hours=hour)
                try:
                    sounding = upper_air(sounding_date, use_nan_sounding, use_existing=use_existing_sounding)
                    soundings[date][sounding_date] = sounding
                except BaseException:
                    print('obs: warning: problem retrieving soundings for %s' % datetime.strftime(date, '%Y%m%d'))
                    soundings.pop(date)
                    break

    # Create dictionary of days
    if config['verbose']:
        print('obs: converting to output dictionary')
    obs_export = OrderedDict({'SFC': OrderedDict(), 'SNDG': OrderedDict()})
    for date in dates:
        if config['Obs']['use_soundings'] and date not in soundings.keys():
            continue
        # Need to ensure we use the right intervals to have 22:5? Z obs
        start = pd.Timestamp((date - timedelta(hours=num_hours)))
        end = pd.Timestamp(date)
        obs_export['SFC'][date] = reindex_hourly(obs_hourly, start, end, interval,
                                                 end_23z=True).to_dict(into=OrderedDict)
        if config['Obs']['use_soundings']:
            obs_export['SNDG'][date] = soundings[date]

    # Export final data
    if config['verbose']:
        print('obs: -> exporting to %s' % output_file)
    with open(output_file, 'wb') as handle:
        pickle.dump(obs_export, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return
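
# Hedged sketch (not in the original source): the obs() pickle contains 'SFC' (surface obs reindexed every
# `interval` hours over the previous `num_hours`, keyed by date) and 'SNDG' (0Z/12Z soundings from the
# previous day, present when use_soundings is enabled). The helper name is hypothetical.
def _example_load_obs(pkl_path):
    """Summarize an obs() output file."""
    with open(pkl_path, 'rb') as handle:
        obs_export = pickle.load(handle)
    print('SFC dates: %d' % len(obs_export['SFC']))
    print('SNDG dates: %d' % len(obs_export['SNDG']))
    return obs_export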