def readDaily(config, stid, data_binding, table_type, model=None, start_date=None, end_date=None, force_list=False):
    """
    Read a Daily or list of Dailys from a specified data_binding at a certain station id and of a given table type.
    table_type must be 'verif', 'climo', 'daily_forecast', or something defined in the schema of data_binding as
    %(stid)_%(table_type).upper(). Model should be provided unless retrieving from verif or climo.
    If start_date and end_date are both None, then the start is set to now and the end to 24 hours in the future.
    If only start_date is None, then it is set to 24 hours before end_date. If only end_date is None, then it is
    set to 24 hours after start_date.

    :param config:
    :param stid: str: station ID
    :param data_binding: str: name of database binding to read from
    :param table_type: str: type of table
    :param model: str: model name
    :param start_date: datetime or str: starting date
    :param end_date: datetime or str: ending date
    :param force_list: bool: if True, returns a list even if there is only one Daily object
    :return: Daily or list of Dailys of requested data
    """
    # Get the database and table names
    database = config['DataBinding'][data_binding]['database']
    table = '%s_%s' % (stid.upper(), table_type.upper())

    # Get data from _read
    data = _read(config, database, table, start_date=start_date, end_date=end_date, model=model)

    # Check that we have data
    if data is None:
        raise ValueError('db.readDaily error: no data retrieved.')
    if len(data.index) == 0:
        raise ValueError('db.readDaily error: no data found.')

    # Generate Daily object(s)
    daily_list = []
    for index in range(len(data.index)):
        row = data.iloc[index]
        daily = Daily(stid, date_to_datetime(row['DATETIME']))
        daily.set_values(row['HIGH'], row['LOW'], row['WIND'], row['RAIN'])
        daily.model = model
        daily_list.append(daily)

    if len(data.index) > 1 or force_list:
        if config['debug'] > 9:
            print('db.readDaily: returning list of daily objects')
        return daily_list
    return daily_list[0]
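
# A minimal usage sketch for readDaily (illustration only; not called anywhere in the
# module). The station ID 'KSEA' and model name 'GFS' are hypothetical placeholders;
# a loaded theta-e `config` with a populated archive database is assumed.
def _example_read_daily(config):
    try:
        dailys = readDaily(config, 'KSEA', 'forecast', 'daily_forecast', model='GFS',
                           start_date=datetime(2018, 6, 1), end_date=datetime(2018, 6, 7),
                           force_list=True)
    except ValueError:
        # raised when no rows match the requested dates
        return []
    return [(d.date, d.high, d.low, d.wind, d.rain) for d in dailys]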
def json_climo(config, stid, start_date):
    """
    Produce a dictionary of climatology values at a station, starting at start_date and going to the current date,
    suitable for writing to a json file.
    """
    climo = OrderedDict()
    end_date = datetime.utcnow()
    variables = ['high', 'low', 'wind', 'rain']
    if config['debug'] > 9:
        print('web.json: retrieving climo for %s' % stid)
    dailys = []
    current_date = start_date
    while current_date <= end_date:
        climo_date = current_date.replace(year=last_leap_year())
        try:
            daily = readDaily(config, stid, 'forecast', 'climo', start_date=climo_date, end_date=climo_date)
            daily.date = current_date
        except MissingDataError:
            # Missing climo data; fill with NaN
            daily = Daily(stid, current_date)
            daily.set_values(np.nan, np.nan, np.nan, np.nan)
        dailys.append(daily)
        current_date += timedelta(days=1)
    for v in variables:
        climo[v.upper()] = [getattr(dailys[j], v) if not np.isnan(getattr(dailys[j], v)) else None
                            for j in range(len(dailys))]
    climo['DATETIME'] = [getattr(dailys[j], 'date').isoformat() + 'Z' for j in range(len(dailys))]
    return climo
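
# A sketch of the structure json_climo returns, for reference (all values below are
# hypothetical). Missing days serialize as None, which json.dump renders as null:
#
#   {
#       "HIGH":     [54.0, 56.0, None, ...],
#       "LOW":      [40.0, 42.0, None, ...],
#       "WIND":     [18.0, 15.0, None, ...],
#       "RAIN":     [0.12, 0.0, None, ...],
#       "DATETIME": ["2018-06-01T00:00:00Z", "2018-06-02T00:00:00Z", ...]
#   }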
def get_climo(config, stid, ghcn_stid, start_year=1980):
    """
    Get climatological values as a list of Daily objects for the station ghcn_stid, with a climatology starting in
    start_year. There is no specified end year because wind data are limited.
    """
    # Retrieve the data
    print('climo: fetching data for GHCN station %s' % ghcn_stid)
    ghcn = get_ghcn_data(ghcn_stid)

    # For each variable, use groupby to get a yearly climatology
    if config['debug'] > 9:
        print('climo: grouping data into yearly climatology')
    aggregate = {'value': np.mean}
    ghcn_yearly = {}
    if config['debug'] > 9:
        print('climo: averaging for years since %d' % start_year)
    for var, df in ghcn.items():
        # Values come back as "object" type; convert to floats
        df['value'] = df['value'].astype(str).astype(np.float64)
        # Remove any data older than the start year
        df = df[df.index > datetime(start_year, 1, 1)]
        ghcn_yearly[var] = df.groupby([df.index.month, df.index.day]).agg(aggregate)

    # Now we have dataframes with indices (month, day). Use the nearest leap year to avoid confusion with Feb 29.
    year = 4 * (datetime.utcnow().year // 4)

    # Create a list of Dailys
    dailys = []
    if config['debug'] > 50:
        print('climo: here are the values')
    for index, row in ghcn_yearly['TMAX'].iterrows():
        date = datetime(year, index[0], index[1])
        daily = Daily(stid, date)
        # We also need to convert units: temperature from tenths of deg C to deg F, wind from tenths of m/s to
        # knots, and rain from tenths of mm to inches
        daily.high = ghcn_yearly['TMAX'].loc[index]['value'] / 10. * 9. / 5. + 32.
        daily.low = ghcn_yearly['TMIN'].loc[index]['value'] / 10. * 9. / 5. + 32.
        daily.wind = ghcn_yearly['WSF2'].loc[index]['value'] / 10. * 1.94384
        daily.rain = ghcn_yearly['PRCP'].loc[index]['value'] / 254.
        if config['debug'] > 50:
            print('%s %0.0f/%0.0f/%0.0f/%0.2f' % (daily.date, daily.high, daily.low, daily.wind, daily.rain))
        dailys.append(daily)

    return dailys
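
# A minimal sketch of the GHCN-Daily unit conversions used above (sample values are
# hypothetical): TMAX/TMIN are stored in tenths of degrees C, WSF2 in tenths of m/s,
# and PRCP in tenths of mm.
def _example_ghcn_conversions():
    high_f = 250. / 10. * 9. / 5. + 32.  # 25.0 C -> 77.0 F
    wind_kt = 103. / 10. * 1.94384       # 10.3 m/s -> ~20.0 kt
    rain_in = 254. / 254.                # 25.4 mm -> 1.00 in
    return high_f, wind_kt, rain_in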
def main(config):
    """
    Main function. Runs the verification calculation.
    """
    data_binding = 'forecast'

    # Figure out which days we are verifying for: up to yesterday.
    time_now = datetime.utcnow() - timedelta(days=1, hours=6)
    end_date = datetime(time_now.year, time_now.month, time_now.day)
    print('calcVerification: calculating statistics through %s' % end_date)
    start_date = end_date - timedelta(days=31)

    # The directory and archive file
    db_dir = '%s/archive' % config['THETAE_ROOT']
    stats_file = '%s/theta-e-stats.json' % db_dir

    stats = OrderedDict()

    # Iterate over stations
    for stid in config['Stations'].keys():
        if config['debug'] > 9:
            print('calcVerification: calculating statistics for station %s' % stid)

        # Load verification and climo data
        if config['debug'] > 50:
            print('calcVerification: loading verification and climo data')
        verification = readDaily(config, stid, data_binding, 'verif', start_date=start_date, end_date=end_date)
        climo = []
        current_date = start_date
        while current_date <= end_date:
            climo_date = current_date.replace(year=last_leap_year())
            try:
                climo_day = readDaily(config, stid, data_binding, 'climo', start_date=climo_date,
                                      end_date=climo_date)
                climo_day.date = current_date
            except ValueError:
                # Missing climo data; fill with NaN
                climo_day = Daily(stid, current_date)
                climo_day.set_values(np.nan, np.nan, np.nan, np.nan)
            climo.append(climo_day)
            current_date += timedelta(days=1)

        # Get persistence and convert to dictionaries
        persistence = OrderedDict()
        for v in verification:
            persistence[date_to_string(v.date + timedelta(days=1))] = v
        verification = list_to_dict(verification)
        climo = list_to_dict(climo)

        stats[stid] = OrderedDict()
        for model in list(config['Models'].keys()):
            if config['debug'] > 50:
                print('calcVerification: loading forecast data for %s' % model)
            try:
                forecasts = readDaily(config, stid, data_binding, 'daily_forecast', model=model,
                                      start_date=start_date + timedelta(days=1), end_date=end_date,
                                      force_list=True)
                forecasts = list_to_dict(forecasts)
            except ValueError:
                if config['debug'] > 9:
                    print('calcVerification warning: no data found for model %s at %s' % (model, stid))
                continue
            verif_days = [d for d in forecasts.keys() if (d in verification.keys() and d in climo.keys() and
                                                          d in persistence.keys())]

            # Get stats for each of the model, climo, and persistence. We do this for every model so that the skill
            # scores can be compared across different sets of available verification days for each model.
            if config['debug'] > 50:
                print('calcVerification: calculating statistics for %s' % model)
            model_stats = get_forecast_stats(forecasts, verification, day_list=verif_days)
            climo_stats = get_forecast_stats(climo, verification, day_list=verif_days)
            persist_stats = get_forecast_stats(persistence, verification, day_list=verif_days)

            # Add in the skill scores
            for var in ['high', 'low', 'wind', 'rain']:
                try:
                    model_stats['stats'][var]['skillClimo'] = 1. - (model_stats['stats'][var]['rmse'] /
                                                                    climo_stats['stats'][var]['rmse'])
                except KeyError:
                    model_stats['stats'][var]['skillClimo'] = None
                try:
                    model_stats['stats'][var]['skillClimoNoBias'] = 1. - (model_stats['stats'][var]['rmseNoBias'] /
                                                                          climo_stats['stats'][var]['rmse'])
                except KeyError:
                    model_stats['stats'][var]['skillClimoNoBias'] = None
                try:
                    model_stats['stats'][var]['skillPersist'] = 1. - (model_stats['stats'][var]['rmse'] /
                                                                      persist_stats['stats'][var]['rmse'])
                except KeyError:
                    model_stats['stats'][var]['skillPersist'] = None
                try:
                    model_stats['stats'][var]['skillPersistNoBias'] = 1. - (model_stats['stats'][var]['rmseNoBias'] /
                                                                            persist_stats['stats'][var]['rmse'])
                except KeyError:
                    model_stats['stats'][var]['skillPersistNoBias'] = None

            # Remove NaN (not interpreted by json) and add to the large dictionary
            replace_nan_in_dict(model_stats)
            stats[stid][model] = model_stats

    # Write to the file
    with open(stats_file, 'w') as f:
        json.dump(stats, f)
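
# A minimal sketch of the skill score computed above: 1 - RMSE_model / RMSE_reference,
# where the reference forecast is climatology or persistence. Positive values beat the
# reference, zero ties it, and negative values lose to it. The RMSE values here are
# hypothetical.
def _example_skill_score(rmse_model=2.1, rmse_reference=3.0):
    return 1. - rmse_model / rmse_reference  # -> 0.3: removes 30% of the reference error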
def get_verification(config, stid, start_dt, end_dt, use_climo=False, use_cf6=True):
    """
    Generates verification data from the MesoWest API. If use_climo is True, then fetch climate data from NCDC using
    ulmo to fill in wind values. (We probably generally don't want to do this, because it is slow and is delayed by
    1-2 weeks from present.) If use_cf6 is True, then any CF6 files found in ~/site_data will be used for wind
    values. These files are retrieved by get_cf6_files.
    """
    # MesoWest token and init
    meso_token = config['Verify']['api_key']
    m = Meso(token=meso_token)

    # Look for desired variables
    vars_request = ['air_temp_low_6_hour', 'air_temp_high_6_hour', 'precip_accum_six_hour']
    vars_api = ','.join(vars_request)

    # Units
    units = 'temp|f,precip|in,speed|kts'

    # Retrieve 6-hourly data
    start, end = meso_api_dates(start_dt, end_dt)
    print('verification: retrieving 6-hourly data from %s to %s' % (start, end))
    obs = m.timeseries(stid=stid, start=start, end=end, vars=vars_api, units=units, hfmetars='0')
    obs_6hour = pd.DataFrame.from_dict(obs['STATION'][0]['OBSERVATIONS'])

    # Rename columns to requested vars. This changes the columns in the DataFrame to corresponding names in
    # vars_request, because otherwise the columns returned by MesoPy are weird.
    obs_var_names = obs['STATION'][0]['SENSOR_VARIABLES']
    obs_var_keys = list(obs_var_names.keys())
    col_names = list(map(''.join, obs_6hour.columns.values))
    for c in range(len(col_names)):
        col = col_names[c]
        for k in range(len(obs_var_keys)):
            key = obs_var_keys[k]
            if col == list(obs_var_names[key].keys())[0]:
                col_names[c] = key
    obs_6hour.columns = col_names

    # Check to make sure that we do indeed have all of the variables we want
    for var in vars_request + ['wind_speed']:
        if var not in col_names:
            obs_6hour = obs_6hour.assign(**{var: np.nan})

    # Change the datetime column to datetime objects, and subtract 6 hours to use 6Z days
    dateobj = pd.Index(pd.to_datetime(obs_6hour['date_time'])).tz_localize(None) - timedelta(hours=6)
    obs_6hour['date_time'] = dateobj
    datename = 'DATETIME'
    obs_6hour = obs_6hour.rename(columns={'date_time': datename})

    # Now we're going to group the data into daily values.
    # Define an aggregation function for pandas groupby
    def day(dt):
        d = dt.iloc[0]
        return datetime(d.year, d.month, d.day)

    aggregate = {
        datename: day,
        'air_temp_high_6_hour': np.max,
        'air_temp_low_6_hour': np.min,
        'wind_speed': np.max,
        'precip_accum_six_hour': np.sum,
    }

    # Now group by day. Note that we changed the time to subtract 6 hours, so days are nicely defined as 6Z to 6Z.
    if config['debug'] > 50:
        print('verification: grouping data by day')
    obs_daily = obs_6hour.groupby([pd.DatetimeIndex(obs_6hour[datename]).year,
                                   pd.DatetimeIndex(obs_6hour[datename]).month,
                                   pd.DatetimeIndex(obs_6hour[datename]).day]).agg(aggregate)

    # Now we check for wind values from the CF6 files, which are the actual verification
    if use_climo or use_cf6:
        if config['debug'] > 9:
            print('verification: checking climo and/or CF6 for wind data')
        climo_values = {}
        cf6_values = {}
        if use_climo:
            try:
                climo_values = _climo_wind(config, stid)
            except BaseException as e:
                print('verification warning: problem reading climo data')
                print("*** Reason: '%s'" % str(e))
        if use_cf6:
            try:
                cf6_values = _cf6_wind(config, stid)
            except BaseException as e:
                print('verification warning: problem reading CF6 files')
                print("*** Reason: '%s'" % str(e))
        climo_values.update(cf6_values)  # CF6 overrides climo
        count_rows = 0
        for index, row in obs_daily.iterrows():
            date = row[datename]
            if date in climo_values.keys():
                count_rows += 1
                obs_wind = row['wind_speed']
                cf6_wind = climo_values[date]['wind']
                if obs_wind - cf6_wind >= 5:
                    if config['debug'] > 9:
                        print('verification warning: obs wind for %s (%0.0f) much larger than '
                              'cf6/climo wind (%0.0f); using obs' % (date, obs_wind, cf6_wind))
                else:
                    obs_daily.loc[index, 'wind_speed'] = cf6_wind
        if config['debug'] > 9:
            print('verification: found %d matching rows for wind' % count_rows)

    # Rename the columns
    obs_daily.rename(columns={'air_temp_high_6_hour': 'high',
                              'air_temp_low_6_hour': 'low',
                              'wind_speed': 'wind',
                              'precip_accum_six_hour': 'rain'}, inplace=True)

    # For hourly data, retrieve the data from the database. Only if the database returns an error do we retrieve
    # data from MesoWest.
    try:
        obs_hour = readTimeSeries(config, stid, 'forecast', 'OBS', start_date=start_dt, end_date=end_dt).data
    except MissingDataError:
        if config['debug'] > 9:
            print('verification: missing data in db for hourly obs; retrieving from MesoWest')
        obs_hour = get_obs(config, stid, start, end).data

    # Set the DateTime column and round precipitation to avoid trace accumulations
    dateobj = pd.Index(pd.to_datetime(obs_hour[datename])).tz_localize(None) - timedelta(hours=6)
    obs_hour[datename] = dateobj
    obs_hour['RAINHOUR'] = obs_hour['RAINHOUR'].round(2)

    # Resample the hourly obs into daily values
    obs_hour.index = obs_hour['DATETIME']
    obs_hour_day = pd.DataFrame(columns=['high', 'low', 'wind', 'rain'])
    obs_hour_day['high'] = obs_hour['TEMPERATURE'].resample('1D').max()
    obs_hour_day['low'] = obs_hour['TEMPERATURE'].resample('1D').min()
    obs_hour_day['wind'] = obs_hour['WINDSPEED'].resample('1D').max()
    obs_hour_day['rain'] = obs_hour['RAINHOUR'].resample('1D').sum()

    obs_daily.index = obs_daily['DATETIME']
    obs_daily.drop('DATETIME', axis=1, inplace=True)

    # Compare the daily to hourly values, keeping the more extreme of the two
    obs_daily['high'] = np.fmax(obs_daily['high'], obs_hour_day['high'])
    obs_daily['low'] = np.fmin(obs_daily['low'], obs_hour_day['low'])
    obs_daily['wind'] = np.fmax(obs_daily['wind'], obs_hour_day['wind'])
    obs_daily['rain'] = np.fmax(obs_daily['rain'], obs_hour_day['rain'])

    # Make sure rain has zeros rather than missing values. Resample appropriately dealt with missing values earlier.
    obs_daily['rain'].fillna(0.0, inplace=True)

    # Round values to the nearest degree, knot, and hundredth of an inch
    obs_daily = obs_daily.round({'high': 0, 'low': 0, 'wind': 0, 'rain': 2})

    # Lastly, place all the values we found into a list of Daily objects.
    # Remove extraneous columns
    export_cols = ['high', 'low', 'wind', 'rain']
    for col in obs_daily.columns:
        if col not in export_cols:
            obs_daily.drop(col, axis=1, inplace=True)

    # Create a list of Daily objects
    dailys = []
    if config['debug'] > 50:
        print('verification: here are the values')
    for index, row in obs_daily.iterrows():
        date = index.to_pydatetime()
        daily = Daily(stid, date)
        for attr in export_cols:
            setattr(daily, attr, row[attr])
        if config['debug'] > 50:
            print('%s %0.0f/%0.0f/%0.0f/%0.2f' % (daily.date, daily.high, daily.low, daily.wind, daily.rain))
        dailys.append(daily)

    return dailys
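
# A minimal sketch of why np.fmax/np.fmin (rather than np.maximum/np.minimum) combine
# the 6-hourly and hourly values above: the f-variants ignore a NaN when only one input
# is missing, so a valid 6-hourly value survives a gap in the hourly obs. Values are
# hypothetical.
def _example_fmax_behavior():
    six_hourly = np.array([62., np.nan, 58.])
    hourly = np.array([60., 61., np.nan])
    return np.fmax(six_hourly, hourly)  # -> array([62., 61., 58.])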
def get_verification(config, stid, start, end, use_climo=False, use_cf6=True):
    """
    Generates verification data from the MesoWest API. If use_climo is True, then fetch climate data from NCDC using
    ulmo to fill in wind values. (We probably generally don't want to do this, because it is slow and is delayed by
    1-2 weeks from present.) If use_cf6 is True, then any CF6 files found in ~/site_data will be used for wind
    values. These files are retrieved by get_cf6_files.
    """
    # MesoWest token and init
    meso_token = config['Verify']['api_key']
    m = Meso(token=meso_token)
    if config['debug'] > 9:
        print('verification: MesoPy initialized for station %s' % stid)

    # Look for desired variables
    vars_request = ['air_temp', 'wind_speed', 'precip_accum_one_hour']
    vars_option = ['air_temp_low_6_hour', 'air_temp_high_6_hour', 'precip_accum_six_hour']

    # Add the optional variables to the api request if the station reports them
    if config['debug'] > 50:
        print('verification: searching for 6-hourly variables')
    latest = m.latest(stid=stid)
    obs_list = list(latest['STATION'][0]['SENSOR_VARIABLES'].keys())
    for var in vars_option:
        if var in obs_list:
            if config['debug'] > 9:
                print('verification: using variable %s' % var)
            vars_request += [var]
    vars_api = ','.join(vars_request)

    # Units
    units = 'temp|f,precip|in,speed|kts'

    # Retrieve data
    print('verification: retrieving data from %s to %s' % (start, end))
    obs = m.timeseries(stid=stid, start=start, end=end, vars=vars_api, units=units)
    obspd = pd.DataFrame.from_dict(obs['STATION'][0]['OBSERVATIONS'])

    # Rename columns to requested vars. This changes the columns in the DataFrame to corresponding names in
    # vars_request, because otherwise the columns returned by MesoPy are weird.
    obs_var_names = obs['STATION'][0]['SENSOR_VARIABLES']
    obs_var_keys = list(obs_var_names.keys())
    col_names = list(map(''.join, obspd.columns.values))
    for c in range(len(col_names)):
        col = col_names[c]
        for k in range(len(obs_var_keys)):
            key = obs_var_keys[k]
            if col == list(obs_var_names[key].keys())[0]:
                col_names[c] = key
    obspd.columns = col_names

    # Check to make sure that we do indeed have all of the variables we want
    for var in vars_request:
        if var not in col_names:
            obspd = obspd.assign(**{var: np.nan})

    # Change the datetime column to datetime objects, and subtract 6 hours to use 6Z days
    dateobj = pd.Index(pd.to_datetime(obspd['date_time'])).tz_localize(None) - timedelta(hours=6)
    obspd['date_time'] = dateobj
    datename = 'DATETIME'
    obspd = obspd.rename(columns={'date_time': datename})

    # Now we're going to group the data into daily values. First, we group by hour to be sure we have the right
    # precipitation accumulations, which are officially recorded by hour.
    def hour(dates):
        date = dates.iloc[0]
        return datetime(date.year, date.month, date.day, date.hour)

    # Define an aggregation function for pandas groupby
    aggregate = {datename: hour}
    if 'air_temp_high_6_hour' in vars_request and 'air_temp_low_6_hour' in vars_request:
        aggregate['air_temp_high_6_hour'] = np.max
        aggregate['air_temp_low_6_hour'] = np.min
    aggregate['air_temp'] = {'air_temp_max': np.max, 'air_temp_min': np.min}
    if 'precip_accum_six_hour' in vars_request:
        aggregate['precip_accum_six_hour'] = np.max
    aggregate['wind_speed'] = np.max
    aggregate['precip_accum_one_hour'] = np.max

    if config['debug'] > 50:
        print('verification: grouping data by hour')
    obs_hourly = obspd.groupby([pd.DatetimeIndex(obspd[datename]).year,
                                pd.DatetimeIndex(obspd[datename]).month,
                                pd.DatetimeIndex(obspd[datename]).day,
                                pd.DatetimeIndex(obspd[datename]).hour]).agg(aggregate)

    # Flatten the column names produced by the dict aggregation on air_temp
    col_names = obs_hourly.columns.values
    col_names_new = []
    for c in range(len(col_names)):
        if col_names[c][0] == 'air_temp':
            col_names_new.append(col_names[c][1])
        else:
            col_names_new.append(col_names[c][0])
    obs_hourly.columns = col_names_new

    # Now group by day. Note that we changed the time to subtract 6 hours, so days are nicely defined as 6Z to 6Z.
    def day(dates):
        date = dates.iloc[0]
        return datetime(date.year, date.month, date.day)

    aggregate[datename] = day
    aggregate['air_temp_min'] = np.min
    aggregate['air_temp_max'] = np.max
    aggregate['precip_accum_six_hour'] = np.sum
    aggregate.pop('air_temp', None)

    if config['debug'] > 50:
        print('verification: grouping data by day')
    obs_daily = obs_hourly.groupby([pd.DatetimeIndex(obs_hourly[datename]).year,
                                    pd.DatetimeIndex(obs_hourly[datename]).month,
                                    pd.DatetimeIndex(obs_hourly[datename]).day]).agg(aggregate)

    # Now we check for wind values from the CF6 files, which are the actual verification
    if use_climo or use_cf6:
        if config['debug'] > 9:
            print('verification: checking climo and/or CF6 for wind data')
        climo_values = {}
        cf6_values = {}
        if use_climo:
            try:
                climo_values = _climo_wind(config, stid)
            except BaseException as e:
                print('verification warning: problem reading climo data')
                print("*** Reason: '%s'" % str(e))
        if use_cf6:
            try:
                cf6_values = _cf6_wind(config, stid)
            except BaseException as e:
                print('verification warning: problem reading CF6 files')
                print("*** Reason: '%s'" % str(e))
        climo_values.update(cf6_values)  # CF6 overrides climo
        count_rows = 0
        for index, row in obs_daily.iterrows():
            date = row[datename]
            if date in climo_values.keys():
                count_rows += 1
                obs_wind = row['wind_speed']
                cf6_wind = climo_values[date]['wind']
                if obs_wind - cf6_wind >= 5:
                    if config['debug'] > 9:
                        print('verification warning: obs wind for %s (%0.0f) much larger than '
                              'cf6/climo wind (%0.0f); using obs' % (date, obs_wind, cf6_wind))
                else:
                    obs_daily.loc[index, 'wind_speed'] = cf6_wind
        if config['debug'] > 9:
            print('verification: found %d matching rows for wind' % count_rows)

    # Round values to the nearest degree, knot, and hundredth of an inch
    round_dict = {'wind_speed': 0}
    if 'air_temp_high_6_hour' in vars_request:
        round_dict['air_temp_high_6_hour'] = 0
    if 'air_temp_low_6_hour' in vars_request:
        round_dict['air_temp_low_6_hour'] = 0
    round_dict['air_temp_max'] = 0
    round_dict['air_temp_min'] = 0
    if 'precip_accum_six_hour' in vars_request:
        round_dict['precip_accum_six_hour'] = 2
    round_dict['precip_accum_one_hour'] = 2
    obs_daily = obs_daily.round(round_dict)

    # Lastly, place all the values we found into a list of Daily objects.
    # Rename the columns, preferring the 6-hourly values where available
    if 'air_temp_high_6_hour' in vars_request:
        obs_daily.rename(columns={'air_temp_high_6_hour': 'high'}, inplace=True)
    else:
        obs_daily.rename(columns={'air_temp_max': 'high'}, inplace=True)
    if 'air_temp_low_6_hour' in vars_request:
        obs_daily.rename(columns={'air_temp_low_6_hour': 'low'}, inplace=True)
    else:
        obs_daily.rename(columns={'air_temp_min': 'low'}, inplace=True)
    if 'precip_accum_six_hour' in vars_request:
        obs_daily.rename(columns={'precip_accum_six_hour': 'rain'}, inplace=True)
    else:
        obs_daily.rename(columns={'precip_accum_one_hour': 'rain'}, inplace=True)
    obs_daily.rename(columns={'wind_speed': 'wind'}, inplace=True)

    # Make sure rain has zeros rather than missing values. Groupby appropriately dealt with missing values earlier.
    obs_daily['rain'].fillna(0.0, inplace=True)

    # Set datetime as the index. This will help use datetime in the creation of the Dailys.
    obs_daily = obs_daily.set_index(datename)

    # Remove extraneous columns
    export_cols = ['high', 'low', 'wind', 'rain']
    for col in obs_daily.columns:
        if col not in export_cols:
            obs_daily.drop(col, axis=1, inplace=True)

    # Create a list of Daily objects
    dailys = []
    if config['debug'] > 50:
        print('verification: here are the values')
    for index, row in obs_daily.iterrows():
        date = index.to_pydatetime()
        daily = Daily(stid, date)
        for attr in export_cols:
            setattr(daily, attr, row[attr])
        if config['debug'] > 50:
            print('%s %0.0f/%0.0f/%0.0f/%0.2f' % (daily.date, daily.high, daily.low, daily.wind, daily.rain))
        dailys.append(daily)

    return dailys
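
# A minimal sketch of the 6Z-to-6Z day convention used in both versions of
# get_verification: subtracting 6 hours before grouping makes a plain calendar-day
# groupby align with the 06Z climate day. Timestamps are hypothetical.
def _example_6z_days():
    times = pd.to_datetime(['2018-06-01 05:00', '2018-06-01 07:00', '2018-06-02 05:59'])
    shifted = times - timedelta(hours=6)
    # the 05Z ob falls on the previous climate day; the 07Z ob starts a new one
    return shifted.normalize()  # -> DatetimeIndex(['2018-05-31', '2018-06-01', '2018-06-01'])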