def get_climo(config, stid, ghcn_stid, start_year=1980):
    """
    Get climatological values as a list of Daily objects for the station ghcn_stid, with a climatology starting in
    start_year. There is no specified end year because wind data are limited.
    """
    # Retrieve the data
    print('climo: fetching data for GHCN station %s' % ghcn_stid)
    ghcn = get_ghcn_data(ghcn_stid)

    # For each variable, use groupby to get a yearly climatology
    if config['debug'] > 9:
        print('climo: grouping data into yearly climatology')
    aggregate = {'value': np.mean}
    ghcn_yearly = {}
    if config['debug'] > 9:
        print('climo: averaging for years since %d' % start_year)
    for var, df in ghcn.items():
        # Values arrive as "object" dtype; convert to floats
        df['value'] = df['value'].astype(str).astype(np.float64)
        # Remove any data older than the start year
        df = df[df.index > datetime(start_year, 1, 1)]
        ghcn_yearly[var] = df.groupby([df.index.month, df.index.day]).agg(aggregate)

    # Now we have DataFrames indexed by (month, day). Use the most recent leap year so that Feb 29 maps to a
    # valid calendar date.
    year = last_leap_year()

    # Create a list of Dailys
    dailys = []
    if config['debug'] > 50:
        print('climo: here are the values')
    for index, row in ghcn_yearly['TMAX'].iterrows():
        date = datetime(year, index[0], index[1])
        daily = Daily(stid, date)
        # We also need to convert units: temperature from tenths of deg C to deg F, wind from tenths of m/s to
        # knots, and precipitation from tenths of mm to inches.
        daily.high = ghcn_yearly['TMAX'].loc[index]['value'] / 10. * 9. / 5. + 32.
        daily.low = ghcn_yearly['TMIN'].loc[index]['value'] / 10. * 9. / 5. + 32.
        daily.wind = ghcn_yearly['WSF2'].loc[index]['value'] / 10. * 1.94384
        daily.rain = ghcn_yearly['PRCP'].loc[index]['value'] / 254.
        if config['debug'] > 50:
            print('%s %0.0f/%0.0f/%0.0f/%0.2f' % (daily.date, daily.high, daily.low, daily.wind, daily.rain))
        dailys.append(daily)

    return dailys
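
# A minimal sketch (not part of the original module) illustrating the unit conversions used in get_climo above.
# The raw values are hypothetical GHCN records: TMAX in tenths of deg C, WSF2 in tenths of m/s, PRCP in tenths
# of mm.
def _example_ghcn_unit_conversion():
    tmax_raw, wsf2_raw, prcp_raw = 256., 103., 127.
    high = tmax_raw / 10. * 9. / 5. + 32.   # 25.6 C  -> about 78.1 F
    wind = wsf2_raw / 10. * 1.94384         # 10.3 m/s -> about 20.0 kt
    rain = prcp_raw / 254.                  # 12.7 mm  -> 0.50 in
    return high, wind, rain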
def json_climo(config, stid, start_date):
    """
    Produce a dictionary of climatology values for a station, starting at start_date and going to the latest
    available verification, keyed by variable for JSON output.
    """
    climo = OrderedDict()
    end_date = datetime.utcnow()
    variables = ['high', 'low', 'wind', 'rain']
    if config['debug'] > 9:
        print('web.json: retrieving climo for %s' % stid)

    dailys = []
    current_date = start_date
    while current_date <= end_date:
        climo_date = current_date.replace(year=last_leap_year())
        try:
            daily = readDaily(config, stid, 'forecast', 'climo', start_date=climo_date, end_date=climo_date)
            daily.date = current_date
        except MissingDataError:
            # Missing climo data: fill with NaNs
            daily = Daily(stid, current_date)
            daily.set_values(np.nan, np.nan, np.nan, np.nan)
        dailys.append(daily)
        current_date += timedelta(days=1)

    for v in variables:
        climo[v.upper()] = [getattr(dailys[j], v) if not (np.isnan(getattr(dailys[j], v))) else None
                            for j in range(len(dailys))]
    climo['DATETIME'] = [getattr(dailys[j], 'date').isoformat() + 'Z' for j in range(len(dailys))]

    return climo
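
# A minimal usage sketch (not part of the original module), assuming the module-level imports above (json,
# datetime) and a parsed theta-e `config`. The station ID 'KSEA', the start date, and the output filename are
# hypothetical. NaN values have already been replaced with None, so the result serializes directly.
def _example_write_climo_json(config, stid='KSEA'):
    climo = json_climo(config, stid, datetime(2018, 1, 1))
    with open('%s_climo.json' % stid, 'w') as f:
        json.dump(climo, f)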
def main(config):
    """
    Main function. Runs the verification calculation.
    """
    data_binding = 'forecast'

    # Figure out which days we are verifying for: up to yesterday
    time_now = datetime.utcnow() - timedelta(days=1, hours=6)
    end_date = datetime(time_now.year, time_now.month, time_now.day)
    print('calcVerification: calculating statistics through %s' % end_date)
    start_date = end_date - timedelta(days=31)

    # The directory and archive file
    db_dir = '%s/archive' % config['THETAE_ROOT']
    stats_file = '%s/theta-e-stats.json' % db_dir
    stats = OrderedDict()

    # Iterate over stations
    for stid in config['Stations'].keys():
        if config['debug'] > 9:
            print('calcVerification: calculating statistics for station %s' % stid)

        # Load verification and climo data
        if config['debug'] > 50:
            print('calcVerification: loading verification and climo data')
        verification = readDaily(config, stid, data_binding, 'verif', start_date=start_date, end_date=end_date)
        climo = []
        current_date = start_date
        while current_date <= end_date:
            climo_date = current_date.replace(year=last_leap_year())
            try:
                climo_day = readDaily(config, stid, data_binding, 'climo', start_date=climo_date,
                                      end_date=climo_date)
                climo_day.date = current_date
            except ValueError:
                # Missing climo data: fill with NaNs
                climo_day = Daily(stid, current_date)
                climo_day.set_values(np.nan, np.nan, np.nan, np.nan)
            climo.append(climo_day)
            current_date += timedelta(days=1)

        # Get persistence and convert to dictionaries
        persistence = OrderedDict()
        for v in verification:
            persistence[date_to_string(v.date + timedelta(days=1))] = v
        verification = list_to_dict(verification)
        climo = list_to_dict(climo)

        stats[stid] = OrderedDict()
        for model in list(config['Models'].keys()):
            if config['debug'] > 50:
                print('calcVerification: loading forecast data for %s' % model)
            try:
                forecasts = readDaily(config, stid, data_binding, 'daily_forecast', model=model,
                                      start_date=start_date + timedelta(days=1), end_date=end_date,
                                      force_list=True)
                forecasts = list_to_dict(forecasts)
            except ValueError:
                if config['debug'] > 9:
                    print('calcVerification warning: no data found for model %s at %s' % (model, stid))
                continue
            verif_days = [d for d in forecasts.keys() if (d in verification.keys() and d in climo.keys() and
                                                          d in persistence.keys())]

            # Get stats for each of the model, climo, and persistence. We do this for every model so that the
            # skill scores can be compared over the set of verification days available to each model.
            if config['debug'] > 50:
                print('calcVerification: calculating statistics for %s' % model)
            model_stats = get_forecast_stats(forecasts, verification, day_list=verif_days)
            climo_stats = get_forecast_stats(climo, verification, day_list=verif_days)
            persist_stats = get_forecast_stats(persistence, verification, day_list=verif_days)

            # Add in the skill scores
            for var in ['high', 'low', 'wind', 'rain']:
                try:
                    model_stats['stats'][var]['skillClimo'] = 1. - (model_stats['stats'][var]['rmse'] /
                                                                    climo_stats['stats'][var]['rmse'])
                except KeyError:
                    model_stats['stats'][var]['skillClimo'] = None
                try:
                    model_stats['stats'][var]['skillClimoNoBias'] = 1. - (model_stats['stats'][var]['rmseNoBias'] /
                                                                          climo_stats['stats'][var]['rmse'])
                except KeyError:
                    model_stats['stats'][var]['skillClimoNoBias'] = None
                try:
                    model_stats['stats'][var]['skillPersist'] = 1. - (model_stats['stats'][var]['rmse'] /
                                                                      persist_stats['stats'][var]['rmse'])
                except KeyError:
                    model_stats['stats'][var]['skillPersist'] = None
                try:
                    model_stats['stats'][var]['skillPersistNoBias'] = 1. - (model_stats['stats'][var]['rmseNoBias'] /
                                                                            persist_stats['stats'][var]['rmse'])
                except KeyError:
                    model_stats['stats'][var]['skillPersistNoBias'] = None

            # Remove NaN (not interpreted by json) and add to the large dictionary
            replace_nan_in_dict(model_stats)
            stats[stid][model] = model_stats

    # Write to the file
    with open(stats_file, 'w') as f:
        json.dump(stats, f)
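
# A minimal sketch (not part of the original module) of the skill-score form used above: 1 minus the ratio of
# the forecast RMSE to the reference RMSE (climatology or persistence). A score of 1 is a perfect forecast, 0 is
# no improvement over the reference, and negative values are worse than the reference. The numbers here are
# hypothetical.
def _example_skill_score(forecast_rmse=2.5, reference_rmse=5.0):
    return 1. - forecast_rmse / reference_rmse   # 0.5: the forecast halves the reference error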
def init(config, reset_old=False, no_climo=False):
    """
    Initializes new station IDs in the databases. Returns a list of all sites included in config that require
    historical data to be retrieved. Also creates a database if it does not exist.

    :param config:
    :param reset_old: if True, erases tables if they are too old
    :param no_climo: if True, does not check "CLIMO" tables
    """
    add_sites = []
    for data_binding in config['DataBinding'].keys():
        # Open the database and schema
        schema_name = config['DataBinding'][data_binding]['schema']
        database = config['DataBinding'][data_binding]['database']
        schema = get_object(schema_name).schema
        conn = connection(config, database)
        if conn is None:
            raise IOError('Error: db.init cannot connect to database %s' % database)
        cursor = conn.cursor()

        # Iterate through stations in the config
        for stid in config['Stations'].keys():
            add_site = False
            # Find the tables in the db and requested by the schema
            schema_table_names = ['%s_%s' % (stid.upper(), key) for key in schema.keys()]
            schema_table_structures = list(schema.values())
            # Schema must have the primary (datetime) key listed first
            date_keys = [schema[key][0][0] for key in schema.keys()]
            if config['debug'] > 50:
                print('db.init: found the following tables in schema:')
                print(schema_table_names)
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            sql_table_names = [table[0] for table in cursor.fetchall()]
            if config['debug'] > 50:
                print('db.init: found the following tables in sql db:')
                print(sql_table_names)

            # For each requested table, create it if it doesn't exist
            for t, table in enumerate(schema_table_names):
                if no_climo and 'CLIMO' in table.upper():
                    if config['debug'] > 9:
                        print('db.init: ignoring table %s' % table)
                    continue
                if not (table in sql_table_names):
                    # The table is missing, so we need to add the site to the output list
                    add_site = True
                    if config['debug'] > 0:
                        print('db.init: need to create table %s' % table)
                    # A string of all table columns and types
                    sqltypestr = ', '.join(["%s %s" % _type for _type in schema_table_structures[t]])
                    cursor.execute("CREATE TABLE %s (%s);" % (table, sqltypestr,))
                else:
                    # Check if data in the table are recent
                    time_now = datetime.utcnow()
                    if table != stid.upper() + '_CLIMO':
                        recent = timedelta(days=30)
                    else:
                        recent = time_now - datetime(last_leap_year(time_now), 12, 31)
                    key = date_keys[t]
                    try:
                        cursor.execute("SELECT %s FROM %s ORDER BY %s DESC LIMIT 1;" % (key, table, key))
                        last_dt = date_to_datetime(cursor.fetchone()[0])
                    except:
                        last_dt = None
                    if last_dt is None or (time_now - last_dt > recent):
                        # Old or missing data: drop the table and recreate it
                        add_site = True
                        if reset_old:
                            if config['debug'] > 0:
                                print('db.init: %s table too old, resetting it' % table)
                            cursor.execute("DROP TABLE %s;" % table)
                            sqltypestr = ', '.join(["%s %s" % _type for _type in schema_table_structures[t]])
                            cursor.execute("CREATE TABLE %s (%s);" % (table, sqltypestr,))
                        else:
                            if config['debug'] > 0:
                                print('db.init: %s table is old, adding to historical' % table)

            # Lastly, add the site if we need to rerun historical data
            if add_site and stid not in add_sites:
                add_sites.append(stid)
            elif config['debug'] > 0:
                print('db.init: nothing to do for station %s' % stid)

        conn.close()

    return add_sites
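
# A minimal sketch (not part of the original module) of the schema layout init() assumes: each table maps to a
# list of (column, type) tuples with the datetime key listed first, and the CREATE TABLE statement is joined
# from those tuples exactly as above. The table name and columns here are hypothetical.
def _example_create_table_sql():
    table_structure = [('DATETIME', 'TEXT'), ('HIGH', 'REAL'), ('LOW', 'REAL'), ('WIND', 'REAL'), ('RAIN', 'REAL')]
    sqltypestr = ', '.join(["%s %s" % _type for _type in table_structure])
    return "CREATE TABLE %s (%s);" % ('KSEA_VERIF', sqltypestr)
    # -> "CREATE TABLE KSEA_VERIF (DATETIME TEXT, HIGH REAL, LOW REAL, WIND REAL, RAIN REAL);"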