def upload_data_from_file(csv_file, from_csv=False):
    """insert all records contained in file to database

    Args:
        csv_file (str): full path of CSV file containing records
        from_csv (bool): whether to insert into database using CSV or ORM
            (CSV scales better)

    Returns:
        bool: success/exception
    """
    r = Repository()

    if from_csv:
        success = r.put_measurements_from_csv(csv_file=csv_file)
    else:
        measurements = []
        with open(csv_file, "r") as f:
            for line in f:
                site_id, param_code, date_time, value = line.strip().split(",")
                measurement = Measurement(
                    station_id=site_id,
                    metric_id=param_code,
                    date_time=dateutil.parser.parse(date_time),
                    value=float(value)
                )
                measurements.append(measurement)

        success = r.put_measurements_from_list(measurements=measurements)

    return success
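# a minimal usage sketch — illustrative only. The path is hypothetical, and
# the row layout (station, metric, ISO-8601 timestamp, value) is inferred
# from the parsing loop above:
#
#   12358000,00060,2018-05-01T12:00:00,342.0
#
upload_data_from_file("/tmp/measurements.csv", from_csv=True)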
def put_24hr_observations(session):
    """get yesterday's observations

    Args:
        session (Session): database session

    Returns:
        int: number of measurements added
    """
    # create a repo and pull all the weather stations from NOAA
    repo = Repository(session)
    stations = repo.get_all_stations(source='NOAA')

    # set up the day to retrieve
    yesterday = dt.datetime.now() - dt.timedelta(hours=24)
    yesterday = dt.datetime(year=yesterday.year, month=yesterday.month, day=yesterday.day)

    # apply the api request to each station
    content = stations.apply(
        lambda station: make_station_observation_request(station, yesterday.isoformat()),
        axis=1
    ).values

    # put them all in the db
    added = 0
    for station_measurements in content:
        repo.put_measurements_from_list(station_measurements)
        added += len(station_measurements)

    return added
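# illustrative only — assumes the Context/Session pattern used in
# fill_noaa_gaps below; settings.DATABASE is the production connection config
context = Context(settings.DATABASE)
session = context.Session()
count = put_24hr_observations(session)
print(f'added {count} measurements')
session.close()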
def get_usgs_site_ids():
    """retrieve USGS site ids from database

    Returns:
        [str]: list of site ids
    """
    r = Repository()
    sites = r.get_all_stations(source="USGS")
    site_ids = [s for s in sites["station_id"]]
    return site_ids
def compute_station_river_distances():
    """compute the distance from every river to every weather station"""
    repo = Repository()
    runs = repo.get_all_runs()
    stations = repo.get_all_stations()

    # for each run, find the closest USGS, NOAA, and SNOW station
    for run in runs.iterrows():
        distances = stations.apply(
            lambda row: get_distance_between_geo_points(
                run[1].put_in_latitude, run[1].put_in_longitude,
                row.latitude, row.longitude,
                run[1].run_id, row.station_id, row.source),
            axis=1).apply(pd.Series)

        distances.sort_values('distance', inplace=True)
        usgs_ = distances[distances.source == 'USGS'].iloc[0, :]
        noaa_ = distances[distances.source == 'NOAA'].iloc[0, :]
        snow_ = distances[distances.source == 'SNOW'].iloc[0, :]

        usgs = StationRiverDistance(station_id=usgs_.station,
                                    run_id=run[1].run_id,
                                    distance=round(float(usgs_.distance), 2))
        noaa = StationRiverDistance(station_id=noaa_.station,
                                    run_id=run[1].run_id,
                                    distance=round(float(noaa_.distance), 2))
        snow = StationRiverDistance(station_id=snow_.station,
                                    run_id=run[1].run_id,
                                    distance=round(float(snow_.distance), 2))

        repo.put_station_river_distances([usgs, noaa, snow])
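# get_distance_between_geo_points itself is not shown in this excerpt. Below
# is a minimal sketch of what it likely computes, assuming a great-circle
# (haversine) distance; the returned keys ('station', 'source', 'distance')
# are inferred from how the result is consumed above, and the implementation
# is illustrative, not necessarily the project's.
import math

def get_distance_between_geo_points(lat1, lon1, lat2, lon2,
                                    run_id, station_id, source):
    """illustrative haversine sketch — distance in km between two points"""
    r = 6371.0  # mean Earth radius, km
    p1, p2 = math.radians(lat1), math.radians(lat2)
    dp = math.radians(lat2 - lat1)
    dl = math.radians(lon2 - lon1)
    a = math.sin(dp / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dl / 2) ** 2
    d = 2 * r * math.asin(math.sqrt(a))
    return {'run_id': run_id, 'station': station_id,
            'source': source, 'distance': d}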
def get_noaa_predictions(run_id, session):
    """retrieve NOAA predictions for run

    Args:
        run_id (int): run
        session (Session): database session

    Returns:
        DataFrame: containing predictions, or None if the request failed
    """
    repo = Repository(session)
    run = repo.get_run(run_id)

    lat = run.put_in_latitude
    lon = run.put_in_longitude

    r = requests.get(f'https://api.weather.gov/points/{lat},{lon}/forecast/hourly')

    if r.status_code == 200 and len(r.content) > 10:
        return pd.DataFrame(r.json()['properties']['periods'])
    else:
        return None
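# quick usage sketch — illustrative only: the run id is hypothetical, and
# the column names reflect the api.weather.gov hourly-forecast period schema
preds = get_noaa_predictions(run_id=599, session=session)
if preds is not None:
    print(preds[['startTime', 'temperature']].head())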
def fill_noaa_gaps(start_date, end_date, db=settings.DATABASE):
    """use as needed to fill gaps in weather measurements

    Args:
        start_date (datetime): the start day, included in API calls
        end_date (datetime): the end day, inclusive

    Returns:
        int: total number of measurements added
    """
    context = Context(db)
    session = context.Session()

    repo = Repository(session)
    stations = repo.get_all_stations(source='NOAA')
    total = 0

    # loop through each day retrieving observations
    while start_date <= end_date:
        content = stations.apply(
            lambda station: make_station_observation_request(station, start_date.isoformat()),
            axis=1
        ).values

        # put them all in the db
        added = 0
        for station_measurements in content:
            try:
                repo.put_measurements_from_list(station_measurements)
            except SQLAlchemyError:
                session.rollback()
                continue

            added += len(station_measurements)
            station = station_measurements[0].station
            print(f'added {added} measurements for station_id {station} - {start_date.isoformat()}')

        start_date += dt.timedelta(days=1)
        total += added

    return total
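# illustrative only — backfill a hypothetical one-week gap using the
# default database from settings
fill_noaa_gaps(dt.datetime(2018, 5, 1), dt.datetime(2018, 5, 7))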
@classmethod
def setUpClass(cls):
    """perform at test class initialization

    Note:
        * ensure only a TContext is used, NEVER Context, or we'll lose
          all our hard-scraped data
        * any existing data in the mock db will be deleted
        * 5 random addresses are generated because nearly all unittests
          require addresses to exist as a foreign key dependency
    """
    cls.context = TContext()
    cls.session = cls.context.Session()
    cls.connection = psycopg2.connect(**settings.PSYCOPG_DB_TEST)

    cls.repo = Repository(session=cls.session, connection=cls.connection)
    cls.context.clear_dependency_data(cls.session)
    cls.context.generate_addresses(cls.session)
def compute_predictions(session):
    """compute and cache predictions for all runs

    Args:
        session: (Session) database connection

    Returns:
        True: if observations were successfully retrieved and inserted
        False: otherwise
    """
    try:
        arima = Arima(session)
        repo = Repository(session)
        runs = repo.get_all_runs_as_list()

        for run in runs:
            try:
                predictions = arima.arima_model(run.run_id)

                # note: the lower bound, point forecast, and upper bound are
                # all set to the same value — no prediction interval is stored
                to_add = [
                    Prediction(run_id=run.run_id,
                               timestamp=pd.to_datetime(d),
                               fr_lb=round(float(p), 1),
                               fr=round(float(p), 1),
                               fr_ub=round(float(p), 1))
                    for p, d in zip(predictions.values, predictions.index.values)
                ]

                repo.clear_predictions(run.run_id)
                repo.put_predictions(to_add)
                log(f'predictions for {run.run_id}-{run.run_name} added to db')

            except SQLAlchemyError as e:
                log(f'{run.run_id}-{run.run_name} failed - {[str(a) for a in e.args]}')
                session.rollback()
            except Exception as e:
                log(f'predictions for {run.run_id}-{run.run_name} failed - {[str(a) for a in e.args]}')

        return True

    except Exception as e:
        log(f'failed to compute daily predictions - {str(e.args)}')
        return False
class Arima:
    """Creates predictions for future flow rate using ARIMA model

    Args:
        session: (Session) db session
    """
    def __init__(self, session):
        self.repo = Repository(session)

    def get_data(self, run_id, metric_ids=None):
        """Retrieves data for selected run from database for past four
        years from current date using Repository.get_measurements function.

        Args:
            run_id (int): id of run for which model will be created
            metric_ids ([str]) - optional: list of metric ids to include

        Returns:
            DataFrame: containing four years of measurements up to current
            date for the given run
        """
        now = datetime.datetime.now()
        end = datetime.datetime(now.year, now.month, now.day)
        start = end - datetime.timedelta(days=4 * 365)

        test_measures = self.repo.get_measurements(run_id=run_id,
                                                   start_date=start,
                                                   end_date=end,
                                                   metric_ids=metric_ids)
        return test_measures

    def daily_avg(self, run_id):
        """Creates dataframe needed for modelling

        Calls Arima.get_data to retrieve measurements for given run and
        creates a dataframe with daily averages for flow rate and
        exogenous predictors.

        Args:
            run_id (int): id of run for which model will be created

        Returns:
            DataFrame: containing daily measurements
        """
        time_series = self.get_data(run_id=run_id,
                                    metric_ids=['00003', '00060', '00001'])
        if len(time_series) == 0:
            return None

        # daily total precipitation (metric 00003)
        precip = time_series[time_series.metric_id == '00003']
        precip['date_time'] = pd.to_datetime(precip['date_time'], utc=True)
        precip.index = precip['date_time']
        precip_daily = precip.resample('D').sum()

        # daily mean flow rate (metric 00060)
        flow = time_series[time_series.metric_id == '00060']
        flow['date_time'] = pd.to_datetime(flow['date_time'], utc=True)
        flow.index = flow['date_time']
        flow_daily = flow.resample('D').mean()

        # daily mean temperature (metric 00001)
        temp = time_series[time_series.metric_id == '00001']
        temp['date_time'] = pd.to_datetime(temp['date_time'], utc=True)
        temp.index = temp['date_time']
        temp_daily = temp.resample('D').mean()

        time_series_daily = temp_daily \
            .merge(flow_daily, how='inner', left_index=True, right_index=True) \
            .merge(precip_daily, how='inner', left_index=True, right_index=True)
        time_series_daily.columns = ['temp', 'flow', 'precip']
        time_series_daily = time_series_daily.dropna()

        return time_series_daily

    def arima_model(self, run_id):
        """Creates flow rate predictions using ARIMA model.

        Calls Arima.daily_avg to retrieve data for given run, then creates
        flow rate predictions by using statsmodels functions
        arma_order_select_ic and ARIMA. Three weeks of past flow rate data
        are also returned for plotting purposes.
        Args:
            run_id (int): id of run for which model will be created

        Returns:
            DataFrame: containing time-series flow rate predictions for
            next 7 days and historical flow rate for past 21 days
        """
        # Retrieve data for modelling
        measures = self.daily_avg(run_id)

        # don't try to compute if there aren't any measures
        if measures is None:
            return pd.DataFrame()

        # Take past 7-day average of exogenous predictors to use for
        # future prediction
        exog_future_predictors = pd.concat(
            [measures.iloc[-7:, :].mean(axis=0).to_frame().T] * 7,
            ignore_index=True)

        try:
            # Find optimal order for model
            params = arma_order_select_ic(measures['flow'], ic='aic')
            try:
                # Build and fit model
                mod = ARIMA(measures['flow'],
                            order=(params.aic_min_order[0], 0,
                                   params.aic_min_order[1]),
                            exog=measures[['temp', 'precip']]).fit()
                prediction = pd.DataFrame([
                    mod.forecast(
                        steps=7,
                        exog=exog_future_predictors[['temp', 'precip']],
                        alpha=0.05)[0]
                ]).T
            except Exception:
                # If model doesn't converge, return "prediction"
                # of most recent day
                prediction = pd.concat([measures.iloc[-1, :].to_frame().T] * 7,
                                       ignore_index=True)['flow']
        except ValueError:
            # If order fitting doesn't converge, return "prediction"
            # of most recent day
            prediction = pd.concat([measures.iloc[-1, :].to_frame().T] * 7,
                                   ignore_index=True)['flow']

        # Add dates and return past 21 days for plotting
        prediction_dates = [
            measures.index[-2] + datetime.timedelta(days=x)
            for x in range(0, 7)
        ]
        prediction.index = prediction_dates
        past = measures['flow'][-22:-1]
        prediction = pd.concat([past[:-1], prediction], axis=0)

        return prediction

    def get_min_max(self, run_id):
        """Gets min and max runnable flow rate for river run to use for plots

        Args:
            run_id: id of run for which model will be created

        Returns:
            levels: minimum and maximum runnable flow rate for river
        """
        runs = self.repo.get_all_runs()
        levels = runs[['min_level', 'max_level']][runs['run_id'] == run_id]
        return levels
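# short usage sketch — illustrative only: `session` is assumed to come from
# the Context/Session setup shown elsewhere, and run_id 42 is hypothetical
arima = Arima(session)
forecast = arima.arima_model(run_id=42)  # 21 days of history + 7-day forecast
levels = arima.get_min_max(run_id=42)    # runnable flow-rate bounds for plotting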
import dash

from riverrunner.repository import Repository
from riverrunner import settings

# IP address for running application
HOST_IP = '192.168.80.13'

# enable for application debugging features
DEBUG = False

# mapping from river's predicted status to a color code
COLOR_MAP = dict(unknown='#41434C',
                 optimal='#4254CC',
                 fair='#8F8A18',
                 not_recommended='#A63617')

repo = Repository()
runs = repo.get_all_runs_as_list()
runs = [run for run in runs if run.todays_runability != -2]

options = [r.select_option for r in runs]
options.sort(key=lambda r: r['label'])

# create a new Dash app adding custom fonts and CSS
app = dash.Dash()
font_url = 'https://fonts.googleapis.com/css?family=Montserrat|Permanent+Marker'
app.css.append_css({'external_url': font_url})


def color_scale(x):
    """prediction binning method

    bins river predictions into discrete categories for color coding
"""test_model: runs stationarity tests and acf/pacf tests and then creates
ARIMA model for one run and plots results
"""
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import arma_order_select_ic

from riverrunner.repository import Repository

REPO = Repository()


def daily_avg(time_series):
    """Creates dataframe needed for modelling

    Takes time series with measurements on different timeframes and creates
    a dataframe with daily averages for flow rate and exogenous predictors.

    Args:
        time_series: dataframe with metrics for one run_id, assumes output
            from get_measurements function

    Returns:
        DataFrame: containing daily measurements
    """
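# The body of daily_avg is elided in this excerpt. For context, below is a
# minimal sketch of the stationarity test the module docstring describes,
# using the adfuller import above; the 0.05 threshold is a conventional
# assumption, not necessarily what test_model uses.
def is_stationary(series, alpha=0.05):
    """Augmented Dickey-Fuller test: treat the series as stationary when
    the unit-root null is rejected (p-value < alpha)"""
    result = adfuller(series.dropna())
    p_value = result[1]
    return p_value < alpha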