Пример #1
0
def load_dataframe_from_sql(river, limit=-1):
    """Load river readings from the SQLite database into a pandas DataFrame.

    Args:
        river: Name of the table to read. Table names cannot be bound as SQL
            parameters, so this value is interpolated into the query text and
            must come from trusted code, never from user input.
        limit: Maximum number of rows to fetch, newest first. A negative
            value (the default) returns every row, because SQLite treats a
            negative LIMIT as "no limit".

    Returns:
        A DataFrame indexed by ``timestamp`` (parsed with ``pd.to_datetime``
        and sorted ascending) with columns ``cum_rain``, ``level`` and
        ``forecast``. The database column ``rain`` is exposed as ``cum_rain``.
    """
    from contextlib import closing

    if limit > 0:
        logger.info(
            "loading df for river {river} from sql with row limit of {limit}".
            format(river=river, limit=limit))
    else:
        logger.info(
            "loading entire df for river {river} from sql".format(river=river))

    # NOTE(review): `river` is formatted into the statement; only `limit`
    # can be passed as a bound parameter.
    query = """
            SELECT timestamp, rain, level, forecast
                from {river}
            ORDER BY timestamp DESC
            LIMIT ?
        """.format(river=river)

    # `closing` guarantees the cursor and connection are released even when
    # the query raises; a bare sqlite3 connection used as a context manager
    # only manages the transaction, it does not close the connection.
    with closing(sqlite3.connect(DATABASE_PATH)) as con:
        with closing(con.cursor()) as cur:
            cur.execute(query, (int(limit),))
            result = cur.fetchall()

    df = pd.DataFrame(result,
                      columns=['timestamp', 'cum_rain', 'level', 'forecast'])
    # Rows arrive newest-first; index by parsed timestamp and restore
    # chronological order.
    df.timestamp = pd.to_datetime(df.timestamp)
    df = df.set_index('timestamp')
    df = df.sort_index()

    return df
Пример #2
0
def preprocessing(df):
    """Put the frame on a regular 15-minute grid and derive the model rain.

    Steps, in order:
      1. Reindex onto a 15-minute range running from the first to the last
         timestamp in ``df``; slots that were missing become empty rows.
      2. Add ``rain`` as the two-row difference of ``cum_rain``. Where that
         difference is negative (the cumulative counter was reset) the
         cumulative value itself is used instead.
      3. Add ``model_rain``: ``rain`` up to and including the latest row
         that has a ``cum_rain`` value, ``forecast`` after it, then filled
         with ``Series.interpolate()``.

    Returns the new DataFrame.
    """
    logger.debug("Fill in missing timestamps by reindexing")

    first_stamp = min(df.index)
    last_stamp = max(df.index)
    df = df.reindex(pd.date_range(first_stamp, last_stamp, freq='15Min'))

    logger.debug("Convert cumulative rain to actual rain")

    df['rain'] = df['cum_rain'].diff(periods=2)

    # A negative difference means the cumulative counter reset, so the
    # cumulative reading is itself the rain since the reset.
    counter_reset = df['rain'] < 0
    df.loc[counter_reset, 'rain'] = df.loc[counter_reset, 'cum_rain']

    has_rain_reading = df.cum_rain.notnull()
    latest_rain_time = max(df.index[has_rain_reading])
    logger.info('latest rain update at: ' + str(latest_rain_time))

    logger.debug("Concat rain and forecast to create model_rain")

    measured = df.loc[df.index <= latest_rain_time, 'rain']
    forecasted = df.loc[df.index > latest_rain_time, 'forecast']
    df['model_rain'] = pd.concat([measured, forecasted])

    logger.debug("interpolate model_rain")

    df['model_rain'] = df['model_rain'].interpolate()

    return df
Пример #3
0
def run(testing):
    """Run the whole pipeline once for the "dart" river.

    Requests up to 130 rows from the database, preprocesses them, runs the
    pre-model checks and the model, builds the export and passes it to
    ``upload_export`` together with the ``testing`` flag. The elapsed wall
    time is logged at debug level.
    """
    started_at = time.time()

    frame = preprocessing(load_dataframe_from_sql(river="dart", limit=130))

    # Floor "now" to the start of the current 15-minute slot.
    now = time.time()
    slot_seconds = 15 * 60
    current_time = pd.to_datetime(now - (now % slot_seconds), unit='s')
    logger.info('Current_time: ' + str(current_time))

    pre_model_checks(frame, current_time)

    frame = model(frame)
    output = model_export(frame, current_time)
    upload_export(testing, output)

    elapsed = time.time() - started_at
    logger.debug("---%s seconds --- taken to run model" % elapsed)
Пример #4
0
def model_export(df, current_time):
    """Build the export payload from a modelled dataframe.

    Args:
        df: DataFrame indexed by timestamp with ``level``, ``predict`` and
            ``model_rain`` columns.
        current_time: pandas Timestamp of the current slot (its ``.value``
            attribute is read for the payload).

    Returns:
        A dict with keys:
          * ``current_time``: ``current_time.value / 1000``
          * ``current_level``: the rounded ``level`` at ``current_time``,
            falling back to ``predict`` when the level is missing, or
            ``None`` when the frame has no row for ``current_time``
          * ``text``: the status message
          * ``values``: one dict per row with ``timestamp``, ``rain``,
            ``level`` and ``predict``; missing values are replaced by None.
    """
    # Round export columns
    df = df.round({'level': 3, 'predict': 3, 'model_rain': 1})

    try:
        current_row = df.loc[pd.to_datetime(current_time, unit='s')]
    except KeyError:
        # Log through the module logger rather than printing. The previous
        # handler formatted the time with time.gmtime(), which cannot take a
        # pandas Timestamp, so it failed instead of recovering.
        logger.warning(
            "Can't find row in df that matches current time: %s",
            current_time)
        current_level = None
    else:
        current_level = current_row['level']
        if pd.isnull(current_level):
            current_level = current_row['predict']

    logger.info('currenct level: ' + str(current_level))

    # Keep the numeric frame for threshold comparisons; the export copy has
    # missing values replaced by None, which must not be compared with ">".
    export_df = df.where((pd.notnull(df)), None)
    timestamp_vals = [stamp.value / 1000 for stamp in export_df.index.tolist()]
    values = [{
        'timestamp': stamp,
        'rain': rain,
        'level': level,
        'predict': predict
    } for stamp, rain, level, predict in zip(
        timestamp_vals, export_df.model_rain.tolist(),
        export_df.level.tolist(), export_df.predict.tolist())]

    # An unknown level (None) is treated as "not above any threshold", so
    # only the forecast can raise the status.
    level_known = current_level is not None
    if level_known and current_level > 1.5:
        text = "THE DART IS MASSIVE"
    elif level_known and current_level > 0.7:
        text = 'YES'
    else:
        upcoming = df[(df.index > current_time)
                      & (df.index < current_time + delay) &
                      (df.predict > 0.7)]
        if upcoming.empty:
            text = 'NO'
        else:
            text = "THE DART WILL BE UP SHORTLY"

    logger.info("OUTPUT TEXT: " + text)

    output = {
        'current_time': current_time.value / 1000,
        'current_level': current_level,
        'text': text,
        'values': values,
    }

    return output
Пример #5
0
def model(df, from_latest_level=True):
    """Fill ``storage`` and ``predict`` columns by stepping the model forward.

    Args:
        df: DataFrame indexed by timestamp with ``level`` and ``model_rain``
            columns. It is modified in place and also returned.
        from_latest_level: When true, start from the latest row that has a
            level; otherwise start from the sixth row that has a level.

    The initial storage is ``f_inv(g_inv(starting_level))``. For every row
    after the starting time, the prediction is ``g(f(storage))`` computed
    from the storage before the update, and the storage is then advanced by
    the ``model_rain`` found ``delay`` earlier minus ``f(storage)``.
    """
    logger.info('*** RUNNING MODEL ***')
    level_times = df.index[df.level.notnull()]
    # Latest level update for a live run, sixth known level for a full run.
    starting_time = max(level_times) if from_latest_level else level_times[5]
    starting_level = df.loc[starting_time].level

    logger.info('Run model from: ' + str(starting_time))
    logger.info('Starting level update: ' + str(starting_level))

    df['storage'] = np.nan
    df['predict'] = np.nan

    # Seed the recurrence with the storage implied by the starting level.
    storage = f_inv(g_inv(starting_level))
    df.loc[starting_time, 'storage'] = storage

    # Walk forward through every timestamp after the starting time.
    later_steps = df.index[df.index > pd.Timestamp(starting_time)]
    for step in later_steps:
        rain = df.loc[step - delay, 'model_rain']
        # The prediction uses the storage *before* this step's update.
        predict = g(f(storage))
        storage = storage + rain - f(storage)
        df.loc[step, 'storage'] = storage
        df.loc[step, 'predict'] = predict

    return df
Пример #6
0
def rnn_model(testing_mode, testing_timestamp):
    """Run the saved RNN model and build the export payload.

    Args:
        testing_mode: When truthy, replay history around
            ``testing_timestamp``: the whole table is loaded, rows between
            two days before and one day after that time are kept, ``level``
            is blanked for rows later than one day before it and
            ``cum_rain`` for rows later than 30 minutes before it.
            Otherwise the current time floored to 15 minutes is used and up
            to 130 rows are loaded.
        testing_timestamp: Value passed to ``pd.to_datetime`` to obtain the
            replay time; only read when ``testing_mode`` is truthy.

    Returns:
        A dict with keys ``current_time``, ``current_level``, ``text``,
        ``values`` (a list with one dict per output row) and ``broken``
        (always False).
    """
    if testing_mode:
        current_time = pd.to_datetime(testing_timestamp)
        df = load_dataframe_from_sql(river=RIVER_NAME, limit=-1)
        in_window = ((df.index > current_time - pd.Timedelta('2days')) &
                     (df.index < current_time + pd.Timedelta('1days')))
        # Copy so the blanking below edits an independent frame rather than
        # a filtered slice of the loaded one.
        df = df[in_window].copy()
        df.loc[(df.index > current_time - pd.Timedelta('1days')),
               "level"] = None
        df.loc[(df.index > current_time - pd.Timedelta('30minutes')),
               "cum_rain"] = None
    else:
        # Current time rounded down to the nearest 15 minutes.
        current_time = time.time()
        current_time = pd.to_datetime(current_time - (current_time %
                                                      (15 * 60)),
                                      unit='s')
        df = load_dataframe_from_sql(river=RIVER_NAME, limit=130)
    logger.info("current_time: {value}".format(value=current_time))

    latest_level_update_timestamp = max(df[df.level.notnull()].index)
    latest_rain_time = max(df.index[df.cum_rain.notnull()])
    latest_forecast_rain_time = max(df.index[df.forecast.notnull()])
    logger.info("latest_level_update_timestamp: {value}".format(
        value=latest_level_update_timestamp))
    logger.info("latest_rain_time: {value}".format(value=latest_rain_time))
    logger.info("latest_forecast_rain_time: {value}".format(
        value=latest_forecast_rain_time))

    df = df[df.index <= latest_forecast_rain_time]

    # Fill in missing timestamps by reindexing onto a 15-minute grid.
    rng = pd.date_range(min(df.index), max(df.index), freq='15Min')
    df = df.reindex(rng)

    # Row counts up to each "latest update"; used below as positional
    # boundaries into the model input/output arrays.
    num_level_updates = df[df.index <= latest_level_update_timestamp].shape[0]
    num_rain_updates = df[df.index <= latest_rain_time].shape[0]
    num_forecast_rain_updates = df[
        df.index <= latest_forecast_rain_time].shape[0]

    logger.info("num_level_updates: {value}".format(value=num_level_updates))
    logger.info("num_rain_updates: {value}".format(value=num_rain_updates))
    logger.info("num_forecast_rain_updates: {value}".format(
        value=num_forecast_rain_updates))

    # Convert cumulative rain to actual rain.
    df['rain'] = df['cum_rain'].diff(periods=2)

    # Negative differences mark a reset of the cumulative counter, so the
    # cumulative value itself is used there.
    counter_reset = df['rain'] < 0
    df.loc[counter_reset, 'rain'] = df.loc[counter_reset, 'cum_rain']

    # Measured rain up to the latest rain update, forecast afterwards.
    df['model_rain'] = pd.concat((df[df.index <= latest_rain_time]["rain"],
                                  df[df.index > latest_rain_time]["forecast"]))

    df['model_rain'] = df['model_rain'].interpolate()
    df['model_rain'] = df['model_rain'].fillna(0)

    x = df.model_rain.values
    y = df.level.fillna(0).values
    timestamps = df.index.values

    # 1 for rows that have a known level, 0 for rows to be predicted.
    update_vector = np.zeros(x.shape)
    update_vector[0:num_level_updates] = 1

    # Model input columns: rain, update flag, level masked by the flag.
    x = np.column_stack([x, update_vector, update_vector * y])
    y = np.column_stack([y])

    model_name = "production_rnn"
    path_to_model = os.path.join(FDIR, model_name)
    predict_fn = tf.contrib.predictor.from_saved_model(path_to_model)
    predict = predict_fn({"x": [x]})["predictions"]

    # Split the rain input back into measured and forecast parts, padding
    # the other part with NaN so both keep the full length.
    num_rows = x.shape[0]
    rain = np.concatenate(
        (x[:num_rain_updates, 0],
         np.full(num_rows - num_rain_updates, np.nan)))
    forecast = np.concatenate(
        (np.full(num_rain_updates, np.nan), x[num_rain_updates:, 0]))
    level = y[:, 0]
    level[num_level_updates:] = None
    predict[:num_level_updates - 1] = None

    # create output json
    output_df = pd.DataFrame({
        "timestamp": timestamps,
        "rain": rain,
        "forecast": forecast,
        "level": level,
        "predict": predict
    })

    output_df = output_df.round({
        'level': 3,
        'predict': 3,
        'rain': 1,
        'forecast': 1
    })
    output_df = pd.DataFrame(output_df).replace({np.nan: None})

    at_current_time = output_df[output_df.timestamp == current_time]
    if latest_level_update_timestamp == current_time:
        current_level = at_current_time["level"].values[0]
    else:
        try:
            current_level = at_current_time["predict"].values[0]
        except IndexError:
            # No output row for the current slot: report the level as
            # unknown instead of failing the whole export.
            logger.warning("no output row for current_time %s", current_time)
            current_level = None

    logger.info('currenct level: ' + str(current_level))

    if current_level is None:
        text = "?"
    elif current_level > MAXIMUM_THRESHOLD:
        text = "THE DART IS MASSIVE"
    elif current_level > MIMIMUM_THRESHOLD:
        text = 'YES'
    elif output_df[(output_df.timestamp > current_time)
                   & (output_df.timestamp <
                      (current_time + pd.Timedelta('1hours')))]["predict"].max(
                      ) > MIMIMUM_THRESHOLD:
        # Below threshold now, but predicted to exceed it within the hour.
        text = "THE DART WILL BE UP SHORTLY"
    else:
        text = 'NO'

    logger.info("OUTPUT TEXT: " + text)

    output_df.timestamp = [
        timestamp.value / 1000 for timestamp in output_df.timestamp.tolist()
    ]
    # Materialise as a list: on Python 3 dict.values() is a view, which
    # cannot be indexed or JSON-serialised.
    values = list(output_df.T.to_dict().values())

    output = {
        'current_time': current_time.value / 1000,
        'current_level': current_level,
        'text': text,
        'values': values,
        'broken': False,
    }
    return output