def main():
    """Split the daily dataset into a train set (2007-2009) and a test set
    (2010), aligned to whole Sunday-Saturday weeks, and persist both.

    Reads data/data_daily.pkl; writes data/data_train.pkl and
    data/data_test.pkl via the project's pickle helpers.
    """
    # LOAD DATA (renamed from `dir`, which shadowed the builtin)
    data_dir = 'data'
    data = load_from_pickle(data_dir, 'data_daily.pkl')

    # SPLIT DATA using standard weeks (Sunday - Saturday)
    day1_train = '2006-12-17'     # first Sunday in dataset
    day1_test = '2010-1-3'        # first Sunday in 2010
    day_last_test = '2010-11-20'  # last Saturday in dataset

    # single combined boolean mask per split replaces the two-step
    # chained filtering; same rows selected
    data_train = data[(data.index >= day1_train) & (data.index < day1_test)]
    data_test = data[(data.index >= day1_test) & (data.index <= day_last_test)]

    # METRICS: durations in days and (possibly fractional) weeks
    ndays_train = data_train.shape[0]
    ndays_test = data_test.shape[0]
    print('\ntraining set duration: {} days, {} weeks'.format(ndays_train, ndays_train / 7))
    print('test set duration: {} days, {} weeks\n'.format(ndays_test, ndays_test / 7))

    save_to_pickle(data_train, data_dir, 'data_train.pkl')
    save_to_pickle(data_test, data_dir, 'data_test.pkl')
# 示例#2 (Example #2) — scrape artifact, converted to comment
# 0
import random
import matplotlib.pyplot as plt

from DataTools.pickle import save_to_pickle, load_from_pickle


def n_random_integers(n, low=0, high=10):
    """Return a 1-D numpy array of `n` random integers drawn uniformly
    from the inclusive range [low, high] using random.randint.

    Parameters
    ----------
    n : int
        Number of values to draw.
    low, high : int
        Inclusive bounds for each draw.
    """
    # comprehension replaces the manual append loop; randint is called
    # in the same order, so seeded sequences are unchanged
    return np.array([random.randint(low, high) for _ in range(n)])


if __name__ == '__main__':
    # Load the 30-minute resampled household-power DataFrame.
    df_30 = load_from_pickle('data', 'data_30min.pkl')

    # single timeseries of global power
    kw = df_30.Global_active_power  # power in kW

    # clip to whole days (inclusive bounds on 30-minute timestamps)
    firstday = '2006-12-17 00:00:00'
    lastday = '2010-11-25 23:30:00'

    kw = kw[kw.index >= firstday]
    kw = kw[kw.index <= lastday]

    # array of single day timeseries
    # NOTE(review): `datetime` is not imported in this fragment — the
    # line below raises NameError unless `import datetime` exists
    # elsewhere in the assembled file; confirm.
    delta_t = kw.index[1] - kw.index[0]  # size of timestep
    n_ts = int(datetime.timedelta(days=1) / delta_t)  # number of timesteps per day (48 for 30-min data)
    n_rows = int(len(kw) / n_ts)  # number of rows (whole days in the clipped series)
import os
import pandas as pd
import numpy as np
from DataTools.pickle import save_to_pickle, load_from_pickle

import matplotlib.pyplot as plt
import seaborn as sns

if __name__ == '__main__':
    # Load a single pickled timeseries (pandas Series) and work on the
    # underlying numpy array.
    series = load_from_pickle('data', 'data_369.pkl')
    vals = series.values

    # remove zeros before ts data: argmax returns the index of the first
    # True, i.e. the first strictly positive value; drop everything before
    idx = np.argmax(vals > 0)
    vals = vals[idx:]

    # smooth with moving average filter
    # FIX: pd.rolling_mean was removed from pandas (deprecated 0.18,
    # removed 0.23) — the modern equivalent is Series.rolling(...).mean();
    # .values restores the ndarray the old API returned
    mva = pd.Series(vals).rolling(500).mean().values

    # choose raw or pre-smoothed data
    data = vals

    # split dataset
    split_idx = 80000
    window = 150
    train = data[:split_idx]
    test = data[split_idx:]
    test_window = test[:window]

    # statsmodels autoregression
    run_ar = False
    # NOTE(review): fragment starts mid-function — the enclosing `def`
    # (and the definitions of n_timesteps, n_features, n_outputs) was
    # lost in the scrape; presumably a build_model(...) helper.
    # Builds a small Keras 1D-CNN regressor: Conv -> Pool -> Flatten ->
    # Dense(10) -> linear Dense output, MSE loss with Adam.
    model = Sequential()
    model.add(
        Conv1D(filters=16,
               kernel_size=3,
               activation='relu',
               input_shape=(n_timesteps, n_features)))  # convolve over the time axis
    model.add(MaxPooling1D(pool_size=2))  # halve temporal resolution
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(n_outputs))  # linear activation: regression output
    model.compile(loss='mse', optimizer='adam')
    return model


if __name__ == '__main__':
    # Load the pre-split train/test DataFrames written earlier in the
    # pipeline (data_train.pkl / data_test.pkl).
    train_df = load_from_pickle('data', 'data_train.pkl')
    test_df = load_from_pickle('data', 'data_test.pkl')

    # TRANSFORM to np array
    # columns: ['Global_active_power', 'Global_reactive_power', 'Voltage',
    #   'Global_intensity', 'Sub_metering_1', 'Sub_metering_2',
    #   'Sub_metering_3', 'Sub_metering_4']
    train_data = train_df.values
    test_data = test_df.values

    # TRANSFORM to CNN input shape
    # 1D CNN input shape: [n_samples, n_timesteps_per_sample, n_features]
    #                       e.g. [159 (week), 7(days), 1 (feature)] (or 8 features)
    # assumes the row count is an exact multiple of 7 — the earlier split
    # was aligned to whole Sunday-Saturday weeks
    train_data = train_data.reshape(int(train_data.shape[0] / 7), 7,
                                    train_data.shape[1])
    # NOTE(review): the statement below is truncated by the scrape — the
    # final argument and closing paren of this reshape call are missing.
    test_data = test_data.reshape(int(test_data.shape[0] / 7), 7,
    # NOTE(review): fragment starts mid-function — the `def` header (and
    # the meaning of parameter `n`) was lost in the scrape; from the call
    # sites this is presumably timeseries_from_staggered_timeseries_sets(data, n).
    # indices that contain independent sets of values
    ii = np.linspace(n, len(data), int(len(data) / 7))
    ii = np.insert(ii, 0, 0)  # prepend start index 0
    ii = np.delete(ii, -1)    # drop the final (out-of-range) index
    ii = ii.astype(int)       # linspace yields floats; indexing needs ints

    yy = []
    for i in ii:
        yy.append(list(data[i]))  # append lists
        # NOTE(review): `flat` is recomputed on every iteration; only the
        # last value is used — flattening could be hoisted after the loop
        flat = [item for sublist in yy for item in sublist]
    return flat


if __name__ == '__main__':
    # load output and test set
    # (true, pred, errors) is the tuple saved by the evaluation script
    # as output/output_1.pkl
    (true, pred, errors) = load_from_pickle('output', 'output_1.pkl')
    test_df = load_from_pickle('data', 'data_test.pkl')
    test = test_df.values

    # get single timseries for true and pred (pred is first day predition)
    n_days = 7
    yy_true = timeseries_from_staggered_timeseries_sets(true, n_days)
    yy_pred = timeseries_from_staggered_timeseries_sets(pred, n_days)

    # plot true vs predicted
    # NOTE(review): fragment ends here — the savefig/show call that
    # presumably uses `fname` was cut off by the scrape.
    fname = 'output_1_predictions'
    plt.plot(yy_true, 'b', label='true')
    plt.plot(yy_pred, 'orange', label='predicted', linewidth=2)
    plt.ylabel('power usage [kW]')
    plt.xlabel('test period [days]')
    plt.legend()
# 示例#6 (Example #6) — scrape artifact, converted to comment
# 0
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from collections import Counter
import matplotlib.pyplot as plt

from DataTools.pickle import save_to_pickle, load_from_pickle

def n_random_integers(n, low=0, high=10):
    """Draw `n` uniform random integers in the inclusive range
    [low, high] with random.randint and return them as a numpy array.
    """
    draws = [random.randint(low, high) for _ in range(n)]
    return np.array(draws)

if __name__=='__main__':
    # Load the per-day array: one row per day, one column per 30-min slot.
    data = load_from_pickle('data','daily_array_all.pkl')
    # shape: (1440, 48)

    # cluster as gaussian mixture
    # NOTE(review): no random_state is set, so cluster labels are not
    # reproducible across runs — confirm whether that matters here.
    X = data
    n = 10
    gmm = GaussianMixture(n_components=n)
    gmm.fit(X)
    y = gmm.predict(X)            # hard cluster label per sample
    probs = gmm.predict_proba(X)  # soft assignments, shape (n_samples, n)

    # sort results into clusters based on labels
    # NOTE(review): "lables" is a typo in the name; also this fragment is
    # truncated — the loop body below was cut off by the scrape.
    def clusters_from_lables(X,y):
        labels = np.unique(y)
        clusters = []
        for label in labels:
# 示例#7 (Example #7) — scrape artifact, converted to comment
# 0
        # NOTE(review): fragment starts mid-function — presumably the tail
        # of calc_rmse_error(true, pred): `mse` is computed per forecast in
        # the missing part; each RMSE is collected into `errors`.
        rmse = np.sqrt(mse)
        errors.append(rmse)
    return np.array(errors)


if __name__ == '__main__':
    # Rebuild the trained network: architecture from JSON, weights from HDF5.
    with open('models/model_1.json', 'r') as f:
        model_json = json.load(f)
    model = model_from_json(model_json)
    model.compile(loss='mse', optimizer='adam')
    model.load_weights('models/model_1.h5')
    print('model and weights loaded')

    # Training inputs/targets plus the held-out test DataFrame.
    X_train = load_from_pickle('data_Xy', 'X_train.pkl')
    y_train = load_from_pickle('data_Xy', 'y_train.pkl')
    test_df = load_from_pickle('data', 'data_test.pkl')
    test = test_df.values

    # Univariate evaluation: keep only the first feature column, then
    # walk forward over the test period one 7-day window at a time.
    feat_col = 0
    test = test[:, feat_col]
    true, pred = walk_forward_validation(test, model, n_input=7)

    # Per-forecast RMSE, persisted alongside the raw true/pred arrays.
    errors = calc_rmse_error(true, pred)
    save_to_pickle((true, pred, errors), 'output', 'output_1.pkl')