    twitterColumns = [0, 2]
    pollColumns = [1, 3, 4, 5, 6, 7, 8, 9]  # avdate, Remain (norm), Leave (norm)
    lh, rh, p = m.getPanda(twitterColumns, pollColumns)
    h_agg, p_agg, p_var = m.aggregate(lh,
                                      rh,
                                      p,
                                      splitPolls=False,
                                      interpolate=interpolate)
    _, p_onl, p_tel = m.aggregate(lh,
                                  rh,
                                  p,
                                  splitPolls=True,
                                  interpolate=interpolate)

    kalmanData = m.getKalmanData(p_agg, h_agg)
    kalmanData_o = m.getKalmanData(p_onl, h_agg)
    kalmanData_t = m.getKalmanData(p_tel, h_agg)

    # 1. Moving Average
    df_orig = kalmanData
    df_ma = df_orig.rolling(3, center=True, closed='both').mean()

    # 2. Loess Smoothing (5% and 15%)
    print("LOESS SMOETHING")
    df_loess_5 = pd.DataFrame(lowess(df_orig,
                                     np.arange(len(df_orig)),
                                     frac=0.05)[:, 1],
                              index=df_orig.index,
                              columns=['remain_perc'])
    df_loess_15 = pd.DataFrame(lowess(df_orig,
                                      np.arange(len(df_orig)),
                                      frac=0.15)[:, 1],
                               index=df_orig.index,
                               columns=['remain_perc'])

    ### Load in data and normalise
    twitterColumns = [0, 2]
    pollColumns = [1, 3, 4, 5, 6, 7, 8, 9]  # avdate, Remain (norm), Leave (norm)
    lh, rh, p = m.getPanda(twitterColumns, pollColumns)
    h_agg, p_agg, p_var = m.aggregate(lh,
                                      rh,
                                      p,
                                      splitPolls=False,
                                      interpolate=True)
    p_orig = p_agg.copy()
    h_orig = h_agg.copy()
    p_agg = m.shift_polls(p_agg, tPolls, addFake=addFake)
    h_agg = m.shift_tweets(h_agg, tTwitter)

    kalmanData = m.getKalmanData(p_agg, h_agg)
    startDate = kalmanData.index[0] + dt.timedelta(days=startTrain)
    endDate = dt.datetime(day=23, month=6, year=2016)

    ### FIND KF VARIABLES: 1) R and 2) P0
    # find R: use the mean poll variance as the measurement-noise variance
    preds = []
    R_r = p_var['Remain'].mean()
    r = kalmanData['remain_perc'].to_numpy(dtype=float)
    # find P0: initial error covariance, and the corresponding Kalman gain
    P0_r = r.var() / 10
    H = 1
    K_r = P0_r * H / (H * P0_r * H + R_r)  # analytic gain
    K_r = 0.95                             # overridden with a fixed gain

    ### KF MODEL ###
    # apply interpolation
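
    # A minimal sketch (not from the original source) of a scalar Kalman
    # filter pass over the poll series, assuming a random-walk state model;
    # Q, x_est, P and kf_preds are illustrative names, and only r, R_r, P0_r
    # and H come from the code above.
    Q = 1e-4                  # assumed process-noise variance
    x_est = r[0]              # initialise the state with the first observation
    P = P0_r                  # initial estimation-error covariance
    kf_preds = []
    for z in r[1:]:
        # predict step: random walk carries the state over, uncertainty grows
        x_pred = x_est
        P = P + Q
        # update step: blend the prediction with the new observation z
        K = P * H / (H * P * H + R_r)
        x_est = x_pred + K * (z - H * x_pred)
        P = (1 - K * H) * P
        kf_preds.append(x_est)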
Example #3
                 2) evaluates the fit of the optimal model using one-out predictions without retraining
                 3) calculates the performance of the optimal model when increasing training set size """
if __name__ == '__main__':

    startTrain = 53 # index at which to start training (corresponds to 1st of March with interpolation)
    n_lag = 1       # number of lags to include in ARIMA model
    n_diff = 1      # number of differencing steps
    n_ma = 1        # number of moving-average (MA) terms to include

    ### Load in data and normalise
    twitterColumns = [0, 2]
    pollColumns = [1, 3, 4, 5, 6, 7, 8, 9]
    lh, rh, p = m.getPanda(twitterColumns, pollColumns)
    h_agg, p_agg, p_var = m.aggregate(lh, rh, p, splitPolls=False, interpolate=True)

    kalmanData = m.getKalmanData(p_agg, h_agg) # panda that holds both twitter and polling data

    all_data = kalmanData['remain_perc'].iloc[startTrain:]
    remain_data = all_data.values
    dates_train = all_data.index

    # prepare training and test set
    startDate = kalmanData.index[0] + dt.timedelta(days=startTrain + n_lag + n_diff)
    endDate = kalmanData.index[-1]
    pred_dates = pd.date_range(start=startDate, end=endDate)
    end_train = math.floor(len(remain_data) * 0.2)  # size of the held-out test set (last 20% of the series)
    predictions = []
    m.setFonts('timeseries')
    test = remain_data[-end_train:]
    train = remain_data[:-end_train]
    history = train.tolist()
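
    # A minimal sketch (not from the original source) of the walk-forward
    # evaluation that the setup above leads into: refit an
    # ARIMA(n_lag, n_diff, n_ma) model on the growing history and take
    # one-step-ahead forecasts over the test set. The original script may
    # instead reuse a single fitted model ("one-out predictions without
    # retraining"); this loop is an illustration only.
    from statsmodels.tsa.arima.model import ARIMA

    for obs in test:
        model = ARIMA(history, order=(n_lag, n_diff, n_ma))
        model_fit = model.fit()
        yhat = model_fit.forecast()[0]   # one-step-ahead prediction
        predictions.append(yhat)
        history.append(obs)              # extend the history with the observed value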