bayesian_forecast_combination.py

'''
Created on 31 Jul 2015
@author: edwin
'''
import numpy as np
from scipy.special import psi
from scipy.linalg import cholesky, solve_triangular
import logging
class BayesianForecasterCombination():
    """
    Bayesian crowd forecasting, treating forecasters as independent.

    TODO:
    1. Prediction function not returning the right results -- needs to include observed values of x when available.
    2. Kernel for time dimension needs to be changed to a linear one as in the simulated data sampler.
    """

    # Data Dimensions --------------------------------------------------------------------------------------------------
    F = 1 # number of forecasters
    P = 1 # number of forecast periods
    T = 1 # length of each forecast period
    N = 1 # P x T

    # Posterior variances over the model params at each data point
    l_time = 10 # length scale for the forecasters' variation over time. Could be extended to differ for each forecaster.
    l_target = 10 # length scale for the forecasters' variation over y-space.
    l_y = None # length scale for the targets over time (set from l0_y in __init__)

    # Hyperparameters (priors) -----------------------------------------------------------------------------------------
    l0_time = 10 # length scale for the forecasters' variation over time. Could be extended to differ for each forecaster.
    l0_target = 10 # length scale for the forecasters' variation over y-space.

    mu0_a = 1 # Prior mean signal strength for all models.
    s0_a = 1 # output scale

    mu0_c = 0 # Prior mean bias
    s0_c = 1 # output scale of bias

    m_mu0_y = 0 # Hyperprior for the targets
    v_mu0_y = 1
    l0_y = 3 # length scale for the targets
    s0_y = 1 # output scale for the targets

    shape0_Lambda = 1
    scale0_Lambda = 1
    shape0_lambda = 1
    scale0_lambda = 1

    def __init__(self, x, y, times, periods):
        # Observations -------------------------------------------------------------------------------------------------
        # N x 1 target values, including training labels and predictions. Test index values should initially be NaNs.
        y = np.array(y)
        if y.ndim == 1:
            y = y[:, np.newaxis]
        self.y = y

        # N x F observations of individual forecasts. Missing observations are NaNs.
        x = np.array(x)
        if x.ndim == 1:
            x = x[:, np.newaxis]
        self.x = x

        self.N = len(y)
        self.F = x.shape[1]

        self.silveridxs = np.isnan(self.y).flatten()
        self.goldidxs = (np.isnan(self.y) == False).flatten()
        self.y[self.silveridxs, :] = self.m_mu0_y

        # N time values. If none are given, we assume that the N observations are in time order and evenly spaced.
        times = np.array(times, dtype=float)
        if times.ndim == 1:
            times = times[:, np.newaxis]
        self.times = times
        self.T = int(np.max(self.times)) + 1 # cast to int so T can be used for slicing and np.eye below

        if not self.l_time:
            self.l_time = self.l0_time
        if not self.l_target:
            self.l_target = self.l0_target

        periods = np.array(periods)
        if periods.ndim == 2:
            periods = periods.reshape(-1)
        self.periods = periods # N index values indicating which period each forecast relates to.
        self.P = int(np.max(self.periods)) + 1

        # Model Parameters (latent variables) --------------------------------------------------------------------------
        d_time = self.times - self.times.T
        self.K_time = self.sqexpkernel(d_time, self.l_time)
        d_y = self.y - self.y.T # using first and second order Taylor expansions for the uncertain inputs.
        self.K_target = self.sqexpkernel(d_y, self.l_target)
        self.K = self.K_time * self.K_target + 1e-6 * np.eye(self.N)

        # Posterior expectations at each data point
        self.a = np.ones((self.N, self.F)) # Signal strength applied to the ground truth. Varies depending on y and time.
        self.s_a = np.zeros(self.F) + self.s0_a
        self.cov_a = np.zeros((self.F, self.N, self.N))
        for f in range(self.F):
            self.cov_a[f, :, :] = self.s_a[f] * self.K

        self.c = np.zeros((self.N, self.F)) # Bias offset. Expected value = posterior mean. Varies depending on y and time.
        self.s_c = np.zeros(self.F) + self.s0_c
        self.cov_c = np.zeros((self.F, self.N, self.N))
        for f in range(self.F):
            self.cov_c[f, :, :] = self.s_c[f] * self.K

        self.b = np.ones((self.P, self.F)) # Noise precision scale, one value for each run/period.
        self.Lambda_e = {} # F entries, each a T x T noise precision that varies over y and time.
        self.e = np.zeros((self.N, self.F)) # Noise value for individual data points. Prior mean zero.
        self.cov_e = {}
        for f in range(self.F):
            self.cov_e[f] = np.zeros((self.N, self.N))
            shape_Lambda = self.shape0_Lambda
            scale_Lambda = self.scale0_Lambda * self.K_time[0:self.T, :][:, 0:self.T]
            self.Lambda_e[f] = scale_Lambda / shape_Lambda
            for p in range(self.P):
                pidxs = self.periods == p
                self.cov_e[f][np.ix_(pidxs, pidxs)] = self.Lambda_e[f] * self.K_target[pidxs][:, pidxs] / self.b[p, f] \
                    + np.eye(self.T) * 1e-6

        self.lambda_e = np.zeros(self.F) # F degrees of freedom in the Student's t noise distribution. Constant for each forecaster.

        distances = self.times - self.times.T # N x N
        nonmatchingperiods = (self.periods[:, np.newaxis] - self.periods[np.newaxis, :]) != 0
        distances[nonmatchingperiods] = np.inf

        self.l_y = self.l0_y
        self.s_y = self.s0_y
        self.K_y = self.sqexpkernel(distances, self.l_y) + 1e-6 * np.eye(self.N)
        self.cov_y = self.s_y * self.K_y
        self.cov_y[self.goldidxs, :] = 0
        self.cov_y[:, self.goldidxs] = 0

    def sqexpkernel(self, d, l):
        """
        Squared-exponential (RBF) kernel over pairwise distances d with length scale l:
        K(d) = exp(-d^2 / (2 * l^2)).
        """
        K = np.exp(-0.5 * d**2 / l**2)
        return K
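
    # For example, with the default length scale l = 10, two points five time-steps
    # apart retain exp(-0.5 * 25 / 100) ~= 0.88 correlation, so forecaster behaviour
    # is assumed to drift only slowly within a period.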

    def fit(self):
        """
        Run VB to fit the model and predict the latent variables y at the same time.
        """
        tolerance = 1e-3
        change = np.inf
        maxiter = 100
        niter = 0

        while change > tolerance and niter < maxiter:
            y_old = np.copy(self.y)

            d_y = self.y - self.y.T # using first and second order Taylor expansions for the uncertain inputs.
            self.K_target = self.sqexpkernel(d_y, self.l_target)
            self.K = self.K_time * self.K_target + 1e-6 * np.eye(self.N)
            self.L_K = cholesky(self.K, lower=True, check_finite=False)

            self.expec_y() # begin by estimating y from sensible priors
            self.expec_c() # find the added bias
            self.expec_a() # find any scaling bias
            self.expec_e() # find the noise values
            self.expec_Lambda_b() # find the noise parameters common to all runs

            change = np.max(np.abs(self.y - y_old))
            niter += 1
            logging.debug("Completed iteration " + str(niter) + ", change = " + str(change))

    def predict(self, testtimes, testperiods):
        """
        Use the posterior GP over y to interpolate and predict the specified times and periods.
        """
        y, cov = self.posterior_y(testtimes, testperiods)
        v_y = np.diag(cov)
        return y, v_y

    def expec_y(self):
        mu, cov = self.posterior_y()
        self.y[self.silveridxs, :] = mu
        self.cov_y[np.ix_(self.silveridxs, self.silveridxs)] = cov

        # update hyper-parameters as necessary
        # shape0_s = 1
        # rate0_s = self.s0_y * shape0_s
        # shape_s = shape0_s + 0.5 * self.N
        #
        # rate_s = rate0_s
        # for p in range(self.P):
        #     pidxs = self.periods == p
        #     L_Ky = cholesky(self.K_y[pidxs][:, pidxs], lower=True, check_finite=False)
        #     devs = self.y[pidxs, :] - self.mu0_y
        #     var_y = np.diag(np.diag(self.cov_y[pidxs][:, pidxs]))
        #     B = solve_triangular(L_Ky, devs.dot(devs.T) + var_y, lower=True, overwrite_b=True)
        #     A = solve_triangular(L_Ky.T, B, overwrite_b=True)
        #     rate_s += 0.5 * np.trace(A)
        # self.s_y = rate_s / shape_s # inverse of precision

    def posterior_y(self, predict_times=None, predict_periods=None):
        K_train = self.s_y * self.K_y
        K_gold = K_train[self.goldidxs, :][:, self.goldidxs]

        if not np.any(predict_times) or not np.any(predict_periods):
            K_predict = K_train
            silveridxs = self.silveridxs
            testidxs = self.silveridxs
        else:
            # TODO (see class docstring): prediction for new times does not yet return the right results.
            predict_times = np.concatenate((self.times[self.silveridxs], predict_times), axis=0)
            distances = predict_times - predict_times.T # Ntest x Ntest
            nonmatchingperiods = (predict_periods - predict_periods.T) != 0
            distances[nonmatchingperiods] = np.inf
            K_predict = self.sqexpkernel(distances, self.l_y) + 1e-6 * np.eye(self.N)
            silveridxs = np.arange(1, np.sum(self.silveridxs))
            testidxs = np.arange(np.sum(self.silveridxs), len(predict_times))

        # update the prior mean
        v_obs_y = np.var(self.y)
        self.mu0_y = (self.m_mu0_y * v_obs_y + np.mean(self.y) * self.v_mu0_y) / (self.v_mu0_y + v_obs_y)
        logging.debug("mu0_y = %.3f" % self.mu0_y)

        # learn from the training labels
        innovation = self.y[self.goldidxs, :] - self.mu0_y
        L_y = cholesky(K_gold, lower=True, check_finite=False)
        B = solve_triangular(L_y, innovation, lower=True, overwrite_b=True, check_finite=False)
        A = solve_triangular(L_y.T, B, overwrite_b=True, check_finite=False)
        V = solve_triangular(L_y, K_predict[:, self.goldidxs].T, lower=True, check_finite=False)

        mu = self.mu0_y + K_predict[testidxs][:, self.goldidxs].dot(A)
        cov = K_predict - V.T.dot(V)

        # now update the test indexes from the x observations, conditioning on one forecaster at a time
        for f in range(self.F):
            mu_fminus1 = mu
            cov_f = cov[silveridxs][:, silveridxs] # + 1e-6 * np.eye(len(mu)) # jitter

            # observation minus prior over forecasters' predictions
            innovation = self.x[self.silveridxs, f:f+1] - (mu_fminus1 * self.a[self.silveridxs, f:f+1]
                                                           + self.c[self.silveridxs, f:f+1] + self.e[self.silveridxs, f:f+1])
            logging.debug("Minimum innovation: %f" % np.min(innovation))

            a_diag = np.diag(self.a[self.silveridxs, f])
            var_a = np.diag(np.diag(self.cov_a[f, self.silveridxs][:, self.silveridxs]))
            var_a = np.diag(mu_fminus1.reshape(-1)).dot(var_a).dot(np.diag(mu_fminus1.reshape(-1)).T)
            var_e = np.diag(np.diag(self.cov_e[f][self.silveridxs][:, self.silveridxs]))
            var_c = np.diag(np.diag(self.cov_c[f, self.silveridxs][:, self.silveridxs]))

            S_y = cov_f + var_a + var_e + var_c
            L_y = cholesky(S_y, lower=True, check_finite=False)
            B = solve_triangular(L_y, innovation, lower=True, overwrite_b=True, check_finite=False)
            A = solve_triangular(L_y.T, B, overwrite_b=True, check_finite=False)
            V = solve_triangular(L_y, a_diag.dot(cov[silveridxs, :]), lower=True, overwrite_b=True, check_finite=False)

            mu = mu_fminus1 + cov[silveridxs][:, silveridxs].dot(a_diag).dot(A)
            cov = cov - V.T.dot(V)

        return mu, cov[testidxs][:, testidxs]

    def expec_a(self):
        for f in range(self.F):
            # observation minus prior over forecasters' predictions
            innovation = self.x[:, f:f+1] - (self.y * self.mu0_a + self.c[:, f:f+1] + self.e[:, f:f+1])
            K = self.s_a[f] * self.K
            y_diag = np.diag(self.y.reshape(-1))

            var_y = np.diag(np.diag(self.cov_y))
            var_c = np.diag(np.diag(self.cov_c[f]))
            var_e = np.diag(np.diag(self.cov_e[f]))

            S_a = y_diag.dot(K).dot(y_diag.T) + self.mu0_a**2 * var_y + var_c + var_e
            La = cholesky(S_a, lower=True, overwrite_a=True, check_finite=False)
            B = solve_triangular(La, innovation, lower=True, overwrite_b=True, check_finite=False)
            A = solve_triangular(La.T, B, overwrite_b=True, check_finite=False)
            V = solve_triangular(La, y_diag.dot(K), check_finite=False, lower=True)

            self.a[:, f] = self.mu0_a + K.dot(y_diag).dot(A).reshape(-1)
            self.cov_a[f] = K - V.T.dot(V)

            rate0_s = self.s0_a
            shape_s = 1 + 0.5 * self.N
            af = self.a[:, f][:, np.newaxis]
            var_a = np.diag(np.diag(self.cov_a[f]))
            B = solve_triangular(self.L_K, af.dot(af.T).T + y_diag.dot(var_a).dot(y_diag.T).T, lower=True, overwrite_b=True)
            A = solve_triangular(self.L_K.T, B, overwrite_b=True)
            rate_s = rate0_s + 0.5 * np.trace(A)
            # self.s_a[f] = rate_s / shape_s # inverse of the precision

    def expec_c(self):
        for f in range(self.F):
            # observation minus prior over forecasters' predictions
            innovation = self.x[:, f:f+1] - (self.y * self.a[:, f:f+1] + self.mu0_c + self.e[:, f:f+1])
            K = self.s_c[f] * self.K
            y_diag = np.diag(self.y[:, 0])
            a_diag = np.diag(self.a[:, f])

            var_a = np.diag(np.diag(self.cov_a[f]))
            var_y = np.diag(np.diag(self.cov_y))
            var_e = np.diag(np.diag(self.cov_e[f]))

            S_c = K + y_diag.dot(var_a).dot(y_diag.T) + a_diag.dot(var_y).dot(a_diag.T) + var_e
            Lc = cholesky(S_c, lower=True, overwrite_a=True, check_finite=False)
            B = solve_triangular(Lc, innovation, lower=True, overwrite_b=True, check_finite=False)
            A = solve_triangular(Lc.T, B, overwrite_b=True, check_finite=False)
            V = solve_triangular(Lc, K, check_finite=False, lower=True)

            self.c[:, f] = self.mu0_c + K.dot(A).reshape(-1)
            # Open question from the original author: why do some diagonals at the training
            # indexes end up < 0? Related to lower s_c values? Try fixing cov_y first; also
            # check y_diag.cov_a.y_diag.
            self.cov_c[f] = K - V.T.dot(V)

            rate0_s = self.s0_c
            shape_s = 1 + 0.5 * self.N
            cf = self.c[:, f][:, np.newaxis] - self.mu0_c
            var_c = np.diag(np.diag(self.cov_c[f].T))
            B = solve_triangular(self.L_K, cf.T + var_c, lower=True, overwrite_b=True)
            A = solve_triangular(self.L_K.T, B, overwrite_b=True)
            rate_s = rate0_s + 0.5 * np.trace(A)
            # self.s_c[f] = shape_s / rate_s

    def expec_e(self):
        """
        Noise at each observation.
        """
        innovation = self.x - (self.y * self.a + self.c) # mu0_e == 0

        for f in range(self.F):
            inn_f = innovation[:, f][:, np.newaxis]

            # UPDATE e -----------------------------
            self.cov_e[f] = np.zeros((self.N, self.N))
            for p in range(self.P):
                pidxs = self.periods == p
                inn_fp = inn_f[pidxs]
                K = self.K_target[pidxs][:, pidxs] * self.Lambda_e[f] / self.b[p, f] + 1e-6 * np.eye(self.T)

                a_diag = np.diag(self.a[pidxs, f])
                y_diag = np.diag(self.y[pidxs, 0])
                var_c = np.diag(np.diag(self.cov_c[f][pidxs][:, pidxs]))
                var_a = np.diag(np.diag(self.cov_a[f][pidxs][:, pidxs]))
                var_y = np.diag(np.diag(self.cov_y[pidxs][:, pidxs]))

                S_e = K + var_c + y_diag.dot(var_a).dot(y_diag.T) + a_diag.dot(var_y).dot(a_diag.T)
                Le = cholesky(S_e, lower=True, overwrite_a=True, check_finite=False)
                B = solve_triangular(Le, inn_fp, lower=True, overwrite_b=True, check_finite=False)
                A = solve_triangular(Le.T, B, overwrite_b=True, check_finite=False)
                V = solve_triangular(Le, K, check_finite=False, lower=True)

                self.e[pidxs, f] = K.dot(A).reshape(-1)
                self.cov_e[f][np.ix_(pidxs, pidxs)] = K - V.T.dot(V)

    def expec_Lambda_b(self):
        """
        Parameters of the noise in general -- capture the increase in noise over time, and its relationship with y.
        """
        for f in range(self.F):
            # UPDATE Lambda ------------------------
            shape_Lambda = self.T + 1 + self.shape0_Lambda + self.P
            scale_Lambda = self.scale0_Lambda * self.K_time[0:self.T, :][:, 0:self.T]
            for p in range(self.P):
                pidxs = self.periods == p
                inn_f = self.e[pidxs, f:f+1] # deviations from mean of 0
                inn_fp = inn_f.dot(inn_f.T) + self.cov_e[f][pidxs][:, pidxs]
                scale_Lambda += inn_fp / self.K_target[pidxs][:, pidxs] * self.b[p, f]
            self.Lambda_e[f] = scale_Lambda / (shape_Lambda - self.T - 1) # T x T

            # UPDATE b --------------------------- Check against bird paper.
            shape_b = self.lambda_e[f] + self.T / 2.0
            expec_log_b = np.zeros(self.P)
            for p in range(self.P):
                pidxs = self.periods == p
                inn_f = self.e[pidxs, f]
                var_e = np.diag(np.diag(self.cov_e[f][pidxs][:, pidxs]))
                inn_fp = inn_f.dot(inn_f) + var_e

                L_Lambda = cholesky(self.Lambda_e[f] * self.K_target[pidxs][:, pidxs] + 1e-6 * np.eye(self.T),
                                    lower=True, check_finite=False)
                B = solve_triangular(L_Lambda, inn_fp, overwrite_b=True, check_finite=False, lower=True)
                A = solve_triangular(L_Lambda.T, B, overwrite_b=True, check_finite=False)

                rate_b = self.lambda_e[f] + np.trace(A) / 2.0
                self.b[p, f] = shape_b / rate_b
                expec_log_b[p] = psi(shape_b) - np.log(rate_b)

            # UPDATE lambda -----------------------
            shape_lambda = self.shape0_lambda + 0.5 * self.N
            scale_lambda = self.scale0_lambda - 0.5 * np.sum(1 + expec_log_b - self.b[:, f])
            self.lambda_e[f] = shape_lambda / scale_lambda
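
if __name__ == '__main__':
    # Minimal smoke test on synthetic data. This block is a sketch added for
    # illustration, not part of the original module: it assumes the data layout
    # documented in __init__ (x is N x F, y is N x 1 with NaNs marking test points,
    # times gives the within-period step and periods the period index of each row).
    logging.basicConfig(level=logging.DEBUG)
    np.random.seed(42)

    P, T, F = 2, 10, 3 # periods, time steps per period, forecasters
    N = P * T
    times = np.tile(np.arange(T), P)
    periods = np.repeat(np.arange(P), T)

    # Latent targets, then forecasts generated as x = a * y + c + noise,
    # matching the model structure assumed by the combiner.
    y_true = np.sin(times / 3.0) + 0.1 * np.random.randn(N)
    a_true = 1.0 + 0.2 * np.random.randn(F)
    c_true = 0.5 * np.random.randn(F)
    x = y_true[:, np.newaxis] * a_true[np.newaxis, :] + c_true[np.newaxis, :] \
        + 0.2 * np.random.randn(N, F)

    # Observe the targets for the first period; hold out the second as test data.
    y = y_true.copy()[:, np.newaxis]
    y[periods == 1, :] = np.nan

    combiner = BayesianForecasterCombination(x, y, times, periods)
    combiner.fit()
    logging.info("Inferred targets for the held-out period: %s" % str(combiner.y[periods == 1, 0]))
    logging.info("Ground truth for the held-out period:     %s" % str(y_true[periods == 1]))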