"""
This script contains code to run experiments for semi-supervised learning (SSL) with cause and effect features.
"""
from scipy.stats import multivariate_normal
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from scikitTSVM import SKTSVM


def sigmoid(x):
    """
    computes the logistic sigmoid function evaluated at input x
    :param x: (n x 1) np.array of reals
    :return: (n x 1) np.array of probabilities
    """
    return 1 / (1 + np.exp(-x))


def fy_linear(x, a, b):
    """
    computes the affine function x*a + b, e.g. the class logits (the input to the sigmoid) from causal features
    :param x: (n x d) np.array of reals - causal features X_C
    :param a: (d x p) weight matrix
    :param b: (1 x p) bias term
    :return: (n x p) np.array of reals
    """
    return np.matmul(x, a) + b


def predict_class_probs(x_c, x_e, a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1):
    """
    Assigns examples of the form (x_c, x_e) to the more likely class given the (estimated) parameters.
    :param x_c: (n x d_c) array of causal features
    :param x_e: (n x d_e) array of effect features
    :param a_y: (d_c x 1) np.array of weights for logistic regression of Y on X_C
    :param b_y: (1 x 1) bias term for logistic regression of Y on X_C
    :param a_e0: (d_c x d_e) np.array of weights for map X_C, Y=0 -> X_E
    :param a_e1: (d_c x d_e) np.array of weights for map X_C, Y=1 -> X_E
    :param b_0: (1 x d_e) np.array bias for class Y=0
    :param b_1: (1 x d_e) np.array bias for class Y=1
    :param cov_e0: (d_e x d_e) np.array covariance for noise N_E | Y=0
    :param cov_e1: (d_e x d_e) np.array covariance for noise N_E | Y=1
    :return: (n x 1) array of class probabilities representing P(Y=1 | X_C, X_E)
    """
    py1 = sigmoid(fy_linear(x_c, a_y, b_y))  # P(Y=1 | X_C)
    mean0 = np.matmul(x_c, a_e0) + b_0
    pe0 = np.zeros(py1.shape)
    for i in range(py1.shape[0]):
        pe0[i] = multivariate_normal.pdf(x_e[i], mean0[i], cov_e0)  # P(X_E | X_C, Y=0)
    mean1 = np.matmul(x_c, a_e1) + b_1
    pe1 = np.zeros(py1.shape)
    for i in range(py1.shape[0]):
        pe1[i] = multivariate_normal.pdf(x_e[i], mean1[i], cov_e1)  # P(X_E | X_C, Y=1)
    prelim = np.multiply(py1, pe1)
    p1 = np.divide(prelim, prelim + np.multiply(1 - py1, pe0))
    return p1
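

# Hedged usage sketch for predict_class_probs: the shapes and parameter
# values below are illustrative assumptions, not settings from the experiments.
def _demo_predict_class_probs():
    rng = np.random.RandomState(0)
    n, d_c, d_e = 5, 2, 3
    x_c, x_e = rng.randn(n, d_c), rng.randn(n, d_e)
    a_y, b_y = rng.randn(d_c, 1), np.zeros((1, 1))  # arbitrary class-boundary parameters
    a_e0, a_e1 = rng.randn(d_c, d_e), rng.randn(d_c, d_e)
    b_0, b_1 = np.zeros((1, d_e)), np.ones((1, d_e))
    cov_e0, cov_e1 = np.eye(d_e), np.eye(d_e)
    p1 = predict_class_probs(x_c, x_e, a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1)
    print(p1.shape)  # (5, 1): one posterior P(Y=1 | X_C, X_E) per example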


def get_log_reg_params(x, y, weight=None):
    """Fits an (optionally weighted) logistic regression of y on x; returns the (d x 1) weights and the bias."""
    lr_c_only = LogisticRegression(random_state=0, solver='liblinear')
    lr_c_only.fit(x, y.ravel(), sample_weight=weight)
    return np.transpose(lr_c_only.coef_), lr_c_only.intercept_


def get_weighted_lin_reg_params(x_c, x_e, w, lam=1e-2):
    """Fits a weighted ridge regression of x_e on x_c; returns the weights, bias, and a diagonal noise
    covariance regularised by lam on the diagonal."""
    ridge = Ridge().fit(x_c, x_e, sample_weight=w)
    a = np.transpose(ridge.coef_)
    b = ridge.intercept_.reshape((1, x_e.shape[1]))
    sq_res = np.square(x_e - ridge.predict(x_c))
    sum_weighted_sq_res = np.sum(np.multiply(w.reshape((w.shape[0], 1)), sq_res), axis=0)
    cov = np.diag(np.divide(sum_weighted_sq_res, np.sum(w)))
    cov += lam * np.eye(x_e.shape[1])  # regularise the diagonal only
    if np.linalg.matrix_rank(cov) < x_e.shape[1]:
        print('MATRIX SINGULAR IN WEIGHTED REG')
    return a, b, cov


def get_lin_reg_params(x_c, x_e, lam=1e-2):
    """Fits a ridge regression of x_e on x_c; returns the weights, bias, and a diagonal noise covariance."""
    ridge = Ridge().fit(x_c, x_e)
    a = np.transpose(ridge.coef_)
    b = ridge.intercept_.reshape((1, x_e.shape[1]))
    sum_sq_res = np.sum(np.square(x_e - ridge.predict(x_c)), axis=0)
    cov = np.diag(np.divide(sum_sq_res, x_c.shape[0]))
    cov += lam * np.eye(x_e.shape[1])  # regularise the diagonal only
    if np.linalg.matrix_rank(cov) < x_e.shape[1]:
        print('MATRIX SINGULAR')
    return a, b, cov
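

# Hedged sketch of the regression-parameter helper on synthetic data; the
# ground-truth weights and noise level are arbitrary assumptions.
def _demo_get_lin_reg_params():
    rng = np.random.RandomState(1)
    x_c = rng.randn(100, 2)
    true_a = np.array([[1.0, -2.0], [0.5, 3.0]])  # assumed ground truth for the demo
    x_e = x_c.dot(true_a) + 0.1 * rng.randn(100, 2)
    a, b, cov = get_lin_reg_params(x_c, x_e)
    print(a.shape, b.shape, cov.shape)  # (2, 2) (1, 2) (2, 2)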


def hard_label_EM(x_c, y, x_e, z_c, z_e):
    """Fits the semi-generative model by EM, imputing hard (0/1) labels for the unlabelled sample (z_c, z_e)
    in the E-step; returns the fitted parameters in the order expected by predict_class_probs."""
    c = np.concatenate((x_c, z_c))
    e = np.concatenate((x_e, z_e))
    # initialise from labelled data
    a_y, b_y = get_log_reg_params(x_c, y)
    idx_0 = np.where(y == 0)[0]
    idx_1 = np.where(y == 1)[0]
    a_e0, b_0, cov_e0 = get_lin_reg_params(x_c[idx_0], x_e[idx_0])
    a_e1, b_1, cov_e1 = get_lin_reg_params(x_c[idx_1], x_e[idx_1])
    converged = False
    u_old = np.zeros((z_c.shape[0], 1))
    while not converged:
        # E-step: compute labels
        p1 = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1)
        u = p1 > 0.5
        # check for convergence: the imputed labels no longer change
        if (u_old == u).all():
            break
        # M-step: refit on labelled plus imputed data
        l = np.concatenate((y, u))
        a_y, b_y = get_log_reg_params(c, l)
        a_e0, b_0, cov_e0 = get_weighted_lin_reg_params(c, e, 1 - l.ravel())
        a_e1, b_1, cov_e1 = get_weighted_lin_reg_params(c, e, l.ravel())
        u_old = u
    return a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1


def soft_label_EM(x_c, y, x_e, z_c, z_e, converged=False, tol=1e-2):
    """Fits the semi-generative model by EM with soft (probabilistic) labels for the unlabelled sample.
    Passing converged=True skips the EM loop and returns the purely supervised fit on (x_c, y, x_e)."""
    c = np.concatenate((x_c, z_c))
    e = np.concatenate((x_e, z_e))
    # initialise from labelled data
    a_y, b_y = get_log_reg_params(x_c, y)
    idx_0 = np.where(y == 0)[0]
    idx_1 = np.where(y == 1)[0]
    a_e0, b_0, cov_e0 = get_lin_reg_params(x_c[idx_0], x_e[idx_0])
    a_e1, b_1, cov_e1 = get_lin_reg_params(x_c[idx_1], x_e[idx_1])
    u_old = np.zeros((z_c.shape[0], 1))
    while not converged:
        # E-step: compute soft labels
        p1 = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1)
        p = np.concatenate((y, p1))  # probabilities for class 1
        u = p1 > 0.5
        l = np.concatenate((y, u))
        w = (l * p + (1 - l) * (1 - p)).ravel()
        # M-step: refit with the soft labels as sample weights
        a_y, b_y = get_log_reg_params(c, l, w)
        a_e0, b_0, cov_e0 = get_weighted_lin_reg_params(c, e, 1 - p.ravel())
        a_e1, b_1, cov_e1 = get_weighted_lin_reg_params(c, e, p.ravel())
        # check for convergence
        if np.max(np.abs(u_old - p1)) < tol:
            converged = True
        else:
            u_old = p1
    return a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1
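

# Hedged end-to-end sketch of the EM routines on one synthetic draw from the
# assumed cause-effect model Y <- X_C, X_E <- (X_C, Y); the sample sizes and
# noise scale are illustrative assumptions, not the experimental settings.
def _demo_soft_label_EM():
    rng = np.random.RandomState(2)
    n_lab, n_unl, d_c, d_e = 50, 200, 2, 2

    def draw(n):
        c = rng.randn(n, d_c)
        y = (rng.rand(n, 1) < sigmoid(c.dot(np.ones((d_c, 1))))).astype(float)
        e = c + y + 0.3 * rng.randn(n, d_e)  # effect depends on both cause and label
        return c, y, e

    x_c, y, x_e = draw(n_lab)
    z_c, z_y, z_e = draw(n_unl)
    params = soft_label_EM(x_c, y, x_e, z_c, z_e)
    p1 = predict_class_probs(z_c, z_e, *params)
    print(np.mean((p1 > 0.5) == z_y))  # transductive accuracy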


def conditional_prop(x_c, y, x_e, z_c, z_y, z_e):
    """Conditional label propagation: repeatedly assigns the unlabelled point whose effect features are best
    explained by one of the two class-conditional regressions X_C -> X_E, then refits that regression.
    Returns the accuracy of the propagated labels against z_y."""
    y_unl = -1 * np.ones((z_c.shape[0], 1))
    idx_0 = np.where(y == 0)[0]
    idx_1 = np.where(y == 1)[0]
    c_0 = x_c[idx_0]
    c_1 = x_c[idx_1]
    e_0 = x_e[idx_0]
    e_1 = x_e[idx_1]
    # initialise from labelled data
    R0 = Ridge().fit(c_0, e_0)
    R1 = Ridge().fit(c_1, e_1)
    while np.any(y_unl == -1):
        # check what is still unlabelled
        idx_remaining = np.where(y_unl == -1)[0]
        # predict for unlabelled points
        err_0 = np.sum(np.square(z_e[idx_remaining] - R0.predict(z_c[idx_remaining])), axis=1)
        err_1 = np.sum(np.square(z_e[idx_remaining] - R1.predict(z_c[idx_remaining])), axis=1)
        # assign the best-fitting point to the corresponding label
        if err_0.min() < err_1.min():
            # update R0 (np.append returns a copy, so the new point must be assigned back)
            idx = idx_remaining[np.argmin(err_0)]
            y_unl[idx] = 0
            c_0 = np.vstack((c_0, z_c[idx]))
            e_0 = np.vstack((e_0, z_e[idx]))
            R0 = Ridge().fit(c_0, e_0)
        else:
            # update R1
            idx = idx_remaining[np.argmin(err_1)]
            y_unl[idx] = 1
            c_1 = np.vstack((c_1, z_c[idx]))
            e_1 = np.vstack((e_1, z_e[idx]))
            R1 = Ridge().fit(c_1, e_1)
    return np.mean(y_unl == z_y)
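

# Hedged sketch of conditional label propagation, reusing the synthetic
# generator from _demo_soft_label_EM (itself an illustrative assumption).
def _demo_conditional_prop():
    rng = np.random.RandomState(3)

    def draw(n):
        c = rng.randn(n, 2)
        y = (rng.rand(n, 1) < sigmoid(c.dot(np.ones((2, 1))))).astype(float)
        e = c + y + 0.3 * rng.randn(n, 2)
        return c, y, e

    x_c, y, x_e = draw(50)
    z_c, z_y, z_e = draw(100)
    print(conditional_prop(x_c, y, x_e, z_c, z_y, z_e))  # transductive accuracy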


def disc_eff_semigen(x_c, y, x_e, z_c, z_e, weight=None):
    """
    This function fits a semi-generative model for categorical effect features, using a logistic regression not
    only for P(Y|X_C), but also for P(X_E|Y,X_C). The model is fitted on the labelled input data (x_c, y, x_e)
    and then applied to the unlabelled sample (z_c, z_e) to predict label probabilities.
    :param x_c: causal features of labelled data
    :param y: labels
    :param x_e: effect features of labelled data
    :param z_c: causal features of unlabelled data
    :param z_e: effect features of unlabelled data
    :param weight: weights associated with the labelled samples
    :return: (n x 1) np.array of class probabilities P(Y=1 | Z_C, Z_E)
    """
    # LR fit for P(Y|X_C)
    LRC = LogisticRegression(random_state=0, solver='lbfgs')
    LRC.fit(x_c, y.ravel(), sample_weight=weight)
    # LR fits for P(X_E|Y,X_C)
    idx_0 = np.where(y == 0)[0]
    idx_1 = np.where(y == 1)[0]
    LR0 = LogisticRegression(random_state=0, multi_class='ovr', solver='lbfgs')
    LR1 = LogisticRegression(random_state=0, multi_class='ovr', solver='lbfgs')
    if weight is None:
        LR0.fit(x_c[idx_0], x_e[idx_0].ravel())
        LR1.fit(x_c[idx_1], x_e[idx_1].ravel())
    else:
        LR0.fit(x_c[idx_0], x_e[idx_0].ravel(), sample_weight=weight[idx_0])
        LR1.fit(x_c[idx_1], x_e[idx_1].ravel(), sample_weight=weight[idx_1])
    p_y = LRC.predict_proba(z_c)  # P(Y|Z_C)
    p_e0 = LR0.predict_proba(z_c)  # P(X_E|Y=0,Z_C)
    p_e1 = LR1.predict_proba(z_c)  # P(X_E|Y=1,Z_C)
    p_ze1 = np.zeros((z_e.shape[0], 1))
    p_ze0 = np.zeros((z_e.shape[0], 1))
    for i in range(z_e.shape[0]):
        p_ze1[i] = p_e1[i, z_e.astype(int)[i]]  # P(Z_E|Y=1,Z_C)
        p_ze0[i] = p_e0[i, z_e.astype(int)[i]]  # P(Z_E|Y=0,Z_C)
    # P(Y_u=1|Z_C,Z_E) by Bayes' rule
    z_y = p_ze1 * p_y[:, 1].reshape((-1, 1)) / (
        p_ze1 * p_y[:, 1].reshape((-1, 1)) + p_ze0 * p_y[:, 0].reshape((-1, 1)))
    return z_y
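

# Hedged sketch of the categorical-effect model with a binary effect feature;
# the label and effect mechanisms below are illustrative assumptions.
def _demo_disc_eff_semigen():
    rng = np.random.RandomState(4)

    def draw(n):
        c = rng.randn(n, 2)
        y = (rng.rand(n, 1) < sigmoid(c[:, :1])).astype(float)
        e = (rng.rand(n, 1) < 0.2 + 0.6 * y).astype(float)  # P(X_E=1 | Y=y) = 0.2 + 0.6*y
        return c, y, e

    x_c, y, x_e = draw(100)
    z_c, z_y, z_e = draw(50)
    z_y_soft = disc_eff_semigen(x_c, y, x_e, z_c, z_e)
    print(np.mean((z_y_soft > 0.5) == z_y))  # transductive accuracy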


def disc_eff_hard_EM(x_c, y, x_e, z_c, z_y, z_e):
    """
    This function implements an EM-style algorithm with hard labels to fit a generative model with categorical,
    rather than continuous, effect features.
    :param x_c: causal features of labelled data
    :param y: labels
    :param x_e: effect features of labelled data
    :param z_c: causal features of unlabelled data
    :param z_y: labels to be predicted
    :param z_e: effect features of unlabelled data
    :return: accuracy of the inferred hard labels against z_y
    """
    c = np.concatenate((x_c, z_c))
    e = np.concatenate((x_e, z_e))
    # initialise from the labelled sample
    z_y_soft = disc_eff_semigen(x_c, y, x_e, z_c, z_e)
    u_old = z_y_soft > 0.5
    converged = False
    while not converged:
        y_joint = np.concatenate((y, u_old))
        z_y_soft = disc_eff_semigen(c, y_joint, e, z_c, z_e)
        u_new = z_y_soft > 0.5
        if (u_new == u_old).all():
            converged = True
        u_old = u_new
    return np.mean(z_y == u_old)


def disc_eff_soft_EM(x_c, y, x_e, z_c, z_y, z_e):
    """
    This function implements an EM-style algorithm with soft labels to fit a generative model with categorical,
    rather than continuous, effect features.
    :param x_c: causal features of labelled data
    :param y: labels
    :param x_e: effect features of labelled data
    :param z_c: causal features of unlabelled data
    :param z_y: labels to be predicted
    :param z_e: effect features of unlabelled data
    :return: accuracy of the inferred hard labels against z_y
    """
    c = np.concatenate((x_c, z_c))
    e = np.concatenate((x_e, z_e))
    # initialise from the labelled sample
    z_y_soft = disc_eff_semigen(x_c, y, x_e, z_c, z_e)
    u_old = z_y_soft > 0.5
    converged = False
    while not converged:
        # compute weights
        p = np.concatenate((y, z_y_soft))  # probabilities for class 1
        y_joint = np.concatenate((y, u_old))  # current labelling
        w = (y_joint * p + (1 - y_joint) * (1 - p)).ravel()  # weights
        # perform weighted model fitting
        z_y_soft_new = disc_eff_semigen(c, y_joint, e, z_c, z_e, w)
        u_old = z_y_soft_new > 0.5
        if np.max(np.abs(z_y_soft - z_y_soft_new)) < 1e-2:
            converged = True
        z_y_soft = z_y_soft_new
    return np.mean(z_y == u_old)


def disc_cond_prop(x_c, y, x_e, z_c, z_y, z_e):
    """Conditional label propagation for categorical effect features: repeatedly assigns the unlabelled point
    whose effect is most probable under one of the two class-conditional models P(X_E|Y,X_C), then refits that
    model. Returns the accuracy of the propagated labels against z_y."""
    y_unl = -1 * np.ones((z_c.shape[0], 1))
    idx_0 = np.where(y == 0)[0]
    idx_1 = np.where(y == 1)[0]
    c_0 = x_c[idx_0]
    c_1 = x_c[idx_1]
    e_0 = x_e[idx_0]
    e_1 = x_e[idx_1]
    # initialise from labelled data
    LR0 = LogisticRegression(random_state=0, multi_class='ovr', solver='lbfgs').fit(c_0, e_0.ravel())
    LR1 = LogisticRegression(random_state=0, multi_class='ovr', solver='lbfgs').fit(c_1, e_1.ravel())
    while np.any(y_unl == -1):
        # check what is still unlabelled
        idx_remaining = np.where(y_unl == -1)[0]
        # predict for unlabelled points
        p_e0 = LR0.predict_proba(z_c[idx_remaining])  # P(X_E|Y=0,Z_C)
        p_e1 = LR1.predict_proba(z_c[idx_remaining])  # P(X_E|Y=1,Z_C)
        p_ze1 = np.zeros((z_e[idx_remaining].shape[0], 1))
        p_ze0 = np.zeros((z_e[idx_remaining].shape[0], 1))
        for i in range(z_e[idx_remaining].shape[0]):
            p_ze1[i] = p_e1[i, z_e[idx_remaining].astype(int)[i]]  # P(Z_E|Y=1,Z_C)
            p_ze0[i] = p_e0[i, z_e[idx_remaining].astype(int)[i]]  # P(Z_E|Y=0,Z_C)
        err_0 = 1 - p_ze0
        err_1 = 1 - p_ze1
        # assign the best-fitting point to the corresponding label
        if err_0.min() < err_1.min():
            # update LR0 (np.append returns a copy, so the new point must be assigned back)
            idx = idx_remaining[np.argmin(err_0)]
            y_unl[idx] = 0
            c_0 = np.vstack((c_0, z_c[idx]))
            e_0 = np.vstack((e_0, z_e[idx]))
            LR0 = LogisticRegression(random_state=0, multi_class='ovr', solver='lbfgs').fit(c_0, e_0.ravel())
        else:
            # update LR1
            idx = idx_remaining[np.argmin(err_1)]
            y_unl[idx] = 1
            c_1 = np.vstack((c_1, z_c[idx]))
            e_1 = np.vstack((e_1, z_e[idx]))
            LR1 = LogisticRegression(random_state=0, multi_class='ovr', solver='lbfgs').fit(c_1, e_1.ravel())
    return np.mean(y_unl == z_y)


def run_methods(x_c, y, x_e, z_c, z_y, z_e):
    """Runs all baselines and the semi-generative methods on one labelled sample (x_c, y, x_e) and one
    unlabelled sample (z_c, z_e) with held-out labels z_y; returns the transductive accuracy of each method."""
    x = np.concatenate((x_c, x_e), axis=1)
    z = np.concatenate((z_c, z_e), axis=1)

    # Baseline: Linear Logistic Regression
    lin_lr = LogisticRegression(random_state=0, solver='liblinear').fit(x, y.ravel())
    acc_lin_lr = lin_lr.score(z, z_y)
    # hard_label_lin_lr = lin_lr.predict(z)
    # soft_label_lin_lr = lin_lr.predict_proba(z)[:, 1]

    # TRANSDUCTIVE APPROACHES
    # merge labelled and unlabelled data (with label -1) for transductive methods
    x_merged = np.concatenate((x, z))
    y_merged = np.concatenate((y, -1 * np.ones((z.shape[0], 1)))).ravel().astype(int)

    # Baseline: Linear TSVM: https://github.com/tmadl/semisup-learn/tree/master/methods
    lin_tsvm = SKTSVM(kernel='linear')
    lin_tsvm.fit(x_merged, y_merged)
    acc_lin_tsvm = lin_tsvm.score(z, z_y)
    # hard_label_lin_tsvm = lin_tsvm.predict(z)
    # soft_label_lin_tsvm = lin_tsvm.predict_proba(z)[:, 1]

    # Baseline: Non-Linear TSVM: https://github.com/tmadl/semisup-learn/tree/master/methods
    rbf_tsvm = SKTSVM(kernel='RBF')
    rbf_tsvm.fit(x_merged, y_merged)
    acc_rbf_tsvm = rbf_tsvm.score(z, z_y)
    # hard_label_rbf_tsvm = rbf_tsvm.predict(z)
    # soft_label_rbf_tsvm = rbf_tsvm.predict_proba(z)[:, 1]

    # Baseline: Label Propagation with RBF weights
    try:
        rbf_label_prop = LabelPropagation(kernel='rbf')
        rbf_label_prop.fit(x_merged, y_merged)
        acc_rbf_label_prop = rbf_label_prop.score(z, z_y)
        # hard_label_rbf_label_prop = rbf_label_prop.predict(z)
        # soft_label_rbf_label_prop = rbf_label_prop.predict_proba(z)[:, 1]
    except Exception:
        acc_rbf_label_prop = np.nan  # mark failure instead of crashing
        print('rbf label prop did not work')

    # Baseline: Label Spreading with RBF weights
    try:
        rbf_label_spread = LabelSpreading(kernel='rbf')
        rbf_label_spread.fit(x_merged, y_merged)
        acc_rbf_label_spread = rbf_label_spread.score(z, z_y)
        # hard_label_rbf_label_spread = rbf_label_spread.predict(z)
        # soft_label_rbf_label_spread = rbf_label_spread.predict_proba(z)[:, 1]
    except Exception:
        acc_rbf_label_spread = np.nan
        print('rbf label spread did not work')

    # THE K-NN VERSIONS ARE UNSTABLE UNLESS USING LARGE K
    # Baseline: Label Propagation with k-NN weights
    try:
        knn_label_prop = LabelPropagation(kernel='knn', n_neighbors=11)
        knn_label_prop.fit(x_merged, y_merged)
        acc_knn_label_prop = knn_label_prop.score(z, z_y)
        # hard_label_knn_label_prop = knn_label_prop.predict(z)
        # soft_label_knn_label_prop = knn_label_prop.predict_proba(z)[:, 1]
    except Exception:
        acc_knn_label_prop = np.nan
        print('knn label prop did not work')

    # Baseline: Label Spreading with k-NN weights
    try:
        knn_label_spread = LabelSpreading(kernel='knn', n_neighbors=11)
        knn_label_spread.fit(x_merged, y_merged)
        acc_knn_label_spread = knn_label_spread.score(z, z_y)
        # hard_label_knn_label_spread = knn_label_spread.predict(z)
        # soft_label_knn_label_spread = knn_label_spread.predict_proba(z)[:, 1]
    except Exception:
        acc_knn_label_spread = np.nan
        print('knn label spread did not work')

    # Generative Models
    # Semi-generative model on labelled data only (converged=True skips the EM loop)
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(x_c, y, x_e, z_c, z_e, converged=True)
    soft_label_semigen = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1)
    hard_label_semigen = soft_label_semigen > 0.5
    acc_semigen_labelled = np.mean(hard_label_semigen == z_y)

    # EM with soft labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(x_c, y, x_e, z_c, z_e)
    soft_label_soft_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1)
    hard_label_soft_EM = soft_label_soft_EM > 0.5
    acc_soft_EM = np.mean(hard_label_soft_EM == z_y)

    # EM with hard labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = hard_label_EM(x_c, y, x_e, z_c, z_e)
    soft_label_hard_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1)
    hard_label_hard_EM = soft_label_hard_EM > 0.5
    acc_hard_EM = np.mean(hard_label_hard_EM == z_y)

    # Conditional label prop
    acc_cond_prop = conditional_prop(x_c, y, x_e, z_c, z_y, z_e)

    return acc_lin_lr, acc_lin_tsvm, acc_rbf_tsvm, acc_rbf_label_prop, acc_rbf_label_spread, acc_knn_label_prop, \
        acc_knn_label_spread, acc_semigen_labelled, acc_soft_EM, acc_hard_EM, acc_cond_prop
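

if __name__ == '__main__':
    # Hedged smoke test on one synthetic draw from the assumed cause-effect
    # model; the sample sizes, dimensions, and noise scale are illustrative
    # assumptions, not the settings used in the experiments. Requires the
    # scikitTSVM dependency imported at the top of this file.
    rng = np.random.RandomState(0)
    n_lab, n_unl, d_c, d_e = 50, 200, 2, 2

    def draw(n):
        c = rng.randn(n, d_c)
        y = (rng.rand(n, 1) < sigmoid(c.dot(np.ones((d_c, 1))))).astype(float)
        e = c + y + 0.3 * rng.randn(n, d_e)
        return c, y, e

    x_c, y, x_e = draw(n_lab)
    z_c, z_y, z_e = draw(n_unl)
    accs = run_methods(x_c, y, x_e, z_c, z_y, z_e)
    print(accs)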