forked from blpercha/ebc
/
ebc.py
338 lines (288 loc) · 16.4 KB
/
ebc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
from collections import defaultdict
import random
import sys
import math
import numpy as np
from numpy.ma import divide, outer, sqrt
from numpy.random.mtrand import random_sample
from matrix import SparseMatrix
# Large finite stand-in for "infinity": used as the starting objective value in EBC.run and as the
# divergence score of a (row, cluster) pair whose approximate probability is zero in
# EBC.compute_clusters.
INFINITE = 1e10
class EBC:
    """ Co-clustering of a sparse, multidimensional joint probability distribution.

    Rows along every axis of the input matrix are assigned to clusters, and the assignments are
    refined axis by axis until the objective (a sum of p * log(p / q) over the nonzero elements,
    see calculate_objective) changes by less than a tolerance or the iteration limit is reached.
    """

    def __init__(self, matrix, n_clusters, max_iterations=10, jitter_max=1e-10, objective_tolerance=0.01):
        """ To initialize an EBC object.

        Args:
            matrix: an instance of SparseMatrix that represents the original distribution
            n_clusters: a list of number of clusters along each dimension
            max_iterations: the maximum number of iterations before we stop, default to be 10
            jitter_max: a small amount of value to add to probabilities to break ties when doing
                cluster assignments, default to be 1e-10
            objective_tolerance: the threshold of difference between two successive objective values
                for us to stop

        Return:
            None

        Raises:
            TypeError: if matrix is not a SparseMatrix.
            AssertionError: if the elements of matrix do not sum to 1 (to 7 significant digits).
            ValueError: if n_clusters does not have one entry per dimension of matrix.
        """
        if not isinstance(matrix, SparseMatrix):
            raise TypeError("Matrix argument to EBC is not instance of SparseMatrix.")

        # check to ensure matrix is a probability distribution
        np.testing.assert_approx_equal(matrix.sum(), 1.0, significant=7,
                                       err_msg='Matrix elements does not sum to 1. Please normalize your matrix.')

        # every axis is indexed through K[axis], so a length mismatch would only fail later and obscurely
        if len(n_clusters) != matrix.dim:
            raise ValueError("n_clusters has %d entries but the matrix has %d dimensions."
                             % (len(n_clusters), matrix.dim))

        self.pXY = matrix  # the joint probability distribution e.g. p(X,Y) - the original sparse matrix
        self.K = n_clusters  # numbers of clusters along each dimension (len(K) = dim)
        self.dim = self.pXY.dim  # overall dimension of the matrix
        self.max_it = max_iterations

        self.cXY = None  # list of list: cluster assignments along each dimension e.g. [C(X), C(Y), ...]
        self.pX = None  # marginal probabilities
        self.qXhatYhat = None  # the approximate distribution after clustering e.g. q(X',Y'); a SparseMatrix
        self.qXhat = None  # the marginal clustering distribution in a list e.g. [q(X'), q(Y'), ...]
        self.qXxHat = None  # the distribution conditioned on the clustering e.g. [q(X|X'), q(Y|Y'), ...]

        self.jitter_max = jitter_max  # amount to add to cluster assignment scores to break ties
        self.objective_tolerance = objective_tolerance  # the threshold for us to stop

    @staticmethod
    def _log(verbose, text):
        """ Write text to stdout (and flush) when verbose is truthy; do nothing otherwise. """
        if verbose:
            sys.stdout.write(text)
            sys.stdout.flush()

    def run(self, assigned_clusters=None, verbose=True):
        """ To run the ebc algorithm.

        Args:
            assigned_clusters: an optional list of list representing the initial assignment of clusters
                along each dimension. The list itself is not modified.
            verbose: whether to report progress on stdout.

        Return:
            cXY: a list of list of cluster assignments along each dimension e.g. [C(X), C(Y), ...]
            objective: the final objective value
            iterations: the number of iterations that the algorithm has run
        """
        self._log(verbose, "Running EBC on a %d-d sparse matrix with size %s ...\n"
                  % (self.dim, str(self.pXY.N)))

        # Step 1: initialization steps
        self.pX = self.calculate_marginals(self.pXY)
        if assigned_clusters:
            # (self.K,) keeps the format working when K is a tuple rather than a list
            self._log(verbose, "Using specified clusters, with cluster number on each axis: %s ...\n"
                      % (self.K,))
            # per-axis entries are replaced below, so copy the outer list to leave the caller's intact
            self.cXY = list(assigned_clusters)
        else:
            self._log(verbose, "Randomly initializing clusters, with cluster number on each axis: %s ...\n"
                      % (self.K,))
            self.cXY = self.initialize_cluster_centers(self.pXY, self.K)

        # Step 2: calculate cluster joint and marginal distributions
        self._refresh_cluster_distributions()

        # Step 3: iterate through dimensions, recalculating distributions
        last_objective = objective = INFINITE
        for t in range(self.max_it):
            self._log(verbose, "--> Running iteration %d " % (t + 1))
            for axis in range(self.dim):
                self.cXY[axis] = self.compute_clusters(self.pXY, self.qXhatYhat, self.qXhat, self.qXxHat,
                                                       self.cXY, axis)
                self.ensure_correct_number_clusters(self.cXY[axis], self.K[axis])  # check to ensure correct K
                self._refresh_cluster_distributions()
                self._log(verbose, ".")
            objective = self.calculate_objective()
            self._log(verbose, " objective value = %f\n" % (objective))
            if abs(objective - last_objective) < self.objective_tolerance:
                self._log(verbose, "EBC finished in %d iterations, with final objective value %.4f\n"
                          % (t + 1, objective))
                return self.cXY, objective, t + 1
            last_objective = objective

        self._log(verbose, "EBC finished in %d iterations, with final objective value %.4f\n"
                  % (self.max_it, objective))
        return self.cXY, objective, self.max_it  # hit max iterations - just return current assignments

    def _refresh_cluster_distributions(self):
        """ Recompute qXhatYhat, qXhat and qXxHat from the current cluster assignments in self.cXY. """
        self.qXhatYhat = self.calculate_joint_cluster_distribution(self.cXY, self.K, self.pXY)
        self.qXhat = self.calculate_marginals(self.qXhatYhat)  # the cluster marginals along each axis
        self.qXxHat = self.calculate_conditionals(self.cXY, self.pXY.N, self.pX, self.qXhat)

    def compute_clusters(self, pXY, qXhatYhat, qXhat, qXxhat, cXY, axis):
        """ Compute the best cluster assignment along a single axis, given all the distributions and
        clusters on other axes.

        Args:
            pXY: the original probability distribution matrix
            qXhatYhat: the joint distribution over the clusters
            qXhat: the marginal distributions of qXhatYhat
            qXxhat: the distribution conditioned on the clustering in a list
            cXY: current cluster assignments along each dimension
            axis: the axis (dimension) over which clusters are being computed

        Return:
            Best cluster assignment along a single axis as a list

        Raises:
            TypeError: if pXY or qXhatYhat is not a SparseMatrix.
        """
        if not isinstance(pXY, SparseMatrix) or not isinstance(qXhatYhat, SparseMatrix):
            raise TypeError("Arguments to compute_clusters not an instance of SparseMatrix.")

        # To assign clusters, we calculate argmin_xhat D(p(Y,Z|x) || q(Y,Z|xhat)),
        # where D(P|Q) = \sum_i P_i log (P_i / Q_i)
        dPQ = np.zeros(shape=(pXY.N[axis], qXhatYhat.N[axis]))

        nonzero = pXY.nonzero_elements
        all_axes = range(self.dim)
        other_axes = [i for i in all_axes if i != axis]
        candidate_clusters = range(self.K[axis])

        # iterate though all non-zero elements; here we are making use of the sparsity to reduce computation
        for coords in nonzero:
            coord_this_axis = coords[axis]
            px = self.pX[axis][coord_this_axis]
            # calculate p(y|x) = p(x,y)/p(x), but we should be careful if px == 0
            p_i = 1 if px == 0 else nonzero[coords] / px

            current_cluster_assignments = [cXY[i][coords[i]] for i in all_axes]  # clusters on each axis
            # q(y|yhat) factors of the other axes do not depend on the candidate cluster
            other_factors = [qXxhat[i][coords[i]] for i in other_axes]

            for xhat in candidate_clusters:
                current_cluster_assignments[axis] = xhat  # temporarily assign this axis to xhat
                current_qXhatYhat = qXhatYhat.get(tuple(current_cluster_assignments))
                current_qXhat = qXhat[axis][xhat]
                if current_qXhatYhat == 0 and current_qXhat == 0:
                    q_i = 0  # Here we define 0/0=0
                else:
                    q_i = 1.0 * current_qXhatYhat / current_qXhat
                for factor in other_factors:
                    q_i *= factor

                if q_i == 0:  # this can definitely happen if cluster joint distribution has zero element
                    dPQ[coord_this_axis, xhat] = INFINITE
                else:
                    dPQ[coord_this_axis, xhat] += p_i * math.log(p_i / q_i)

        # add random jitter to break ties
        dPQ += self.jitter_max * random_sample(dPQ.shape)
        return list(dPQ.argmin(1))  # return the closest cluster assignment under KL-divergence

    def calculate_marginals(self, pXY):
        """ Calculate the marginal probabilities given a joint distribution.

        Args:
            pXY: sparse matrix over which marginals are calculated, e.g. P(X,Y)

        Return:
            marginals: the calculated marginal probabilities, one list per axis, e.g. [P(X), P(Y)]

        Raises:
            TypeError: if pXY is not a SparseMatrix.
        """
        if not isinstance(pXY, SparseMatrix):
            raise TypeError("Illegal argument to marginal calculation: " + str(pXY))

        marginals = [[0] * Ni for Ni in pXY.N]
        nonzero = pXY.nonzero_elements
        for coords in nonzero:
            value = nonzero[coords]
            for axis, index in enumerate(coords):
                marginals[axis][index] += value
        return marginals

    def calculate_joint_cluster_distribution(self, cXY, K, pXY):
        """ Calculate the joint cluster distribution q(X',Y') using the current prob distribution and
        cluster assignments. (Here we use X' to denote X_hat)

        Args:
            cXY: current cluster assignments for each axis
            K: numbers of clusters along each axis
            pXY: original probability distribution matrix

        Return:
            qXhatYhat: the joint cluster distribution

        Raises:
            TypeError: if pXY is not a SparseMatrix.
        """
        if not isinstance(pXY, SparseMatrix):
            raise TypeError("Matrix argument to calculate_joint_cluster_distribution not an instance of "
                            "SparseMatrix.")

        qXhatYhat = SparseMatrix(K)  # joint distribution over clusters
        nonzero = pXY.nonzero_elements
        for coords in nonzero:
            # find the coordinates of the cluster for this element
            cluster_coords = tuple(cXY[i][index] for i, index in enumerate(coords))
            qXhatYhat.add_value(cluster_coords, nonzero[coords])
        return qXhatYhat

    def calculate_conditionals(self, cXY, N, pX, qXhat):
        """ Calculate the conditional marginal distributions given the clustering distribution, i.e. q(X|X').

        Args:
            cXY: current cluster assignments
            N: lengths of each dimension in the original data matrix
            pX: marginal distributions over original data matrix
            qXhat: marginal distributions over cluster joint distribution

        Return:
            conditional_distributions: a list of distribution for each axis, with each element being a
                list of prob for i-th row/column in this axis.
        """
        conditional_distributions = [[0] * Ni for Ni in N]
        for i, assignments in enumerate(cXY):
            for j, cluster in enumerate(assignments):
                if pX[i][j] == 0 and qXhat[i][cluster] == 0:
                    conditional_distributions[i][j] = 0  # 0/0 is defined as 0
                else:
                    conditional_distributions[i][j] = pX[i][j] / qXhat[i][cluster]
        return conditional_distributions

    def initialize_cluster_centers(self, pXY, K):
        """ Initializes the cluster assignments along each axis, by first selecting k centers,
        and then map each row to its closest center under cosine similarity.

        Args:
            pXY: original data matrix
            K: numbers of clusters desired in each dimension

        Return:
            new_C: a list of list of cluster id that the current index in the current axis is assigned to.

        Raises:
            TypeError: if pXY is not a SparseMatrix.
        """
        if not isinstance(pXY, SparseMatrix):
            raise TypeError("Matrix argument to initialize_cluster_centers is not an instance of SparseMatrix.")

        nonzero = pXY.nonzero_elements
        new_C = [[-1] * Ni for Ni in pXY.N]

        for axis in range(len(K)):  # loop over each dimension
            # choose cluster centers
            axis_length = pXY.N[axis]
            center_indices = random.sample(range(axis_length), K[axis])
            cluster_ids = {}  # axis index of a center -> cluster identifier
            for i in range(K[axis]):  # assign identifiers to clusters
                cluster_ids[center_indices[i]] = i

            centers = defaultdict(lambda: defaultdict(float))  # all nonzero indices for each center
            for coords in nonzero:
                coord_this_axis = coords[axis]
                if coord_this_axis in cluster_ids:  # is a center
                    # coords without the current axis
                    reduced_coords = tuple([coords[i] for i in range(len(coords)) if i != axis])
                    # (cluster_id, other coords) -> value
                    centers[cluster_ids[coord_this_axis]][reduced_coords] = nonzero[coords]

            # assign rows to clusters
            scores = np.zeros(shape=(pXY.N[axis], K[axis]))  # scores: axis_size x cluster_number
            denoms_P = np.zeros(shape=(pXY.N[axis]))
            denoms_Q = np.zeros(shape=(K[axis]))
            # NOTE(review): the norms below are accumulated only over coordinates shared with a center,
            # and denoms_P is incremented once per overlapping center - confirm this is intended.
            for coords in nonzero:
                coord_this_axis = coords[axis]
                if coord_this_axis in cluster_ids:  # dict lookup; same keys as center_indices
                    continue  # don't reassign cluster centers, please
                reduced_coords = tuple([coords[i] for i in range(len(coords)) if i != axis])
                for cluster_index in cluster_ids:
                    xhat = cluster_ids[cluster_index]  # need cluster ID, not the axis index
                    if reduced_coords in centers[xhat]:  # overlapping point
                        P_i = nonzero[coords]
                        Q_i = centers[xhat][reduced_coords]
                        scores[coord_this_axis][xhat] += P_i * Q_i  # now doing based on cosine similarity
                        denoms_P[coord_this_axis] += P_i * P_i  # magnitude of this slice of original matrix
                        denoms_Q[xhat] += Q_i * Q_i  # magnitude of cluster centers

            # normalize scores (numpy.ma.divide, so the result is a masked array)
            scores = divide(scores, outer(sqrt(denoms_P), sqrt(denoms_Q)))
            scores[scores == 0] = -1.0

            # add random jitter to scores to handle tie-breaking
            scores += self.jitter_max * random_sample(scores.shape)
            new_cXYi = list(scores.argmax(1))  # this needs to be argmax because cosine similarity

            # make sure to assign the cluster centers to themselves
            for center_index in cluster_ids:
                new_cXYi[center_index] = cluster_ids[center_index]

            # ensure numbers of clusters are correct
            self.ensure_correct_number_clusters(new_cXYi, K[axis])
            new_C[axis] = new_cXYi

        return new_C

    def ensure_correct_number_clusters(self, cXYi, expected_K):
        """ To ensure a cluster assignment actually has the expected total number of clusters.

        Every cluster id in range(expected_K) that is absent is written over a randomly chosen position;
        this repeats until no id is missing (a random write may empty another cluster).

        Args:
            cXYi: the input cluster assignment
            expected_K: expected number of clusters on this axis

        Return:
            None. The assignment will be changed in place in cXYi.

        Raises:
            ValueError: if expected_K exceeds len(cXYi), in which case no valid assignment exists.
        """
        if expected_K > len(cXYi):
            raise ValueError("Cannot represent %d clusters with only %d elements." % (expected_K, len(cXYi)))

        last_index = len(cXYi) - 1
        while True:
            clusters_represented = set(cXYi)
            missing = [c for c in range(expected_K) if c not in clusters_represented]
            if not missing:
                return
            for c in missing:
                cXYi[random.randint(0, last_index)] = c

    def calculate_objective(self):
        """ Calculate the value of the objective function given the current cluster assignments.

        Return:
            objective: the objective function value, sum of p * log(p / q) over the nonzero elements

        Raises:
            ZeroDivisionError: if the approximate distribution is zero at a nonzero element.
        """
        objective = 0.0
        nonzero = self.pXY.nonzero_elements
        for coords in nonzero:
            pXY_i = nonzero[coords]
            qXY_i = self.get_element_approximate_distribution(coords)
            if qXY_i == 0:
                raise ZeroDivisionError("Approximate distribution is zero at %s where the original "
                                        "distribution is %s." % (str(coords), str(pXY_i)))
            objective += pXY_i * math.log(pXY_i / qXY_i)
        return objective

    def get_element_approximate_distribution(self, coords):
        """ Get the distribution approximated by q(X,Y) at the given coordinates.

        Args:
            coords: the coordinates of one element of the original matrix

        Return:
            the cluster joint probability of the element's clusters, multiplied by the conditional
            q(x|xhat) of the element's index on every axis
        """
        clusters = [self.cXY[i][index] for i, index in enumerate(coords)]
        element = self.qXhatYhat.get(tuple(clusters))
        for i, index in enumerate(coords):
            element *= self.qXxHat[i][index]
        return element
def main():
    """ Example run of EBC on the sparse matrix file under resources/. """
    records = []
    with open("resources/matrix-ebc-paper-sparse.tsv", "r") as handle:
        for raw_line in handle:
            fields = raw_line.split("\t")
            # lines with fewer than five tab-separated fields are treated as headers and skipped
            if len(fields) >= 5:
                records.append([fields[0], fields[2], float(fields[4])])

    sparse = SparseMatrix([14052, 7272])
    sparse.read_data(records)
    sparse.normalize()

    clusterer = EBC(sparse, [30, 125], 10, 1e-10, 0.01)
    # the result is unpacked (and otherwise unused), exactly as a three-item return is expected
    cXY, objective, it = clusterer.run()


if __name__ == "__main__":
    main()