"""
A parallel implementation of Smyth 1997's HMM clustering algorithm
for 1-dimensional time series data.
Syntax: python smyth.py infile target_m min_k max_k outpath
Models for the time series in infile are built and pickled to outpath. target_m,
min_k, and max_k are parameters for the modeling process. See HMMCluster's
documentation for explanations.
@author: Julian Applebaum
"""
from ghmm import Alphabet
from sklearn.cluster import k_means
from fastcluster import linkage
from Pycluster import kmedoids, treecluster
from scipy.cluster.hierarchy import fcluster
from scipy.spatial.distance import squareform
from numpy import std, mean, array, float32
from sample_gen import smyth_example
from cluster_utils import partition
from sequence_utils import *
from hmm_utils import *
from matrix_utils import uniformMatrix
# from levenshtein import levDistance
from pprint import pprint
from math import isnan
from multiprocessing import Pool
from itertools import izip, islice, ifilter
from time import clock
from random import uniform
import sys, cPickle
# Minimum standard deviation for a state in the clustering phase. Anything
# less than this leaves log likelihood prone to underflow errors.
EPSILON = .25
def validateTriple(triple):
"""
Check that a HMM has valid probability distributions.
@param triple: a triple (A, B, pi)
@return: True if there are no negative numbers and all distributions
sum to 1 (within .0001 tolerance), False otherwise
"""
A, B, pi = triple
for row in A:
if abs(sum(row)-1.0) > .0001:
raise ValueError('Row in A does not sum to 1: ' + str(A))
for a in row:
if a < 0: raise ValueError('Entry in A is negative: ' + str(A))
if abs(sum(pi)-1.0) > .0001:
raise ValueError('pi does not sum to 1: ' + str(pi))
for p in pi:
if p < 0: raise ValueError('pi has negative entry: ' + str(pi))
	for (_, stddev) in B:
		if stddev < .1:
			raise ValueError('B has near-zero std. dev. entry: ' + str(B))
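# Illustrative example (assumed values): a well-formed 2-state triple that
# validateTriple accepts silently --
#   A  = [[0.6, 0.4], [0.5, 0.5]]   # each row sums to 1
#   B  = [(0.0, 1.0), (3.0, 0.5)]   # (mean, stddev) per state, stddev >= .1
#   pi = [0.5, 0.5]                 # sums to 1
#   validateTriple((A, B, pi))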
def correctDMMTransitions(A):
"""
	Given a Discrete Markov Model, a state that was never exited during
	training has undefined (all-zero) outgoing transitions. Give any such
	state a uniform distribution; rows that are already defined are left
	alone.
	@param A: The transition matrix
	@return: The "corrected" transition matrix
"""
for i in xrange(0, len(A)):
if sum(A[i]) == 0:
A[i] = [1.0/len(A)] * len(A)
return A
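# For example, a never-exited final state shows up as an all-zero row:
#   correctDMMTransitions([[0.5, 0.5], [0.0, 0.0]])
#   -> [[0.5, 0.5], [0.5, 0.5]]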
def printAndFlush(string):
"""
Print a string and flush stdout.
"""
print string
sys.stdout.flush()
# These functions really belong as methods of HMMCluster, but we need to leave
# them at the module level for multiprocessing.
def prepareSeqs(S):
"""
Combine the observations from a set of sequences into a merged list of
1d vectors, and get the set of distinct observation values in one pass.
@param S: the set of sequences
@return: A pair (merged, distinct)
"""
distinct = set()
merged = []
for s in S:
for o in s:
merged.append([o])
distinct.add(o)
return (merged, distinct)
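# e.g. prepareSeqs([[1, 2], [2, 3]]) -> ([[1], [2], [2], [3]], set([1, 2, 3]))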
def smythEmissionDistribution(pair):
"""
Given a pair (S: list of sequences, target_m: int), get the emission
distribution for Smyth's "default" HMM. target_m is an upper bound on the
	number of states -- if there are only m' < target_m distinct observation
	values, then the distribution for an m'-state HMM is returned.
@param pair: A tuple of the form (S: list of sequences, target_m: int)
@return: (B, labels, has_zero), where:
* S', obs = concat(S), set(S)
* m' = min(target_m, len(obs))
* [C_0,...,C_{m'-1}] = result of clustering S' with k-means.
* labels: tells which cluster each item in merged goes into; i.e.,
labels[i] = j, where S'[i] belongs to cluster C_j.
* B[i] = (mean(C_i), stddev(C_i)).
* has_zero = True if there is i such that B[i][1] ~= 0.0.
"""
S, target_m = pair
# merged list of 1d vectors, set of distinct observation values
merged, distinct = prepareSeqs(S)
# m_prime is min of either target_m or the number of distinct obs values
m_prime = min(target_m, len(distinct))
# k-means partitions merged into m_prime clusters [C_0,...,C_{m'-1}].
# centroids = [c_0,...,c_{m'-1}]: cluster centers; i.e., c_i is the center
# of C_j.
# labels: tells which cluster each item in merged goes into; i.e.,
# labels[i] = j, where merged[i] belongs to cluster C_j.
# inertia: sum of distances of samples to closest cluster center
# inertia = sum_{i=0}^{m'-1}(sum_{x in C_i} dist(x, c_i)).
centroids, labels, inertia = k_means(merged, m_prime, init='k-means++')
# takes labels and arranges merged into
# a list of lists, each of which contains the series from one cluster
# clusters = [C_0,..,C_{m'-1}]
clusters = partition(merged, labels)
# Compute (B, labels, has_zero), where
# B[i] = (mean(C_i), stddev(C_i)).
# has_zero = True if there is i such that B[i][1] ~= 0.0.
B = []
has_zero = False
for cluster in clusters:
assert len(cluster) > 0
mu = mean(cluster)
stddev = std(cluster)
B.append((mu, stddev))
if stddev < 0.001:
has_zero = True
return (B, labels, has_zero)
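# Illustrative example (assumed observation values): with target_m = 2,
#   smythEmissionDistribution(([[0, 0, 10, 10]], 2))
# k-means splits the merged observations into clusters around 0 and 10, so B
# is roughly [(0.0, 0.0), (10.0, 0.0)] (in some order) and has_zero is True.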
def trainHMM(pair):
"""
Given a pair (S: list of sequences, target_m: int), train an HMM triple on S
with Baum-Welch with at most target_m states using Smyth's "default" method
for the initial HMM.
	If the observations in S can be clustered into target_m non-empty clusters,
	then the resulting model will have target_m states. Otherwise, the model
	will have one state per non-empty cluster, for however many clusters could
	be created.
A - Transition Probability matrix (N x N)
B - Observation Symbol Probablilty Distribution (N x M)
pi - Initial State Distribution Matrix (N x 1)
(N: # states in HMM, M: # observation symbols)
@param pair: A tuple of the form (S: list of sequences, target_m: int)
@return: The HMM as a (A, B, pi) triple
"""
cluster, target_m = pair
# get emission distribution B = [(mu, stddev), ...]
B, labels, has_zero = smythEmissionDistribution((cluster, target_m))
# also the number of clusters (created by k-means)
m_prime = len(B)
pi = [1.0/m_prime] * m_prime # ex: if m_prime = 4, pi = [0.25, 0.25, 0.25, 0.25]
	# Floor each stddev at EPSILON. Baum-Welch on a continuous HMM dies with
	# overflow errors when a state's stddev is ~0 (see the disabled DMM
	# fallback below), so we always apply the floor rather than branching on
	# has_zero / len(cluster) as earlier revisions did.
	B = map(lambda b: (b[0], max(b[1], EPSILON)), B)
	# Uniform transitions: an m_prime x m_prime matrix filled with
	# 1.0/m_prime, so each row sums to 1.
	A = uniformMatrix(m_prime, m_prime, 1.0/m_prime)
hmm = tripleToHMM((A, B, pi))
hmm.baumWelch(toSequenceSet(cluster))
A_p, B_p, pi_p = hmmToTriple(hmm)
B_p = map(lambda b: (b[0], max(b[1], EPSILON)), B_p)
validateTriple((A_p, B_p, pi_p))
	return (A_p, B_p, pi_p)
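	# The quoted block below is a disabled fallback, kept for reference: it
	# trains a Discrete Markov Model over cluster labels when a zero-stddev
	# state would otherwise make continuous Baum-Welch underflow.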
'''
else:
# If we have a state with zero standard deviation, Baum Welch dies on
# a continuous HMM with overflow errors. To fix this, we replace each
# observation with its cluster label, then train a Discrete Markov
# Model on these sequences. We don't get to reestimate B at all, but
# we do get to reestimate the dynamics. This heuristic is only
# employed for single element clusters.
# from hmm_utils. Returns an HMM built from matrices
hmm = discreteDefaultDMM(min(labels), max(labels))
seq_lens = [len(seq) for seq in cluster]
offset = 0
		label_seqs = [[] for seq in cluster] # initialize label_seqs
seq_idx = 0
for i, label in enumerate(labels):
if i == seq_lens[0] + offset:
offset += seq_lens.pop(0)
seq_idx += 1
label_seqs[seq_idx].append(label)
domain = Alphabet(range(min(labels), max(labels)+1))
hmm.baumWelch(toSequenceSet(label_seqs, domain))
A_p0, pi_p = getDynamics(hmm)
A_p = correctDMMTransitions(A_p0)
B_p = B
# According to the GHMM mailing list, a very small standard deviation
# can cause underflow errors when attempting to compute log likelihood.
# We avoid this by placing a floor sigma >= .5. It's a little hacky, but
# given the very fuzzy nature of our training data (considering network
# latency, etc.), it's not unreasonable to assume that "uniform"
# measurements could have some jitter. Any extra variance added to the
# cluster can always be corrected away with another round of Baum Welch.
# EPSILON = .5. b[1] = stddev
# make this if to encompass len(clusters) > 1??? and has_zero
#if len(cluster) == 1:
# B_p = map(lambda b: (b[0], max(b[1], EPSILON)), B)
triple = (A_p, B_p, pi_p)
validateTriple(triple)
return triple
'''
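# Example usage of trainHMM (hypothetical data): train an (at most) 2-state
# default HMM on two short sequences --
#   triple = trainHMM(([[1.0, 1.1, 5.0, 5.2], [0.9, 5.1]], 2))
#   hmm = tripleToHMM(triple)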
def randomDefaultTriple(pair):
	"""
	Given a pair (S: list of sequences, target_m: int), build a random
	initial HMM: a row-stochastic transition matrix, random (mu, stddev)
	emission pairs with means drawn from the observed value range, and a
	random initial distribution. The original left this as a stub; the body
	below is a minimal sketch of the 'random' initialization described in
	HMMCluster.__init__.
	@param pair: A tuple of the form (S: list of sequences, target_m: int)
	@return: The HMM as a (A, B, pi) triple
	"""
	S, target_m = pair
	merged, distinct = prepareSeqs(S)
	m_prime = min(target_m, len(distinct))
	def randDist(n):
		weights = [uniform(0, 1) for _ in xrange(n)]
		total = sum(weights)
		return [w/total for w in weights]
	A = [randDist(m_prime) for _ in xrange(m_prime)]
	pi = randDist(m_prime)
	low, high = min(distinct), max(distinct)
	B = [(uniform(low, high), max(std(merged), EPSILON))
		for _ in xrange(m_prime)]
	validateTriple((A, B, pi))
	return (A, B, pi)
def symDistance(args):
"""
Calculate Rabiner's symmetrized distance measure between two sequences
given their corresponding "default" models.
@param args: A pair ((seq1, triple1), (seq2, triple2)) where seq1 and
seq2 are singleton lists of emission sequences, and triple1, triple2
are the corresponding HMM triples.
@return: The distance between seq1 and seq2
"""
pair1, pair2 = args
seq1, triple1 = pair1
seq2, triple2 = pair2
hmm1 = tripleToHMM(triple1)
hmm2 = tripleToHMM(triple2)
s1_m2 = hmm2.loglikelihood(toSequence(seq1))
s2_m1 = hmm1.loglikelihood(toSequence(seq2))
"""
if s1_m2 > 0:
print seq1, hmm2
"""
assert s1_m2 <= 0, ("s1_m2=%f\nseq1=%s\nhmm2=%s\n" % (s1_m2, seq1, hmm2))
assert s2_m1 <= 0, ("s2_m1=%f" % s2_m1)
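	# Rabiner's symmetrized measure:
	# d(s1, s2) = (log P(s1 | hmm2) + log P(s2 | hmm1)) / 2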
return (s1_m2 + s2_m1)/2.0
def kMedoids(args):
"""
Do k-medoids clustering on a distance matrix.
@param args: A tuple of the form (dist_matrix, k, n_passes)
@return: The result tuple returned by Pycluster.kmedoids
"""
dist_matrix, k, n_passes = args
return kmedoids(dist_matrix, k, n_passes)
class HMMCluster(object):
def __init__(self, S, target_m, min_k, max_k, labels, dist_func='hmm',
hmm_init='smyth', n_jobs=None):
"""
@param S: The sequences to model
@param target_m: The desired number of components per HMM. The training
algorithm will attempt to create this many states, but
it is not guaranteed. See smythDefaultTriple for details.
@param min_k: The minimum number of mixture components to try
@param max_k: The maximum number of mixture components to try
		@param labels: Precomputed labelings: a dict mapping each k in
			[min_k, max_k] to a cluster labeling of S
@param dist_func: The distance function to use; either 'hmm' or
'editdistance'. 'hmm' is Rabiner's symmetrized measure.
		@param hmm_init: Either 'smyth' or 'random'. 'smyth' causes HMMs to
			be initialized with Smyth 1997's "default" method. 'random'
			results in random transition matrices, emission distributions,
			and initial state distributions.
		@param n_jobs: How many processes to spawn for parallel computations.
			If None, cpu_count() processes are created. If -1, all work is
			done in a single process.
"""
self.S = S
self.n = len(self.S)
self.target_m = target_m
self.min_k = min_k
self.max_k = max_k
self.labelings = labels
self.dist_func = dist_func
self.hmm_init = hmm_init
self._sanityCheck()
self.components = {}
self.composites = {}
self.dist_matrix = None
self.partitions = {}
self.k_values = range(self.min_k, self.max_k+1)
self.calc_ks = []
self.init_hmms = []
self.times = {}
self.single_threaded = n_jobs == -1
if not self.single_threaded:
self.pool = Pool(n_jobs)
def _sanityCheck(self):
assert self.min_k <= self.max_k
assert self.dist_func in ('hmm', 'editdistance')
assert self.hmm_init in ('smyth', 'random')
assert len(self.labelings) > 0
def _getHMMBatchItems(self):
for i in xrange(0, self.n):
for j in xrange(1+i, self.n):
pair_1 = (self.S[i][0], self.init_hmms[i])
pair_2 = (self.S[j][0], self.init_hmms[j])
yield (pair_1, pair_2)
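		# Pairs are yielded row-major for i < j, the same order as a scipy
		# condensed distance matrix, so the distances computed from them can
		# be fed directly to fastcluster.linkage in _getHMMDistMatrix.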
def _doMap(self, func, items):
if self.single_threaded:
return map(func, items)
else:
return self.pool.map(func, items)
def _getHMMDistMatrix(self):
"""
Compute the distance matrix using Rabiner's HMM distance measure.
"""
# Train an HMM for each sequence in S in parallel. hmm_init and init_fn
# are poor name choices and need to be changed.
if self.hmm_init == 'smyth':
init_fn = trainHMM
elif self.hmm_init == 'random':
init_fn = randomDefaultTriple
printAndFlush("Generating default HMMs (parallel)...")
start = clock()
		# One initial HMM per sequence
		self.init_hmms = self._doMap(init_fn,
			(([s[0]], self.target_m) for s in self.S))
self.times['init_hmms'] = clock() - start
printAndFlush("done")
# Compute distance matrix in parallel (in batches of 500,000).
		n_batchitems = self.n*(self.n-1)/2  # number of pairs i < j
condensed = []
printAndFlush("Computing distance matrix (parallel)...")
printAndFlush("Processing %i batch items" % n_batchitems)
		start = clock()
		# Split the distance matrix calculation into mini batches of 500,000
		# pairs to avoid a bug in the multiprocessing API.
		batch_size = 500000
		for i in xrange(0, n_batchitems/batch_size + 1):
			# Don't reuse `start` here -- it holds the clock() value.
			lo, hi = batch_size*i, min(batch_size*(i+1), n_batchitems)
			printAndFlush("Items %i-%i" % (lo, hi))
			dist_batch = self._getHMMBatchItems()
			mini_batch = islice(dist_batch, lo, hi)
			condensed += self._doMap(symDistance, mini_batch)
		self.times['distance_matrix'] = clock() - start
printAndFlush("done")
		# Log-likelihoods are <= 0; distances must be non-negative, so negate.
		shifted = map(lambda l: -1*l, condensed)
printAndFlush("Minimum distance: %f" % min(shifted))
printAndFlush("Maximum distance: %f" % max(shifted))
return array(shifted, float32)
# def _getEditDistMatrix(self):
# """
# Compute the distance matrix using edit distance between sequences.
# """
# dist_batch = []
# for i in xrange(0, self.n):
# for j in xrange(1+i, self.n):
# dist_batch.append((self.S[i], self.S[j]))
# printAndFlush("Computing distance matrix (parallel)...")
# start = clock()
# condensed = self._doMap(levDistance, dist_batch)
# self.times['distance_matrix'] = clock() - start
# printAndFlush("done")
# return condensed
	def _getDistMatrix(self):
		"""
		Compute the distance matrix with a user specified distance function.
		"""
		if self.dist_func == 'hmm':
			condensed = self._getHMMDistMatrix()
		elif self.dist_func == 'editdistance':
			# _getEditDistMatrix and its levDistance import are currently
			# commented out above; re-enable both before using this path.
			raise NotImplementedError("'editdistance' support is disabled")
		return condensed
def _hierarchical(self):
"""
Create multiple partitions for k values in [self.min_k... self.max_k]
via hierarchical, agglomerative clustering.
"""
self.dist_matrix = self._getDistMatrix()
printAndFlush("Hierarchical clustering (serial)...")
# tree = treecluster(distancematrix=self.dist_matrix, method='m')
linkage_matrix = linkage(self.dist_matrix, method='complete')
for k in self.k_values:
# labels = tree.cut(k)
labels = fcluster(linkage_matrix, k, 'maxclust')
self.labelings[k] = labels
clusters = partition(self.S, labels)
# Technically, scipy's tree cutting function isn't guaranteed to
# produce k clusters. It only seems to do this when there's a very
# lopsided distance matrix, as was the case before we used log
# observations. With log observations, it's been fine, and it
# performs better than Pycluster's analogous routine.
if len(clusters) != k:
raise ValueError("fcluster could only produce %i clusters!" %
len(clusters))
self.partitions[k] = clusters
printAndFlush("done")
def _kMedoids(self):
"""
Create multiple partitions for k values in [self.min_k... self.max_k]
via k-medoids.
"""
self.dist_matrix = self._getDistMatrix()
batch_items = ((self.dist_matrix, k, 10) for k in self.k_values)
printAndFlush("K-medoids clustering (parallel)...")
results = self._doMap(kMedoids, batch_items)
printAndFlush("done")
for i in xrange(0, len(self.k_values)):
k, result = self.k_values[i], results[i]
labels, error, nfound = result
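			# Note: Pycluster's kmedoids labels each item with the index of
			# its cluster's medoid, not 0..k-1; partition() is assumed to
			# group items by distinct label value, so this is fine.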
self.labelings[k] = labels
clusters = partition(self.S, labels)
			self.partitions[k] = clusters
	def _cluster(self):
		"""
		Create multiple partitions for k values in [self.min_k... self.max_k]
		with a user specified clustering algorithm. Note: this method reads
		self.clust_alg, which __init__ does not currently set; model() uses
		the precomputed labelings instead of calling this.
		"""
start = clock()
if self.clust_alg == 'hierarchical':
self._hierarchical()
elif self.clust_alg == 'kmedoids':
self._kMedoids()
self.times['clustering'] = clock() - start
def _trainModels(self):
"""
Train a HMM mixture on each of the k-partitions by separately training
an HMM on each cluster.
"""
batch_items = []
		cluster_sizes = [] # size of each cluster
		# Lengths of the time series stand in for the series themselves,
		# e.g. [[1, 2, 3, 4], [2, 3], [3, 2, 4, 1, 1]] -> [4, 2, 5]
		seq_lens = []
		cluster_ips = []
		# Build a list of mapping items to submit as a bulk job for each
		# value of k (each candidate number of clusters)
for k in self.k_values:
clusSeen = 0
			# Grab the partition with the time series split into k clusters
			# (named to avoid shadowing the imported partition() function)
			k_partition = self.partitions[k]
			for cluster in k_partition:
series = []
ips = []
for item in cluster:
series.append(item[0])
ips.append(item[1])
cluster_sizes.append(len(series))
seq_lens.append(map(lambda s: len(s), series))
batch_items.append((series, self.target_m))
cluster_ips.append(ips)
clusSeen += 1
self.calc_ks.append(clusSeen)
# initialize components[k]
for k in self.k_values:
self.components[k] = {
'hmm_triples': [],
'cluster_sizes': [],
'seq_lens': [],
'cluster_ips': []
}
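		# e.g. after training, components[k]['hmm_triples'] will hold one
		# trained (A, B, pi) triple per cluster in the k-cluster partition.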
printAndFlush("Training components on clusters (parallel)...")
start = clock()
hmm_triples = self._doMap(trainHMM, batch_items)
self.times['modeling'] = clock() - start
printAndFlush("done")
idx = 0
		# Reconstruct the mixtures for each k from the list of trained HMMs.
		# Some algorithms may produce fewer clusters than requested, so use
		# the actual cluster counts recorded in calc_ks.
		k_idx = 0
		for k in self.k_values:
			for i in xrange(0, self.calc_ks[k_idx]):
cluster_size = cluster_sizes[idx]
inclust_seq_lens = seq_lens[idx]
hmm_triple = hmm_triples[idx]
cluster_ip = cluster_ips[idx]
self.components[k]['hmm_triples'].append(hmm_triple)
self.components[k]['cluster_sizes'].append(cluster_size)
self.components[k]['seq_lens'].append(inclust_seq_lens)
self.components[k]['cluster_ips'].append(cluster_ip)
idx += 1
			k_idx += 1
print "done"
def model(self):
"""
		With the user specified k range, clustering algorithm, HMM initialization,
and distance function, create a set of HMM mixtures modeling the
sequences in self.S. When finished, self.components is populated with a
dict mapping k values to HMM triples.
"""
start = clock()
# self._cluster()
for k in self.k_values:
clusters = partition(self.S, self.labelings[k])
			self.partitions[k] = clusters
self._trainModels()
self.times['total'] = clock() - start
if not self.single_threaded:
self.pool.close()
if __name__ == "__main__":
print "Generating synthetic data...",
seqSet = smyth_example(Ns=(50, 200), lengths=(200, 200), seed=9)
print "done"
	# HMMCluster requires a labeling for each k. Assumption: smyth_example
	# returns the 50 sequences from its first generating HMM followed by the
	# 200 from its second, so a k=2 labeling is known up front.
	labels = {2: [1]*50 + [2]*200}
	clust = HMMCluster(seqSetToList(seqSet), 2, 2, 2, labels)
clust.model()
hmm = tripleToHMM(compositeTriple(clust.components[2]))
print hmm
hmm.baumWelch(seqSet)
print hmm