from numpy import mean, std
from sklearn.cluster import k_means

# prepareSeqs and partition are helpers defined elsewhere in this module.

def smythEmissionDistribution(pair):
    """
    Given a pair (S: list of sequences, target_m: int), get the emission
    distribution for Smyth's "default" HMM. target_m is an upper bound on
    the number of states -- if there are only m' distinct observation
    values, then the distribution for an m'-state HMM is returned.

    @param pair: A tuple of the form (S: list of sequences, target_m: int)
    @return: (B, labels, has_zero), where:
        * S', obs = concat(S), set(S)
        * m' = min(target_m, len(obs))
        * [C_0,...,C_{m'-1}] = result of clustering S' with k-means
        * labels: tells which cluster each item in merged goes into; i.e.,
          labels[i] = j, where S'[i] belongs to cluster C_j
        * B[i] = (mean(C_i), stddev(C_i))
        * has_zero = True if there is an i such that B[i][1] ~= 0.0
    """
    S, target_m = pair
    # merged: flat list of 1-d vectors; distinct: set of distinct
    # observation values
    merged, distinct = prepareSeqs(S)
    # m_prime is the smaller of target_m and the number of distinct
    # observation values
    m_prime = min(target_m, len(distinct))
    # k-means partitions merged into m_prime clusters [C_0,...,C_{m'-1}].
    # centroids = [c_0,...,c_{m'-1}]: cluster centers, where c_i is the
    #   center of C_i.
    # labels: tells which cluster each item in merged goes into; i.e.,
    #   labels[i] = j, where merged[i] belongs to cluster C_j.
    # inertia: sum of distances of samples to their closest cluster center,
    #   inertia = sum_{i=0}^{m'-1}(sum_{x in C_i} dist(x, c_i)).
    centroids, labels, inertia = k_means(merged, m_prime, init='k-means++')
    # partition arranges merged into a list of lists, each of which
    # contains the observations from one cluster:
    # clusters = [C_0,...,C_{m'-1}]
    clusters = partition(merged, labels)
    # Compute B and has_zero, where B[i] = (mean(C_i), stddev(C_i)) and
    # has_zero = True if some cluster has an effectively zero stddev.
    B = []
    has_zero = False
    for cluster in clusters:
        assert len(cluster) > 0
        mu = mean(cluster)
        stddev = std(cluster)
        B.append((mu, stddev))
        if stddev < 0.001:
            has_zero = True
    return (B, labels, has_zero)
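# The partition helper used above isn't shown in this file. A minimal
# sketch of its assumed behavior: group items by cluster label, one list
# per distinct label, in sorted label order (which works for both k-means'
# 0-based labels and fcluster's 1-based labels). The real implementation
# lives elsewhere in the repo and may differ.
from collections import defaultdict

def partition(items, labels):
    by_label = defaultdict(list)
    for item, label in zip(items, labels):
        by_label[label].append(item)
    return [by_label[label] for label in sorted(by_label)]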
def _kMedoids(self):
    """
    Create multiple partitions for k values in [self.min_k...self.max_k]
    via k-medoids.
    """
    self.dist_matrix = self._getDistMatrix()
    # One batch item per k: (distance matrix, cluster count, passes)
    batch_items = ((self.dist_matrix, k, 10) for k in self.k_values)
    printAndFlush("K-medoids clustering (parallel)...")
    results = self._doMap(kMedoids, batch_items)
    printAndFlush("done")
    for i in xrange(0, len(self.k_values)):
        k, result = self.k_values[i], results[i]
        labels, error, nfound = result
        self.labelings[k] = labels
        clusters = partition(self.S, labels)
        self.partitions[k] = clusters
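# Sketch of the kMedoids worker consumed by _doMap above. The
# (dist_matrix, k, npass) batch tuples and the (labels, error, nfound)
# result shape match Pycluster's kmedoids routine (Pycluster is mentioned
# in _hierarchical below); the repo's actual worker may differ.
import Pycluster

def kMedoids(args):
    dist_matrix, k, npass = args
    # clusterid[i] is the medoid index assigned to item i; error is the
    # within-cluster distance sum for the best of npass runs; nfound is
    # how many times that optimum was found.
    clusterid, error, nfound = Pycluster.kmedoids(dist_matrix, nclusters=k,
                                                  npass=npass)
    return (clusterid, error, nfound)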
def model(self):
    """
    With the user-specified k range, clustering algorithm, HMM
    initialization, and distance function, create a set of HMM mixtures
    modeling the sequences in self.S. When finished, self.components is
    populated with a dict mapping k values to HMM triples.
    """
    start = clock()
    # self._cluster()
    for k in self.k_values:
        clusters = partition(self.S, self.labelings[k])
        self.partitions[k] = clusters
    self._trainModels()
    self.times['total'] = clock() - start
    if not self.single_threaded:
        self.pool.close()
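# Hypothetical driver for the method above. The class name and constructor
# arguments are illustrative assumptions; only model(), components, and
# times come from this file.
#
#     modeler = SequenceModeler(S, min_k=2, max_k=10)  # hypothetical ctor
#     modeler.model()
#     for k, hmm_triple in modeler.components.iteritems():
#         print k, hmm_triple
#     print "total time:", modeler.times['total']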
def _hierarchical(self):
    """
    Create multiple partitions for k values in [self.min_k...self.max_k]
    via hierarchical, agglomerative clustering.
    """
    self.dist_matrix = self._getDistMatrix()
    printAndFlush("Hierarchical clustering (serial)...")
    # tree = treecluster(distancematrix=self.dist_matrix, method='m')
    # NOTE: scipy's linkage expects a condensed (1-d) distance matrix; if
    # _getDistMatrix returns a square matrix, it should be converted with
    # scipy.spatial.distance.squareform first, or scipy will treat the
    # rows as observation vectors.
    linkage_matrix = linkage(self.dist_matrix, method='complete')
    for k in self.k_values:
        # labels = tree.cut(k)
        labels = fcluster(linkage_matrix, k, 'maxclust')
        self.labelings[k] = labels
        clusters = partition(self.S, labels)
        # Technically, scipy's tree cutting function isn't guaranteed to
        # produce exactly k clusters. It only seems to fall short when the
        # distance matrix is very lopsided, as was the case before we used
        # log observations. With log observations, it's been fine, and it
        # performs better than Pycluster's analogous routine.
        if len(clusters) != k:
            raise ValueError("fcluster could only produce %i clusters!" %
                             len(clusters))
        self.partitions[k] = clusters
    printAndFlush("done")
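# A minimal, self-contained illustration of the linkage/fcluster calls used
# above, on toy 1-d data (not part of the repo):
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist

points = np.array([[0.0], [0.1], [5.0], [5.1], [9.9]])
condensed = pdist(points)                  # condensed distance matrix
Z = linkage(condensed, method='complete')  # complete-link agglomeration
labels = fcluster(Z, 3, 'maxclust')        # cut into at most 3 clusters
print labels                               # e.g. [1 1 2 2 3]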
        # Script fragment: this branch builds one record per sampled
        # sequence, undoing the log transform (exp(o) - 1) applied to the
        # observations. Relies on sys, cPickle, math.exp, and random.sample
        # imported at the top of the script.
        seq = list(model.sampleSingle(LEN, seed=j))
        create = 0
        destroy = LEN * WINDOW_SIZE
        records.append({
            'ident': (i, i),
            'create': create,
            'destroy': destroy,
            'relays_in': [],
            'relays_out': map(lambda o: max(0, exp(o) - 1), seq)
        })
elif mode == "-clusters":
    data_path = sys.argv[5]
    with open(data_path) as data_file:
        orig_records = cPickle.load(data_file)['records']
    labels = results['labelings'][k]
    clusters = partition(orig_records, labels)
    # Cap each cluster at 100 randomly sampled records
    sampled = map(lambda c: sample(c, 100) if len(c) > 100 else c, clusters)
    for i, cluster in enumerate(sampled):
        for record in cluster:
            record['ident'] = (i, i)
            records.append(record)

# Debug output: report the series length for cluster 6
for record in records:
    if record['ident'] == (6, 6):
        print len(record['relays_out'])

output = {
    'window_size': WINDOW_SIZE,
    'records': records
}
# Binary mode: pickle protocol 2 is a binary format
with open(out_path, 'wb') as outfile:
    cPickle.dump(output, outfile, protocol=2)
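# Sanity-check sketch (not in the original script): reload the dump and
# confirm the fields written above round-trip through cPickle.
with open(out_path, 'rb') as infile:
    data = cPickle.load(infile)
assert data['window_size'] == WINDOW_SIZE
print len(data['records']), "records reloaded"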