def discreteDefaultDMM(min_label, max_label):
    """
    Build a discrete Markov model over the integer labels
    [min_label, max_label] with uniform transition and initial
    distributions and an identity emission matrix, i.e. state i always
    emits label i.

    @param min_label: The smallest integer label
    @param max_label: The largest integer label
    @return: A GHMM discrete HMM
    """
    sigma = ghmm.IntegerRange(min_label, max_label+1)
    alpha_len = max_label - min_label + 1
    A = uniformMatrix(alpha_len, alpha_len, 1.0/alpha_len)
    B = uniformMatrix(alpha_len, alpha_len)
    for i in xrange(0, alpha_len):
        B[i, i] = 1
    pi = [1.0/alpha_len]*alpha_len
    distr = ghmm.DiscreteDistribution(sigma)
    return ghmm.HMMFromMatrices(sigma, distr, A, B, pi)
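# Illustrative sketch of what discreteDefaultDMM produces (assumes ghmm is
# importable and uniformMatrix zero-fills when no fill value is given): for
# labels 0..2 we get a 3-state model with uniform dynamics and identity
# emissions, so Baum-Welch can only reestimate A and pi, never B.
#
#     dmm = discreteDefaultDMM(0, 2)
#     # A  = [[1/3, 1/3, 1/3]] * 3
#     # B  = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
#     # pi = [1/3, 1/3, 1/3]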
def trainHMM(pair):
    """
    Given a pair (S: list of sequences, target_m: int), initialize a HMM
    triple with at most target_m states using Smyth's "default" method,
    then reestimate it with Baum-Welch. If the observations in S can be
    clustered into target_m non-empty clusters, the resulting model will
    have target_m states. Otherwise, the model will have one state per
    non-empty cluster, for however many clusters could be created.

    @param pair: A tuple of the form (S: list of sequences, target_m: int)
    @return: The HMM as a (A, B, pi) triple
    """
    cluster, target_m = pair
    B, labels, has_zero = smythEmissionDistribution((cluster, target_m))
    m_prime = len(B)
    pi = [1.0/m_prime] * m_prime
    label_seqs = None
    if not has_zero or len(cluster) > 1:
        A = uniformMatrix(m_prime, m_prime, 1.0/m_prime)
        hmm = tripleToHMM((A, B, pi))
        hmm.baumWelch(toSequenceSet(cluster))
        A_p, B_p, pi_p = hmmToTriple(hmm)
    else:
        # If we have a state with zero standard deviation, Baum-Welch dies
        # on a continuous HMM with overflow errors. To fix this, we replace
        # each observation with its cluster label, then train a discrete
        # Markov model on these label sequences. We don't get to reestimate
        # B at all, but we do get to reestimate the dynamics. This heuristic
        # is only employed for single element clusters.
        hmm = discreteDefaultDMM(min(labels), max(labels))
        seq_lens = [len(seq) for seq in cluster]
        offset = 0
        label_seqs = [[] for seq in cluster]
        seq_idx = 0
        # Split the flat list of labels back into one label sequence per
        # observation sequence.
        for i, label in enumerate(labels):
            if i == seq_lens[0] + offset:
                offset += seq_lens.pop(0)
                seq_idx += 1
            label_seqs[seq_idx].append(label)
        domain = Alphabet(range(min(labels), max(labels)+1))
        hmm.baumWelch(toSequenceSet(label_seqs, domain))
        A_p0, pi_p = getDynamics(hmm)
        A_p = correctDMMTransitions(A_p0)
        B_p = B
    # According to the GHMM mailing list, a very small standard deviation
    # can cause underflow errors when attempting to compute log likelihood.
    # We avoid this by placing a floor sigma >= .5. It's a little hacky, but
    # given the very fuzzy nature of our training data (considering network
    # latency, etc.), it's not unreasonable to assume that "uniform"
    # measurements could have some jitter. Any extra variance added to the
    # cluster can always be corrected away with another round of Baum-Welch.
    if len(cluster) == 1:
        B_p = map(lambda b: (b[0], max(b[1], EPSILON)), B)
    triple = (A_p, B_p, pi_p)
    if validateTriple(triple):
        return triple
    raise ValueError("Could not build a valid HMM! \n %s \n %s" % (
        tripleToHMM(triple), label_seqs))
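# Example usage (a sketch; the sequence values are made up, and it assumes
# toSequenceSet accepts plain Python lists of floats). Each element of the
# cluster is one observation sequence, and the second element of the pair
# caps the number of HMM states:
#
#     cluster = [[1.2, 1.4, 5.0, 5.1], [1.3, 4.9, 5.2]]
#     A, B, pi = trainHMM((cluster, 2))
#     # A is m' x m', B is a list of (mu, stddev) pairs, len(pi) == m',
#     # where m' <= 2 is the number of non-empty k-means clusters.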
def getDynamics(hmm):
    """
    Extract the transition matrix A and the initial state distribution pi
    from a GHMM model, leaving the emission parameters untouched.

    @param hmm: A GHMM model with N states
    @return: A pair (A, pi), where A is N x N and pi has length N
    """
    cmodel = hmm.cmodel
    A = uniformMatrix(cmodel.N, cmodel.N)
    pi = []
    for i in xrange(0, cmodel.N):
        state = cmodel.getState(i)
        pi.append(state.pi)
        for j in xrange(0, cmodel.N):
            A[i, j] = state.getOutProb(j)
    return (A, pi)
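# Sketch of the intended use (mirrors the discrete fallback branch in
# trainHMM above): after Baum-Welch on the discrete model, pull out only
# the reestimated dynamics and keep the original continuous emissions B:
#
#     A_p0, pi_p = getDynamics(hmm)   # A_p0 is N x N, len(pi_p) == N
#     A_p = correctDMMTransitions(A_p0)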
# Earlier draft of trainHMM, left commented out for reference.
'''
def trainHMM(pair):
    """
    Given a pair (S: list of sequences, target_m: int), train an HMM triple
    on S with Baum-Welch, with at most target_m states, using Smyth's
    "default" method for the initial HMM. If the observations in S can be
    clustered into target_m non-empty clusters, the resulting model will
    have target_m states. Otherwise, the model will have one state per
    non-empty cluster, for however many clusters could be created.

    A - Transition probability matrix (N x N)
    B - Observation symbol probability distribution (N x M)
    pi - Initial state distribution matrix (N x 1)
    (N: # states in HMM, M: # observation symbols)

    @param pair: A tuple of the form (S: list of sequences, target_m: int)
    @return: The HMM as a (A, B, pi) triple
    """
    cluster, target_m = pair
    # Get the emission distribution B = [(mu, stddev), ...]
    B, labels, has_zero = smythEmissionDistribution((cluster, target_m))
    # m_prime is also the number of clusters (created by k-means)
    m_prime = len(B)
    pi = [1.0/m_prime] * m_prime # ex: if m_prime = 4, pi = [0.25, 0.25, 0.25, 0.25]
    # change from "or" to "and"?
    # if the stddev is not zero and there is more than 1 item in the cluster:
    # if not has_zero and len(cluster) > 1:
    # m_prime x m_prime matrix filled with 1.0/m_prime -> each row sums to 1
    # Make sure stddev > EPSILON
    B = map(lambda b: (b[0], max(b[1], EPSILON)), B)
    # error if len(cluster) = 1
    # if len(cluster) > 1:
    A = uniformMatrix(m_prime, m_prime, 1.0/m_prime)
    hmm = tripleToHMM((A, B, pi))
    hmm.baumWelch(toSequenceSet(cluster))
    A_p, B_p, pi_p = hmmToTriple(hmm)
    B_p = map(lambda b: (b[0], max(b[1], EPSILON)), B_p)
    validateTriple((A_p, B_p, pi_p))
    return (A_p, B_p, pi_p)
'''
def conf_inter(sigma, n):
    """
    Full width of the two-sided 90% confidence interval for a mean with
    standard error sigma, estimated from n samples (t critical value at
    .05 per tail). For example, with n = 10 and sigma = 1.0 this is
    roughly 2 * 1.833 = 3.67.
    """
    return 2*stats.t.isf([.05], n-1)[0]*sigma

def get_diff(i, means):
    """Percent change of means[i] relative to means[i-1]."""
    return 100*(means[i]-means[i-1])/means[i-1]

if __name__ == "__main__":
    results_path = sys.argv[1]
    mode = sys.argv[2]
    with open(results_path) as results_file:
        results = cPickle.load(results_file)
    ks = sorted(set(map(lambda r: r[0], results)))
    ms = sorted(set(map(lambda r: r[1], results)))
    n_trials = len(set(map(lambda r: r[2], results)))
    km_means, km_trials = {}, {}
    sfc_zs = uniformMatrix(len(ms), len(ks))
    best_mean = float("-inf")
    global_best = float("-inf")
    best_m, best_k = None, None
    best_m_means, best_m_confs = [], []
    k_diffs = []
    # Accumulate the mean log likelihood and the per-trial likelihoods for
    # every (k, target_m) pair.
    for k in ks:
        for m in ms:
            km_means[(k, m)] = 0
            km_trials[(k, m)] = []
    for k, target_m, rand_seed, likelihood in results:
        km_means[(k, target_m)] += likelihood/n_trials
        km_trials[(k, target_m)].append(likelihood)
    for k in ks:
        for m in ms:
            mean_l = km_means[(k, m)]