Example #1
def discreteDefaultDMM(min_label, max_label):
	"""
	Build a discrete Markov model over the integer labels
	[min_label, max_label]: uniform transitions, uniform initial
	distribution, and an identity emission matrix (state i always
	emits label i).
	"""
	sigma = ghmm.IntegerRange(min_label, max_label+1)
	alpha_len = max_label - min_label + 1
	A = uniformMatrix(alpha_len, alpha_len, 1.0/alpha_len)
	B = uniformMatrix(alpha_len, alpha_len)
	for i in xrange(0, alpha_len):
		B[i, i] = 1
	pi = [1.0/alpha_len]*alpha_len
	distr = ghmm.DiscreteDistribution(sigma)
	return ghmm.HMMFromMatrices(sigma, distr, A, B, pi)
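Note: uniformMatrix is a project helper that is not shown on this page but is used throughout these examples. Judging by the call sites (tuple indexing like B[i, i] = 1), a plausible numpy-backed sketch, purely an assumption about the real helper, would be:

import numpy

def uniformMatrix(rows, cols, value=0.0):
	# Hypothetical reconstruction of the project's uniformMatrix helper:
	# a rows x cols array with every entry initialized to value.
	m = numpy.empty((rows, cols))
	m.fill(value)
	return m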
Example #2
def trainHMM(pair):
	"""
	Given a pair (S: list of sequences, target_m: int), initialize an
	HMM triple with at most target_m states using Smyth's "default" method.
	If the observations in S can be clustered into target_m non-empty clusters,
	then the resulting model will have target_m states. Otherwise, the model
	will have one state per non-empty cluster, for however many clusters could
	be created.

	@param pair: A tuple of the form (S: list of sequences, target_m: int)
	@return: The HMM as a (A, B, pi) triple
	"""
	cluster, target_m = pair
	label_seqs = None  # only populated by the discrete fallback below
	B, labels, has_zero = smythEmissionDistribution((cluster, target_m))
	m_prime = len(B)
	pi = [1.0/m_prime] * m_prime
	if not has_zero or len(cluster) > 1:
		A = uniformMatrix(m_prime, m_prime, 1.0/m_prime)
		hmm = tripleToHMM((A, B, pi))
		hmm.baumWelch(toSequenceSet(cluster))
		A_p, B_p, pi_p = hmmToTriple(hmm)
	else:
		# If we have a state with zero standard deviation, Baum Welch dies on
		# a continuous HMM with overflow errors. To fix this, we replace each
		# observation with its cluster label, then train a Discrete Markov
		# Model on these sequences. We don't get to reestimate B at all, but
		# we do get to reestimate the dynamics. This heuristic is only
		# employed for single element clusters.
		hmm = discreteDefaultDMM(min(labels), max(labels))
		seq_lens = [len(seq) for seq in cluster]
		offset = 0
		label_seqs = [[] for seq in cluster]
		seq_idx = 0
		for i, label in enumerate(labels):
			if i == seq_lens[0] + offset:
				offset += seq_lens.pop(0)
				seq_idx += 1
			label_seqs[seq_idx].append(label)
		domain = Alphabet(range(min(labels), max(labels)+1))
		hmm.baumWelch(toSequenceSet(label_seqs, domain))
		A_p0, pi_p = getDynamics(hmm)
		A_p = correctDMMTransitions(A_p0)
		B_p = B
	# According to the GHMM mailing list, a very small standard deviation
	# can cause underflow errors when attempting to compute log likelihood.
	# We avoid this by placing a floor sigma >= EPSILON (.5). It's a little
	# hacky, but given the very fuzzy nature of our training data (considering
	# network latency, etc.), it's not unreasonable to assume that "uniform"
	# measurements could have some jitter. Any extra variance added to the
	# cluster can always be corrected away with another round of Baum Welch.
	if len(cluster) == 1:
		B_p = map(lambda b: (b[0], max(b[1], EPSILON)), B_p)
	triple = (A_p, B_p, pi_p)
	if validateTriple(triple):
		return triple
	raise ValueError("Could not build a valid HMM! \n %s \n %s" % (
		tripleToHMM(triple), label_seqs))
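A minimal usage sketch (hypothetical: sequences stands for one cluster of observation sequences, and the helpers on this page are assumed importable):

# Hypothetical call: fit an HMM with at most 5 states to one cluster.
A, B, pi = trainHMM((sequences, 5))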
Example #3
def getDynamics(hmm):
	"""
	Read the transition matrix A and the initial state distribution pi
	back out of a trained GHMM model's underlying C model.
	"""
	cmodel = hmm.cmodel
	A = uniformMatrix(cmodel.N, cmodel.N)
	pi = []
	for i in xrange(0, cmodel.N):
		state = cmodel.getState(i)
		pi.append(state.pi)
		for j in xrange(0, cmodel.N):
			A[i,j] = state.getOutProb(j)
	return (A, pi)
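A toy round-trip using the helpers above (the literal label sequences are hypothetical):

toy = discreteDefaultDMM(0, 2)
label_seqs = [[0, 1, 2, 1], [2, 1, 0, 1]]  # hypothetical label sequences
toy.baumWelch(toSequenceSet(label_seqs, ghmm.IntegerRange(0, 3)))
A, pi = getDynamics(toy)  # reestimated transitions and initial probs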
Example #4
def trainHMM(pair):
	"""
	Given a pair (S: list of sequences, target_m: int), train an HMM triple
	on S with Baum-Welch, starting from an initial HMM with at most target_m
	states built by Smyth's "default" method.

	If the observations in S can be clustered into target_m non-empty clusters,
	then the resulting model will have target_m states. Otherwise, the model
	will have one state per non-empty cluster for however many clusters could
	be created.

	A - Transition probability matrix (N x N)
	B - Observation symbol probability distribution (N x M)
	pi - Initial state distribution (N x 1)
	(N: # states in HMM, M: # observation symbols)

	@param pair: A tuple of the form (S: list of sequences, target_m: int)
	@return: The HMM as a (A, B, pi) triple
	"""
	cluster, target_m = pair
	# get emission distribution B = [(mu, stddev), ...]
	B, labels, has_zero = smythEmissionDistribution((cluster, target_m))
	# also the number of clusters (created by k-means)
	m_prime = len(B)
	pi = [1.0/m_prime] * m_prime  # e.g. if m_prime = 4, pi = [0.25, 0.25, 0.25, 0.25]
	# Unlike Example #2, always floor the standard deviations at EPSILON up
	# front, so Baum-Welch never sees a zero-variance state.
	B = map(lambda b: (b[0], max(b[1], EPSILON)), B)
	# m_prime x m_prime matrix filled with 1.0/m_prime -> each row sums to 1
	A = uniformMatrix(m_prime, m_prime, 1.0/m_prime)
	hmm = tripleToHMM((A, B, pi))
	hmm.baumWelch(toSequenceSet(cluster))
	A_p, B_p, pi_p = hmmToTriple(hmm)
	B_p = map(lambda b: (b[0], max(b[1], EPSILON)), B_p)
	validateTriple((A_p, B_p, pi_p))
	return (A_p, B_p, pi_p)
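validateTriple is also defined elsewhere in the project; a plausible sketch of the checks it might perform (an assumption, not the actual implementation):

def validateTriple(triple):
	# Hypothetical sketch: accept the triple only if A is row-stochastic,
	# pi sums to 1, and every emission stddev is strictly positive.
	A, B, pi = triple
	n = len(pi)
	for i in xrange(n):
		if abs(sum(A[i, j] for j in xrange(n)) - 1.0) > 1e-6:
			return False
	if abs(sum(pi) - 1.0) > 1e-6:
		return False
	return all(stddev > 0 for (mu, stddev) in B)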
Example #5
import sys
import cPickle
from scipy import stats

def conf_inter(sigma, n):
	# Full width of a two-sided 90% Student's t interval (n-1 dof).
	return 2*stats.t.isf([.05], n-1)[0]*sigma

def get_diff(i, means):
	# Percent change of means[i] relative to means[i-1].
	return 100*(means[i]-means[i-1])/means[i-1]
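# For example (hypothetical numbers): with n = 10 trials, the t critical
# value is ~1.833, so conf_inter(0.8, 10) ~= 2 * 1.833 * 0.8 ~= 2.93, and
# get_diff(1, [-120.0, -110.0]) ~= -8.33 (percent change between means).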

if __name__ == "__main__":
	results_path = sys.argv[1]
	mode = sys.argv[2]
	with open(results_path) as results_file:
		results = cPickle.load(results_file)
		# each result row is a (k, target_m, rand_seed, likelihood) tuple
		ks = sorted(set(map(lambda r: r[0], results)))
		ms = sorted(set(map(lambda r: r[1], results)))
		n_trials = len(set(map(lambda r: r[2], results)))
		km_means, km_trials = {}, {}
		sfc_zs = uniformMatrix(len(ms), len(ks))
		best_mean = float("-inf")
		global_best = float("-inf")
		best_m, best_k = None, None
		best_m_means, best_m_confs = [], []
		k_diffs = []
		for k in ks:
			for m in ms:
				km_means[(k, m)] = 0
				km_trials[(k, m)] = []
		for k, target_m, rand_seed, likelihood in results:
			km_means[(k, target_m)] += likelihood/n_trials
			km_trials[(k, target_m)].append(likelihood)
		for k in ks:
			for m in ms:
				mean_l = km_means[(k, m)]