def learn_likelihoods_progress(i, n, m, A, B, pi, F, X_train, nEmissionDim, g_mu, g_sig, nState):
    """Gaussian-window statistics of log-likelihood and state posterior.

    For each training sequence, evaluates the HMM on growing prefixes and
    weights each prefix length k by a Gaussian pdf centered at g_mu, then
    averages over the n sequences.

    Args:
        i: job index (returned unchanged so parallel callers can sort results)
        n: number of training sequences
        m: number of time steps per sequence
        A, B, pi: HMM transition matrix, emission parameters, initial probs
        F: ghmm emission domain (e.g. ghmm.Float())
        X_train: training data; X_train[j] is a flattened observation sequence
        nEmissionDim: number of observation dimensions per time step
        g_mu, g_sig: mean/std of the Gaussian window over time indices
        nState: number of hidden states

    Returns:
        (i, mean state posterior, mean log-likelihood, std of log-likelihood)
    """
    # Rebuild the HMM from its matrices; multivariate emission when dim >= 2.
    if nEmissionDim >= 2:
        ml = ghmm.HMMFromMatrices(F, ghmm.MultivariateGaussianDistribution(F), A, B, pi)
    else:
        ml = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

    l_likelihood_mean = 0.0
    l_likelihood_mean2 = 0.0  # second moment; used for the std at the end
    l_statePosterior = np.zeros(nState)

    for j in xrange(n):
        # Per-sequence weighted accumulators.
        g_post = np.zeros(nState)
        g_lhood = 0.0
        g_lhood2 = 0.0
        prop_sum = 0.0

        for k in xrange(1, m):
            # Prefix of the j-th sequence: first k (flattened) observations.
            final_ts_obj = ghmm.EmissionSequence(F, X_train[j][:k*nEmissionDim])
            logp = ml.loglikelihoods(final_ts_obj)[0]
            # print 'Log likelihood:', logp
            post = np.array(ml.posterior(final_ts_obj))

            # Gaussian weight of time index k around g_mu.
            k_prop = norm(loc=g_mu, scale=g_sig).pdf(k)
            g_post += post[k-1] * k_prop  # posterior at the last prefix step
            g_lhood += logp * k_prop
            g_lhood2 += logp * logp * k_prop

            prop_sum += k_prop

        # Normalize by total Gaussian weight, then average over n sequences.
        l_statePosterior += g_post / prop_sum / float(n)
        l_likelihood_mean += g_lhood / prop_sum / float(n)
        l_likelihood_mean2 += g_lhood2 / prop_sum / float(n)

    # std = sqrt(E[logp^2] - E[logp]^2)
    return i, l_statePosterior, l_likelihood_mean, np.sqrt(l_likelihood_mean2 - l_likelihood_mean**2)
def set_hmm_object(self, A, B, pi, out_a_num=None, vec_num=None, mat_num=None, u_denom=None):
    """Set HMM's hyper parameters.

    Rebuilds self.ml from (A, B, pi) and caches the matrices on self.

    Args:
        A: state-transition matrix
        B: per-state emission parameters
        pi: initial state probabilities
        out_a_num, vec_num, mat_num, u_denom: optional Baum-Welch
            accumulators, forwarded when the patched ghmm supports them.

    Returns:
        The rebuilt ghmm model object (also stored in self.ml).
    """
    # 1-D data uses a scalar Gaussian emission; otherwise multivariate.
    if self.nEmissionDim == 1:
        self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), \
                                       A, B, pi)
    else:
        self.ml = ghmm.HMMFromMatrices(self.F, ghmm.MultivariateGaussianDistribution(self.F), \
                                       A, B, pi)
    self.A = A
    self.B = B
    self.pi = pi

    # setBaumWelchParams only exists in the custom ghmm build; degrade
    # gracefully on the stock library.
    try:
        self.ml.setBaumWelchParams(out_a_num, vec_num, mat_num, u_denom)
    except:
        print "Install Daehyung's custom ghmm if you want partial fit functionalities."

    return self.ml
def __init__(self, preprocess_args, metric, graph_structure_type, A, B, pi, win_len, thresh, min_peak_dist):
    """Store configuration and build the ghmm emission domain/distribution.

    Args:
        preprocess_args: preprocessing configuration, stored as-is
        metric: metric name, stored as-is
        graph_structure_type: "predefined", "fully", "left_to_right"
        A: initial hidden states graph
        B: initial hidden states distribution
        pi: initial hidden states probabilities
        win_len: windows lengths of the sliding window offline
        thresh: in the peak detection, detect peaks that are greater than
            thresh
        min_peak_dist: in the peak detection, detect peaks that are at
            least separated by minimum peak distance
    """
    self.preprocess_args = preprocess_args
    self.metric = metric
    self.graph_structure_type = graph_structure_type
    self.A = A
    self.B = B
    self.pi = pi
    self.win_len = win_len
    self.thresh = thresh
    self.min_peak_dist = min_peak_dist
    # Scalar (1-D) Gaussian emissions over a continuous domain.
    self.emission_domain = ghmm.Float()
    self.emission_distr = ghmm.GaussianDistribution(self.emission_domain)
def newModel(states, randomize = True, startAtFirstState = False, \
             feedForward = True):
    """Make a new random ghmm model with `states` hidden states and 1-D
    Gaussian emissions ([mean, std] per state).

    Args:
        states: number of hidden states
        randomize: fill the transition and emission matrices with random
            values (plus a 0.01 floor added to every entry)
        startAtFirstState: force pi to put all mass on state 0 instead of
            a uniform initial distribution
        feedForward: force a strict left-to-right chain (a[i][i+1] = 1,
            all other transitions 0)
    """
    pi = [1.0 / states] * states
    if startAtFirstState:
        pi = [0] * states
        pi[0] = 1
    aMat = numpy.zeros((states, states), float)
    bMat = numpy.zeros((states, 2), float)
    if randomize:
        for i in range(states):
            for j in range(states):
                # NOTE(review): when feedForward is set, this random draw is
                # immediately overwritten below (0 or 1).
                aMat[i][j] = random.random()
                if feedForward and (j != i + 1):
                    aMat[i][j] = 0
                if feedForward and (j == i + 1):
                    aMat[i][j] = 1
            for j in range(2):
                bMat[i][j] = random.random()
    # Keep every entry strictly positive.
    aMat += 0.01
    bMat += 0.01
    m = ghmm.HMMFromMatrices(ghmm.Float(), \
                             ghmm.GaussianDistribution(ghmm.Float()), \
                             aMat, bMat, pi)
    return m
def create_model(self, flag, number_states):
    """Build a 1-D Gaussian-emission HMM from the A/B/pi parameters
    derived for the requested number of states and flag."""
    trans, emis, init = self.calculate_A_B_pi(number_states, flag)
    # Scalar Gaussian emissions over the stored domain self.F.
    return ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F),
                                trans, emis, init)
def fit(self, X_train, A=None, B=None, pi=None, B_dict=None, verbose=False):
    """Train a 1-D Gaussian HMM on X_train with Baum-Welch and precompute
    the progress-HMM (PHMM) state statistics.

    Args:
        X_train: array-like of training sequences (samples x length)
        A, B, pi: optional initial matrices; generated when None
        B_dict: unused here (accepted for interface compatibility)
        verbose: print progress messages

    Side effects: sets self.ml, self.A, self.B, self.mu, self.sig,
    self.state_range and the mu_z*/var_z/sig_z3 PHMM arrays.
    """
    if A is None:
        if verbose: print "Generate new A matrix"
        # Transition probability matrix (Initial transition probability, TODO?)
        A = self.init_trans_mat(self.nState).tolist()

    if B is None:
        if verbose: print "Generate new B matrix"
        # We should think about multivariate Gaussian pdf.
        self.mu, self.sig = self.vectors_to_mean_sigma(X_train, self.nState)

        # Emission probability matrix
        B = np.hstack([self.mu, self.sig]).tolist() # Must be [i,:] = [mu, sig]

    if pi is None:
        # pi - initial probabilities per state: all mass on state 0.
        ## pi = [1.0/float(self.nState)] * self.nState
        pi = [0.] * self.nState
        pi[0] = 1.0

    # HMM model object
    self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), A, B, pi)
    ## print "Run Baum Welch method with (samples, length)", X_train.shape

    train_seq = X_train.tolist()
    final_seq = ghmm.SequenceSet(self.F, train_seq)
    # Baum-Welch with an iteration cap of 10000.
    self.ml.baumWelch(final_seq, 10000)

    [self.A,self.B,self.pi] = self.ml.asMatrices()
    self.A = np.array(self.A)
    self.B = np.array(self.B)

    ## self.mean_path_plot(mu[:,0], sigma[:,0])
    ## print "Completed to fitting", np.array(final_seq).shape

    # state range
    self.state_range = np.arange(0, self.nState, 1)

    # Pre-computation for PHMM variables: per-state expected next state
    # (mu_z), its square, and its variance under the transition row.
    self.mu_z = np.zeros((self.nState))
    self.mu_z2 = np.zeros((self.nState))
    self.mu_z3 = np.zeros((self.nState))
    self.var_z = np.zeros((self.nState))
    self.sig_z3 = np.zeros((self.nState))
    for i in xrange(self.nState):
        zp = self.A[i,:]*self.state_range
        self.mu_z[i] = np.sum(zp)
        self.mu_z2[i] = self.mu_z[i]**2
        #self.mu_z3[i] = self.mu_z[i]**3
        # var = E[z^2] - E[z]^2 over the i-th transition row.
        self.var_z[i] = np.sum(zp*self.state_range) - self.mu_z[i]**2
def computeLikelihood(idx, A, B, pi, F, X, nEmissionDim, nState, startIdx=1, \
                      bPosterior=False, converted_X=False, cov_type='full'):
    '''
    This function will be deprecated. Please, use computeLikelihoods.

    Evaluates the log-likelihood (and optionally the state posterior) of
    progressively longer prefixes of a single sequence X under the HMM
    described by (A, B, pi).

    Returns (idx, prefix indices, log-likelihoods[, posteriors]).
    '''
    # Rebuild the model; multivariate emission when dim >= 2.
    if nEmissionDim >= 2:
        ml = ghmm.HMMFromMatrices(F, ghmm.MultivariateGaussianDistribution(F), A, B, pi)
        if cov_type == 'diag' or cov_type.find('diag') >= 0:
            ml.setDiagonalCovariance(1)
    else:
        ml = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

    # Flatten the input unless the caller already converted it.
    if converted_X is False:
        X_test = util.convert_sequence(X, emission=False)
        X_test = np.squeeze(X_test)
        X_test = X_test.tolist()
    else:
        X_test = X

    l_idx = []
    l_likelihood = []
    l_posterior = []

    for i in xrange(startIdx, len(X_test) / nEmissionDim):
        # Prefix of length i (flattened).
        final_ts_obj = ghmm.EmissionSequence(F, X_test[:i * nEmissionDim])

        try:
            logp = ml.loglikelihood(final_ts_obj)
            if bPosterior: post = np.array(ml.posterior(final_ts_obj))
        except:
            # GHMM underflow on very unlikely prefixes: substitute a large
            # negative log-likelihood and carry the previous posterior
            # (or pi at the very start), then keep going.
            print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"
            l_idx.append(i)
            l_likelihood.append(-100000000)
            if bPosterior:
                if len(l_posterior) == 0: l_posterior.append(list(pi))
                else: l_posterior.append(l_posterior[-1])
            ## return False, False # anomaly
            continue

        l_idx.append(i)
        l_likelihood.append(logp)
        if bPosterior: l_posterior.append(post[i - 1])

    if bPosterior:
        return idx, l_idx, l_likelihood, l_posterior
    else:
        return idx, l_idx, l_likelihood
def ghmm_from_gaussian_hmm(hmm):
    """Convert a project Gaussian HMM object into an equivalent ghmm model.

    Reads the transition matrix, initial probabilities and per-state
    (mean, variance) pairs and feeds them to ghmm.HMMFromMatrices.
    """
    # Work on a copy so the caller's object stays untouched.
    src = deepcopy(hmm)
    domain = ghmm.Float()
    emissions = [[float(d.mean), float(d.variance)]
                 for d in src.emissionDistributions]
    # print init
    # print trans
    # print emissions
    return ghmm.HMMFromMatrices(emissionDomain=domain,
                                distribution=ghmm.GaussianDistribution(domain),
                                A=src.transitionMatrix.tolist(),
                                B=emissions,
                                pi=src.initialProbabilities.tolist())
def reset(self):
    """Reset the HMM object.

    Rebuilds self.ml from its own current matrices and refreshes the
    cached A/B/pi copies on self.
    """
    A, B, pi = self.ml.asMatrices()
    # Pick the emission distribution matching the data dimensionality.
    if self.nEmissionDim == 1:
        dist = ghmm.GaussianDistribution(self.F)
    else:
        dist = ghmm.MultivariateGaussianDistribution(self.F)
    self.ml = ghmm.HMMFromMatrices(self.F, dist, A, B, pi)
    self.A = A
    self.B = B
    self.pi = pi
def predict_from_single_seq(self, x, ref_num): ''' Input @ x: length #samples x known steps Output @ observation distribution: nDimension ''' # new emission for partial sequence B = [] for i in xrange(self.nState): B.append([ self.B[i][0][ref_num], self.B[i][1][ref_num * self.nEmissionDim + ref_num] ]) ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), \ self.A, B, self.pi) if type(x) is not list: x = x.tolist() final_ts_obj = ghmm.EmissionSequence(self.F, x) try: (alpha, scale) = ml.forward(final_ts_obj) except: print "No alpha is available !!" sys.exit() x_pred = [] for i in xrange(self.nEmissionDim): if i == ref_num: x_pred.append(x[-1]) else: src_cov_idx = ref_num * self.nEmissionDim + ref_num tgt_cov_idx = ref_num * self.nEmissionDim + i t_o = 0.0 for j in xrange(self.nState): m_j = self.B[j][0][i] + \ self.B[j][1][tgt_cov_idx]/self.B[j][1][src_cov_idx]*\ (x[-1]-self.B[j][0][ref_num]) t_o += alpha[-1][j] * m_j x_pred.append(t_o) return x_pred
def computeLikelihoods(idx, A, B, pi, F, X, nEmissionDim, nState, startIdx=2, \
                       bPosterior=False, converted_X=False, cov_type='full'):
    '''
    Compute log-likelihoods (and optionally posteriors) of growing
    prefixes of a single multi-dimensional sequence.

    Input:
    - X: dimension x length
    Returns (idx, prefix indices, log-likelihoods[, posteriors]).
    '''
    if nEmissionDim >= 2:
        ml = ghmm.HMMFromMatrices(F, ghmm.MultivariateGaussianDistribution(F), A, B, pi)
        if cov_type == 'diag': ml.setDiagonalCovariance(1)
    else:
        ml = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

    # NOTE(review): converted_X is accepted but ignored here -- the sequence
    # is always re-converted (unlike computeLikelihood). Confirm intent.
    X_test = util.convert_sequence(X, emission=False)
    X_test = np.squeeze(X_test)

    l_idx = []
    l_likelihood = []
    l_posterior = []

    for i in xrange(startIdx, len(X[0])):
        final_ts_obj = ghmm.EmissionSequence( F, X_test[:i * nEmissionDim].tolist())

        try:
            logp = ml.loglikelihood(final_ts_obj)
            if bPosterior:
                post = np.array(ml.posterior(final_ts_obj))
            l_likelihood.append(logp)
            if bPosterior: l_posterior.append(post[i - 1])
        except:
            # Underflow: substitute a large negative value and repeat the
            # previous posterior (or pi at the very start).
            print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"
            ## return False, False # anomaly
            ## continue # we keep the state as the previous one
            l_likelihood.append(-1000000000000)
            if bPosterior:
                if len(l_posterior) == 0: l_posterior.append(list(pi))
                else: l_posterior.append(l_posterior[-1])

        l_idx.append(i)

    if bPosterior:
        return idx, l_idx, l_likelihood, l_posterior
    else:
        return idx, l_idx, l_likelihood
def computeLikelihood(F, k, data, g_mu, g_sig, nEmissionDim, A, B, pi):
    """Single-prefix contribution to the Gaussian-windowed statistics.

    Evaluates the HMM on `data` (a prefix ending at time k) and weights
    the log-likelihood and the posterior at step k by a Gaussian pdf of k
    centered at g_mu with std g_sig.

    Returns (weighted posterior, weighted logp, weighted logp^2, weight).
    """
    # Choose the emission distribution by data dimensionality.
    if nEmissionDim >= 2:
        dist = ghmm.MultivariateGaussianDistribution(F)
    else:
        dist = ghmm.GaussianDistribution(F)
    hmm_ml = ghmm.HMMFromMatrices(F, dist, A, B, pi)

    seq = ghmm.EmissionSequence(F, data)
    logp = hmm_ml.loglikelihoods(seq)[0]
    post = np.array(hmm_ml.posterior(seq))

    # Gaussian weight of time index k around g_mu.
    weight = norm(loc=g_mu, scale=g_sig).pdf(k)

    # print np.shape(g_post), np.shape(g_lhood), np.shape(g_lhood2), np.shape(prop_sum)
    return post[k-1] * weight, logp * weight, logp * logp * weight, weight
def conditional_prob2(self, x):
    '''
    Input
    @ x: dim x length
    Output
    @ A list of conditional probabilities P(x_t|lambda)

    Only single sample works
    '''
    from scipy.stats import norm, entropy

    # feature-wise conditional probability: score each dimension with a
    # 1-D HMM built from that dimension's marginal parameters.
    cond_prob = []
    for i in xrange(self.nEmissionDim):
        # per feature: dim i's mean and its diagonal covariance entry
        # for every state.
        B = [0] * self.nState
        for j in xrange(self.nState):
            B[j] = [ self.B[j][0][i], self.B[j][1][i * self.nEmissionDim + i] ]

        ml_src = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), \
                                      self.A, B, self.pi)
        X_test = util.convert_sequence2(x[[i]], emission=False)
        X_test = np.squeeze(X_test)
        final_ts_obj = ghmm.EmissionSequence(self.F, X_test.tolist())
        logp = ml_src.loglikelihood(final_ts_obj)
        cond_prob.append(logp)

    ## # all
    ## X_test = util.convert_sequence2(x, emission=False)
    ## X_test = np.squeeze(X_test)
    ## final_ts_obj = ghmm.EmissionSequence(self.F, X_test.tolist())
    ## cond_prob.append( self.ml.loglikelihood(final_ts_obj) )

    # min-max normalization
    # NOTE(review): despite the comment above, no normalization is applied
    # here -- raw log-likelihoods are returned as a numpy array.
    cond_prob = np.array(cond_prob)

    return cond_prob
def _randomModels(k, states):
    """Make a set of k random models.

    These models are untrained with initial random values for all model
    matricies.
    """
    f = ghmm.Float()
    # NOTE(review): pi sums to 1 only when states == 10; presumably ghmm
    # normalizes the initial distribution -- confirm.
    pi = [0.1] * states
    aMat = numpy.zeros((states, states), float)
    bMat = numpy.zeros((states, 2), float)
    #TODO Change above for multivariate Gaussians
    models = []
    for n in range(k):
        # aMat/bMat are refilled in place for every model; HMMFromMatrices
        # presumably copies the values out -- confirm.
        for i in range(states):
            for j in range(states):
                aMat[i][j] = random.random()
            for j in range(2):
                bMat[i][j] = random.random()  # [mean, std] per state
        m = ghmm.HMMFromMatrices(f, ghmm.GaussianDistribution(f), \
                                 aMat, bMat, pi)
        models.append(m)
    return models
def obsToModel(observation, std=0.1):
    """Makes a model from a single observation vector.

    One state per sample: state i emits a Gaussian centered on
    observation[i] with the given std; transitions form a noisy
    left-to-right chain (strong i -> i+1 links).
    """
    aMat = numpy.zeros((len(observation), len(observation)), float)
    bMat = numpy.zeros((len(observation), 2), float)
    # NOTE(review): pi does not sum to 1 ([0.05, ...] with pi[0] = 1.0);
    # m.normalize() below presumably fixes this -- confirm.
    pi = [0.05] * len(observation)
    pi[0] = 1.0
    for i in range(len(observation)):
        bMat[i][0] = observation[i]  # emission mean
        bMat[i][1] = std             # emission std
        for j in range(len(observation)):
            aMat[i][j] = random.random() * 0.3   # weak random transitions
            if j == i + 1:
                aMat[i][j] = 0.9                 # strong next-state link
    m = ghmm.HMMFromMatrices(ghmm.Float(), \
                             ghmm.GaussianDistribution(ghmm.Float()), \
                             aMat, bMat, pi)
    m.normalize()
    return m
import ghmm
import numpy
import random

# Demo/driver: build one random 5-state HMM with 1-D Gaussian emissions
# ([mean, std] per state).
numModels = 1
states = 5
obs = 2  # emission parameters per state: mean and std

f = ghmm.Float()
# NOTE(review): pi sums to 0.5 here; ghmm presumably renormalizes -- confirm.
pi = [0.1] * states
aMat = numpy.zeros((states, states), float)
bMat = numpy.zeros((states, obs), float)
for i in range(states):
    for j in range(states):
        aMat[i][j] = random.random()
    for j in range(obs):
        bMat[i][j] = random.random()
# Pin the first state's emission to a fixed mean/std.
bMat[0][0] = 5
bMat[0][1] = 3
model = ghmm.HMMFromMatrices(f, ghmm.GaussianDistribution(f), aMat, bMat, pi)
def _kMeans(data, k, states, iterations = 20, stopThreshold = 0.01, \
            rOutliers = True, printBest = True, verbose = True, \
            iType = "kmeans++"):
    """k-means-style clustering of sequences using HMMs as cluster centers.

    Alternates training one HMM per cluster and reassigning sequences to
    the best-scoring model, keeping the best configuration seen; restarts
    from a random assignment when the score stops improving.

    Args:
        data: sequences to cluster
        k: number of clusters/models
        states: hidden states per model
        iterations: number of train/assign rounds
        stopThreshold: minimum score gain to keep refining (else restart)
        rOutliers: remove outliers during refinement and try to re-include
            them at the end
        printBest, verbose: console reporting
        iType: "random" or "kmeans++" initialization

    Returns:
        (list of pybb Hmm wrappers, clustered data, outliers)
    """
    # -100 is used as an "unset" sentinel for scores throughout.
    bestScore = -100
    bestModels = None
    bestData = None
    oldScore = -100
    models = []
    tdata = _randomAssign(data, k)
    if iType == "random":
        models = _randomModels(k, states)
        models = _trainModels(tdata, models)
    if iType == "kmeans++":
        models = _initializeGoodModels(data, k, states)
        tdata = _optimalAssign(tdata, models)
    outliers = []
    for i in range(iterations):
        models = _trainModels(tdata, models)
        score = _fitness(tdata, models)
        if verbose: print " " + str(i) + ": " + str(score)
        if (score > bestScore) or (bestScore == -100):
            bestScore = score
            # Snapshot models by rebuilding them from their matrices
            # (deep copy), and snapshot the assignment and outliers.
            bestModels = list(ghmm.HMMFromMatrices(ghmm.Float(), \
                                                   ghmm.GaussianDistribution(ghmm.Float()), \
                                                   m.asMatrices()[0], \
                                                   m.asMatrices()[1], \
                                                   m.asMatrices()[2]) for m in models)
            bestData = list(list(v) for v in tdata)
            bestOutliers = list(outliers)
        if (oldScore == -100) or (score - oldScore) > stopThreshold:
            # Still improving: reassign and optionally drop outliers.
            tdata = _optimalAssign(tdata, models)
            oldScore = score
            if rOutliers:
                _removeOutliers(models, tdata)
        else:
            # Converged/stalled: restart from a fresh random assignment.
            if verbose: print "Resetting all"
            tdata = _randomAssign(data, k)
            if iType == "random":
                models = _randomModels(k, states)
                models = _trainModels(tdata, models)
            if iType == "kmeans++":
                models = _initializeGoodModels(data, k, states)
                tdata = _optimalAssign(tdata, models)
            oldScore = -100
    if printBest or verbose:
        print "Average inter-cluster distance:" + str(bestScore)
    if rOutliers:
        if verbose: print "Number outliers found:" + str(len(bestOutliers))
        #For the best set of models and train data try to include any outliers
        #again. Then return the models, data and outliers.
        bestData, bestOutliers = _includeOutliers(bestModels, bestData, bestOutliers)
        bestModels = _trainModels(bestData, bestModels)
        bestData = _optimalAssign(bestData, bestModels)
        score = _fitness(bestData, bestModels)
        if printBest or verbose:
            print "Score with additional outliers:" + str(score)
        if verbose:
            print "New number of outliers:" + str(len(bestOutliers))
    # Wrap the raw ghmm models in the project's Hmm class.
    import pybb.model.hmm
    bm = []
    for m in bestModels:
        bm.append(pybb.model.hmm.Hmm(m))
    return bm, bestData, bestOutliers
def fit(self, xData1, A=None, B=None, pi=None, cov_mult=[1.0]*1, verbose=False, \ ml_pkl='ml_temp_1d.pkl', use_pkl=False): ml_pkl = os.path.join(os.path.dirname(__file__), ml_pkl) X1 = np.array(xData1) if A is None: if verbose: print "Generating a new A matrix" # Transition probability matrix (Initial transition probability, TODO?) A = self.init_trans_mat(self.nState).tolist() # print 'A', A if B is None: if verbose: print "Generating a new B matrix" # We should think about multivariate Gaussian pdf. mu, sig = self.vectors_to_mean_sigma(X1, self.nState) B = np.vstack([mu, sig * cov_mult[0] ]).T.tolist() # Must be [i,:] = [mu, sig] if pi is None: # pi - initial probabilities per state ## pi = [1.0/float(self.nState)] * self.nState pi = [0.0] * self.nState pi[0] = 1.0 # print 'Generating HMM' # HMM model object self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), A, B, pi) X_train = X1.tolist() print 'Run Baum Welch method with (samples, length)', np.shape(X_train) final_seq = ghmm.SequenceSet(self.F, X_train) ## ret = self.ml.baumWelch(final_seq, loglikelihoodCutoff=2.0) ret = self.ml.baumWelch(final_seq, 10000) print 'Baum Welch return:', ret if np.isnan(ret): return 'Failure' [self.A, self.B, self.pi] = self.ml.asMatrices() self.A = np.array(self.A) self.B = np.array(self.B) #--------------- learning for anomaly detection ---------------------------- [A, B, pi] = self.ml.asMatrices() n, m = np.shape(X1) self.nGaussian = self.nState # Get average loglikelihood threshold wrt progress self.std_coff = 1.0 g_mu_list = np.linspace(0, m - 1, self.nGaussian) #, dtype=np.dtype(np.int16)) g_sig = float(m) / float(self.nGaussian) * self.std_coff # print 'g_mu_list:', g_mu_list # print 'g_sig:', g_sig ###################################################################################### if os.path.isfile(ml_pkl) and use_pkl: with open(ml_pkl, 'rb') as f: d = pickle.load(f) self.l_statePosterior = d[ 'state_post'] # time x state division self.ll_mu = 
d['ll_mu'] self.ll_std = d['ll_std'] else: if self.cluster_type == 'time': print 'Begining parallel job' r = Parallel(n_jobs=-1)(delayed(learn_likelihoods_progress)( i, n, m, A, B, pi, self.F, X_train, self.nEmissionDim, g_mu_list[i], g_sig, self.nState) for i in xrange(self.nGaussian)) # r = [self.learn_likelihoods_progress_par(i, n, m, A, B, pi, X_train, g_mu_list[i], g_sig) for i in xrange(self.nGaussian)] print 'Completed parallel job' l_i, self.l_statePosterior, self.ll_mu, self.ll_std = zip(*r) elif self.cluster_type == 'state': self.km = None self.ll_mu = None self.ll_std = None self.ll_mu, self.ll_std = self.state_clustering(X1) path_mat = np.zeros((self.nState, m * n)) likelihood_mat = np.zeros((1, m * n)) self.l_statePosterior = None d = dict() d['state_post'] = self.l_statePosterior d['ll_mu'] = self.ll_mu d['ll_std'] = self.ll_std ut.save_pickle(d, ml_pkl)
def fit(self, xData, A=None, B=None, pi=None, cov_mult=None, ml_pkl=None, use_pkl=False, cov_type='full', fixed_trans=0,\
        shuffle=False):
    '''
    Train (or load) a multi-dimensional Gaussian HMM with Baum-Welch.

    Input :
    - xData: dimension x sample x length
    Issues:
    - If NaN is returned, the reason can be one of followings,
    -- lower cov
    -- small range of xData (you have to scale it up.)

    Returns True when loaded from pickle, 'Failure' on NaN, otherwise the
    per-sample Baum-Welch score.
    '''
    # Daehyung: What is the shape and type of input data?
    if shuffle:
        # Shuffle along the sample axis, keeping dimensions aligned.
        X = xData
        X = np.swapaxes(X, 0, 1)
        id_list = range(len(X))
        random.shuffle(id_list)
        X = np.array(X)[id_list]
        X = np.swapaxes(X, 0, 1)
    else:
        X = [np.array(data) for data in xData]
    nData = len(xData[0])

    param_dict = {}

    # Load pre-trained HMM without training
    if use_pkl and ml_pkl is not None and os.path.isfile(ml_pkl):
        if self.verbose: print "Load HMM parameters without train the hmm"
        param_dict = ut.load_pickle(ml_pkl)
        self.A = param_dict['A']
        self.B = param_dict['B']
        self.pi = param_dict['pi']
        if self.nEmissionDim == 1:
            self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), \
                                           self.A, self.B, self.pi)
        else:
            self.ml = ghmm.HMMFromMatrices(self.F, ghmm.MultivariateGaussianDistribution(self.F), \
                                           self.A, self.B, self.pi)

        # Restore Baum-Welch accumulators when the pickle has them
        # (requires the custom ghmm build).
        out_a_num = param_dict.get('out_a_num', None)
        vec_num = param_dict.get('vec_num', None)
        mat_num = param_dict.get('mat_num', None)
        u_denom = param_dict.get('u_denom', None)
        if out_a_num is not None:
            self.ml.setBaumWelchParams(out_a_num, vec_num, mat_num, u_denom)

        return True
    else:
        if ml_pkl is None:
            ml_pkl = os.path.join(os.path.dirname(__file__), 'ml_temp_n.pkl')

        if cov_mult is None:
            cov_mult = [1.0] * (self.nEmissionDim**2)

        if A is None:
            if self.verbose: print "Generating a new A matrix"
            # Transition probability matrix (Initial transition probability, TODO?)
            A = util.init_trans_mat(self.nState).tolist()

        if B is None:
            if self.verbose: print "Generating a new B matrix"
            # We should think about multivariate Gaussian pdf.
            mus, cov = util.vectors_to_mean_cov(X, self.nState, self.nEmissionDim, cov_type=cov_type)
            ## print np.shape(mus), np.shape(cov)

            # cov: state x dim x dim -- scale each covariance entry.
            for i in xrange(self.nEmissionDim):
                for j in xrange(self.nEmissionDim):
                    cov[:, i, j] *= cov_mult[self.nEmissionDim * i + j]

            if self.verbose:
                for i, mu in enumerate(mus):
                    print 'mu%i' % i, mu
                ## print 'cov', cov

            # Emission probability matrix: per state, [means] + flattened
            # covariance for multivariate, [mean, var] for 1-D.
            B = [0] * self.nState
            for i in range(self.nState):
                if self.nEmissionDim > 1:
                    B[i] = [[mu[i] for mu in mus]]
                    B[i].append(cov[i].flatten().tolist())
                else:
                    B[i] = [np.squeeze(mus[0][i]), float(cov[i])]
        if pi is None:
            # pi - initial probabilities per state: all mass on state 0.
            ## pi = [1.0/float(self.nState)] * self.nState
            pi = [0.0] * self.nState
            pi[0] = 1.0

        # print 'Generating HMM'
        # HMM model object
        if self.nEmissionDim == 1:
            self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), \
                                           A, B, pi)
        else:
            self.ml = ghmm.HMMFromMatrices(self.F, ghmm.MultivariateGaussianDistribution(self.F), \
                                           A, B, pi)
        if cov_type == 'diag': self.ml.setDiagonalCovariance(1)

        # print 'Creating Training Data'
        X_train = util.convert_sequence(X) # Training input
        X_train = X_train.tolist()
        if self.verbose: print "training data size: ", np.shape(X_train)

        if self.verbose: print 'Run Baum Welch method with (samples, length)', np.shape( X_train)
        final_seq = ghmm.SequenceSet(self.F, X_train)
        ## ret = self.ml.baumWelch(final_seq, loglikelihoodCutoff=2.0)
        ret = self.ml.baumWelch(final_seq, 10000) #, fixedTrans=fixed_trans)
        if np.isnan(ret):
            print 'Baum Welch return:', ret
            return 'Failure'
        print 'Baum Welch return:', ret / float(nData)

        [self.A, self.B, self.pi] = self.ml.asMatrices()
        self.A = np.array(self.A)
        self.B = np.array(self.B)

        param_dict['A'] = self.A
        param_dict['B'] = self.B
        param_dict['pi'] = self.pi

        # Persist Baum-Welch accumulators when supported by this ghmm build.
        try:
            [out_a_num, vec_num, mat_num, u_denom] = self.ml.getBaumWelchParams()
            param_dict['out_a_num'] = out_a_num
            param_dict['vec_num'] = vec_num
            param_dict['mat_num'] = mat_num
            param_dict['u_denom'] = u_denom
        except:
            print "Install new ghmm!!"

        if ml_pkl is not None: ut.save_pickle(param_dict, ml_pkl)
        return ret / float(nData)
# state 4: definitive-non-T state if params['mixed_model'] == True: Emissionmatrix = [[[params['bound'] * 100.0, 0.0], [1.0, 1.0], [1.0, 0.0]], [[1.5, -1.0], [1.5, 1.5], [0.95, 0.05]], [[1.5, -1.0], [1.5, 1.5], [0.75, 0.25]], [[1.5, -1.0], [1.5, 1.5], [0.5, 0.5]], [[1.5, -1.0], [1.5, 1.5], [0.25, 0.75]]] # [p1_mean, p2,mean], [p1_std, p2_std], [P(p1), P(p2)] model = ghmm.HMMFromMatrices(F, ghmm.GaussianMixtureDistribution(F), Transitionmatrix, Emissionmatrix, pi) else: Emissionmatrix = [[params['bound'] * 100.0, 1.0], [2.0, 0.5], [1.0, 0.5], [-1.0, 0.5], [-2.0, 0.5]] # [mean, std] model = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), Transitionmatrix, Emissionmatrix, pi) print('Model before training:') print(model) mghmm_train = ghmm.SequenceSet(F, train_set) model.baumWelch(mghmm_train, 10000, 0.01) print('Model after training:') print(model) model.write(out_hmm) ###------------------------------------------------ ###------------------------------------------------ # calculate tail length using the mghmm model and write them to output files # dict_tl structure: {gene_name : [list of tail lengths]}