def predict(self, X):
    '''
    Input
    @ X: dimension x sample x length (each sample contains only the known
         steps observed so far)
    Output
    @ mu_l, cov_l: per-sample predicted observation mean and (flattened)
         covariance, as lists
    '''
    # sample x flattened length
    X_test = util.convert_sequence(X, emission=False)

    mu_l  = []
    cov_l = []

    for i in xrange(len(X_test)):
        # Past profile
        final_ts_obj = ghmm.EmissionSequence(self.F, X_test[i].tolist())

        try:
            # alpha: the forward variable, i.e. the probability of the
            # partial observation sequence ending in state j at time t.
            # shape: test_profile_length x number_of_hidden_states
            (alpha, scale) = self.ml.forward(final_ts_obj)
            alpha = np.array(alpha)
            scale = np.array(scale)
        except:
            print "No alpha is available !!"
            sys.exit()

        # Round off numerical noise in the forward variables
        f = lambda x: round(x, 12)
        for j in range(len(alpha)):
            alpha[j] = map(f, alpha[j])

        n     = len(X_test[i])
        t_mu  = np.zeros(self.nEmissionDim)
        t_cov = np.zeros(self.nEmissionDim * self.nEmissionDim)
        t_sum = 0.0
        for j in xrange(self.nState):
            # Probability mass of transitioning into state j at the next step
            total = np.sum(self.A[:, j] * alpha[n / self.nEmissionDim - 1, :])  # * scaling_factor
            [mu, cov] = self.B[j]

            t_mu  += np.array(mu) * total
            t_cov += np.array(cov) * (total ** 2)
            t_sum += total

        mu_l.append(t_mu.tolist())
        cov_l.append(t_cov.tolist())

    return mu_l, cov_l
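# Usage sketch (illustrative, not from the source): assuming `model` is an
# already-fit instance of this class with nEmissionDim = 2, predict() returns
# the expected next-observation distribution for each partial sample:
#
#   X_partial = np.random.normal(size=(2, 3, 20)).tolist()  # dim x sample x known steps
#   mu_l, cov_l = model.predict(X_partial)
#   print np.shape(mu_l), np.shape(cov_l)   # (3, 2) and (3, 4): mean and flattened cov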
def loglikelihoods(self, X, bPosterior=False, bIdx=False, startIdx=1):
    '''
    @ X: dimension x sample x length
    @ bPosterior: also return the posterior state distributions
    @ bIdx: also return the time indices
    return: the log-likelihoods over time (per single sample)
    '''
    # sample x flattened length
    X_test = util.convert_sequence(X, emission=False)

    return self.loglikelihoods_from_seqs(X_test,
                                         bPosterior=bPosterior,
                                         bIdx=bIdx,
                                         startIdx=startIdx)
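# A minimal sketch, assuming `model` has been fit and `X` follows the
# dimension x sample x length convention; the exact return tuple with
# bPosterior/bIdx enabled depends on loglikelihoods_from_seqs:
#
#   ll_logp = model.loglikelihoods(X)
#   ll_logp, ll_post = model.loglikelihoods(X, bPosterior=True)  # assumed ordering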
def computeLikelihood(idx, A, B, pi, F, X, nEmissionDim, nState, startIdx=1,
                      bPosterior=False, converted_X=False, cov_type='full'):
    '''
    This function is deprecated. Please use computeLikelihoods instead.
    '''
    if nEmissionDim >= 2:
        ml = ghmm.HMMFromMatrices(F, ghmm.MultivariateGaussianDistribution(F), A, B, pi)
        if cov_type.find('diag') >= 0: ml.setDiagonalCovariance(1)
    else:
        ml = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

    if converted_X is False:
        X_test = util.convert_sequence(X, emission=False)
        X_test = np.squeeze(X_test)
        X_test = X_test.tolist()
    else:
        X_test = X

    l_idx        = []
    l_likelihood = []
    l_posterior  = []

    for i in xrange(startIdx, len(X_test) / nEmissionDim):
        final_ts_obj = ghmm.EmissionSequence(F, X_test[:i * nEmissionDim])

        try:
            logp = ml.loglikelihood(final_ts_obj)
            if bPosterior: post = np.array(ml.posterior(final_ts_obj))
        except:
            print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"
            # Keep going with a floor likelihood; reuse the previous posterior
            # (or pi if none is available yet).
            l_idx.append(i)
            l_likelihood.append(-100000000)
            if bPosterior:
                if len(l_posterior) == 0: l_posterior.append(list(pi))
                else:                     l_posterior.append(l_posterior[-1])
            continue

        l_idx.append(i)
        l_likelihood.append(logp)
        if bPosterior: l_posterior.append(post[i - 1])

    if bPosterior:
        return idx, l_idx, l_likelihood, l_posterior
    else:
        return idx, l_idx, l_likelihood
def computeLikelihoods(idx, A, B, pi, F, X, nEmissionDim, nState, startIdx=2,
                       bPosterior=False, converted_X=False, cov_type='full'):
    '''
    Input:
    - X: dimension x length
    Returns the log-likelihood (and optionally the posterior) of every
    growing prefix of X, starting from startIdx.
    '''
    if nEmissionDim >= 2:
        ml = ghmm.HMMFromMatrices(F, ghmm.MultivariateGaussianDistribution(F), A, B, pi)
        if cov_type == 'diag': ml.setDiagonalCovariance(1)
    else:
        ml = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

    X_test = util.convert_sequence(X, emission=False)
    X_test = np.squeeze(X_test)

    l_idx        = []
    l_likelihood = []
    l_posterior  = []

    for i in xrange(startIdx, len(X[0])):
        final_ts_obj = ghmm.EmissionSequence(F, X_test[:i * nEmissionDim].tolist())

        try:
            logp = ml.loglikelihood(final_ts_obj)
            if bPosterior: post = np.array(ml.posterior(final_ts_obj))

            l_likelihood.append(logp)
            if bPosterior: l_posterior.append(post[i - 1])
        except:
            print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"
            # Keep the state as the previous one and floor the likelihood
            l_likelihood.append(-1000000000000)
            if bPosterior:
                if len(l_posterior) == 0: l_posterior.append(list(pi))
                else:                     l_posterior.append(l_posterior[-1])

        l_idx.append(i)

    if bPosterior:
        return idx, l_idx, l_likelihood, l_posterior
    else:
        return idx, l_idx, l_likelihood
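# Standalone usage sketch (names illustrative): because this function takes the
# raw HMM matrices rather than the class instance, it can be shipped to a
# worker process; here the matrices come from an already-fit `model`:
#
#   r = computeLikelihoods(0, model.A.tolist(), model.B.tolist(), model.pi,
#                          model.F, X_single,   # X_single: dimension x length
#                          model.nEmissionDim, model.nState, bPosterior=True)
#   idx, l_idx, l_logp, l_post = r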
def loglikelihood(self, X, bPosterior=False):
    '''
    @ X: dimension x length (a single sample)
    return: the log-likelihood of the whole sequence
            (and its posterior if bPosterior is True)
    '''
    X_test = util.convert_sequence(X, emission=False)
    X_test = np.squeeze(X_test)

    final_ts_obj = ghmm.EmissionSequence(self.F, X_test.tolist())

    try:
        logp = self.ml.loglikelihood(final_ts_obj)
        if bPosterior: post = np.array(self.ml.posterior(final_ts_obj))
    except:
        print 'Likelihood error!!!!'
        if bPosterior: return None, None
        return None

    if bPosterior: return logp, post
    return logp
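# Sketch: scoring one complete sequence against the fit model; a None return
# signals a GHMM failure (e.g. underflow) rather than a valid score:
#
#   logp = model.loglikelihood(X_single)
#   logp, post = model.loglikelihood(X_single, bPosterior=True)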
def partial_fit(self, xData, learningRate=0.2, nrSteps=1, max_iter=100):
    '''
    Online update of the HMM using the online Baum-Welch algorithm
    @ xData: dimension x sample x length
    '''
    X     = [np.array(data) for data in xData]
    nData = len(X[0])

    X_train = util.convert_sequence(X)  # training input
    X_train = X_train.tolist()
    if self.verbose:
        print 'Run Baum Welch method with (samples, length)', np.shape(X_train)

    # Clamp the learning rate away from zero
    if learningRate < 1e-5: learningRate = 1e-5

    final_seq = ghmm.SequenceSet(self.F, X_train)
    last_ret  = None
    for i in xrange(max_iter):
        ret = self.ml.baumWelch(final_seq, nrSteps=nrSteps, learningRate=learningRate)

        if np.isnan(ret):
            print 'Baum Welch return:', ret
            return 'Failure'

        if i > 0 and abs(last_ret - ret) < 1.0:
            print "Partial fitting converged to ", ret, " from ", last_ret
            break
        last_ret = ret

    print 'Baum Welch return:', ret / float(nData)

    [self.A, self.B, self.pi] = self.ml.asMatrices()
    self.A = np.array(self.A)
    self.B = np.array(self.B)

    return ret
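# Online-update sketch (illustrative): after an initial fit(), new batches can
# be folded in without retraining from scratch; a smaller learningRate weights
# the existing model more heavily than the new data:
#
#   model.fit(X_initial)
#   model.partial_fit(X_new, learningRate=0.1, nrSteps=1, max_iter=50)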
def getLoglikelihoods(self, xData, posterior=False, startIdx=1, n_jobs=-1):
    '''
    @ xData: dimension x sample x length
    return: per-sample log-likelihood curves (and optionally posteriors),
            computed in parallel across samples
    '''
    warnings.simplefilter("always", DeprecationWarning)

    X = [np.array(data) for data in xData]
    X_test = util.convert_sequence(X)  # sample x flattened length
    X_test = X_test.tolist()

    n, _ = np.shape(X[0])

    # Estimate log-likelihoods and corresponding posteriors
    r = Parallel(n_jobs=n_jobs)(delayed(computeLikelihood)(i, self.A, self.B, self.pi, self.F,
                                                           X_test[i],
                                                           self.nEmissionDim, self.nState,
                                                           startIdx=startIdx,
                                                           bPosterior=posterior,
                                                           converted_X=True)
                                for i in xrange(n))

    if posterior:
        _, ll_idx, ll_logp, ll_post = zip(*r)
        return ll_idx, ll_logp, ll_post
    else:
        _, ll_idx, ll_logp = zip(*r)
        return ll_idx, ll_logp
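# Parallel-scoring sketch, assuming a fit model; n_jobs=-1 lets joblib use all
# available cores, and each worker rebuilds the HMM from self.A/self.B/self.pi:
#
#   ll_idx, ll_logp = model.getLoglikelihoods(X, n_jobs=-1)
#   ll_idx, ll_logp, ll_post = model.getLoglikelihoods(X, posterior=True)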
# Fragment of a standalone training routine.
# Scale each state covariance entrywise by cov_mult
# (loop header restored from the matching code in fit()):
for i in xrange(nEmissionDim):
    for j in xrange(nEmissionDim):
        cov[:, i, j] *= cov_mult[nEmissionDim * i + j]

# Emission probability matrix
B = [0] * nState
for i in range(nState):
    B[i] = [[mu[i] for mu in mus]]
    B[i].append(cov[i].flatten())

# pi - initial probabilities per state
pi    = [0.0] * nState
pi[0] = 1.0

ml = ghmm.HMMFromMatrices(F, ghmm.MultivariateGaussianDistribution(F), A, B, pi)

print 'Creating Training Data'
X_train = util.convert_sequence(X)  # training input
X_train = X_train.tolist()
print "training data size: ", np.shape(X_train)

## ml.cmodel.getState(0).setOutProb(1, 0, 0.8)
## print ml.cmodel.getState(0).getOutProb(1)
## print ml.cmodel.getState(0).getOutNum(1)

if cov_type == 'diag': ml.setDiagonalCovariance(0)

final_seq = ghmm.SequenceSet(F, X_train)
print 'Run Baum Welch method with (samples, length)', np.shape(X_train)
ret = ml.baumWelch(final_seq, 10000, fixedTrans=1)

######################### Test ###########################################
[A_new, B_new, pi_new] = ml.asMatrices()
def fit(self, xData, A=None, B=None, pi=None, cov_mult=None,
        ml_pkl=None, use_pkl=False, cov_type='full', fixed_trans=0,
        shuffle=False):
    '''
    Input:
    - xData: dimension x sample x length
    Issues:
    - If NaN is returned, the reason can be one of the following:
      -- the covariance is too low
      -- the range of xData is too small (you have to scale it up)
    '''
    # Daehyung: What is the shape and type of input data?
    if shuffle:
        X = xData
        X = np.swapaxes(X, 0, 1)
        id_list = range(len(X))
        random.shuffle(id_list)
        X = np.array(X)[id_list]
        X = np.swapaxes(X, 0, 1)
    else:
        X = [np.array(data) for data in xData]
    nData = len(xData[0])

    param_dict = {}

    # Load a pre-trained HMM without training
    if use_pkl and ml_pkl is not None and os.path.isfile(ml_pkl):
        if self.verbose:
            print "Load HMM parameters without training the HMM"

        param_dict = ut.load_pickle(ml_pkl)
        self.A  = param_dict['A']
        self.B  = param_dict['B']
        self.pi = param_dict['pi']
        if self.nEmissionDim == 1:
            self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F),
                                           self.A, self.B, self.pi)
        else:
            self.ml = ghmm.HMMFromMatrices(self.F, ghmm.MultivariateGaussianDistribution(self.F),
                                           self.A, self.B, self.pi)

        out_a_num = param_dict.get('out_a_num', None)
        vec_num   = param_dict.get('vec_num', None)
        mat_num   = param_dict.get('mat_num', None)
        u_denom   = param_dict.get('u_denom', None)
        if out_a_num is not None:
            self.ml.setBaumWelchParams(out_a_num, vec_num, mat_num, u_denom)

        return True
    else:
        if ml_pkl is None:
            ml_pkl = os.path.join(os.path.dirname(__file__), 'ml_temp_n.pkl')

        if cov_mult is None:
            cov_mult = [1.0] * (self.nEmissionDim ** 2)

        if A is None:
            if self.verbose: print "Generating a new A matrix"
            # Transition probability matrix (initial transition probability, TODO?)
            A = util.init_trans_mat(self.nState).tolist()

        if B is None:
            if self.verbose: print "Generating a new B matrix"
            # We should think about the multivariate Gaussian pdf.
            mus, cov = util.vectors_to_mean_cov(X, self.nState, self.nEmissionDim,
                                                cov_type=cov_type)

            # cov: state x dim x dim
            for i in xrange(self.nEmissionDim):
                for j in xrange(self.nEmissionDim):
                    cov[:, i, j] *= cov_mult[self.nEmissionDim * i + j]

            if self.verbose:
                for i, mu in enumerate(mus):
                    print 'mu%i' % i, mu

            # Emission probability matrix
            B = [0] * self.nState
            for i in range(self.nState):
                if self.nEmissionDim > 1:
                    B[i] = [[mu[i] for mu in mus]]
                    B[i].append(cov[i].flatten().tolist())
                else:
                    B[i] = [np.squeeze(mus[0][i]), float(cov[i])]

        if pi is None:
            # pi - initial probabilities per state
            # (alternative: uniform, [1.0/float(self.nState)] * self.nState)
            pi    = [0.0] * self.nState
            pi[0] = 1.0

        # HMM model object
        if self.nEmissionDim == 1:
            self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F),
                                           A, B, pi)
        else:
            self.ml = ghmm.HMMFromMatrices(self.F, ghmm.MultivariateGaussianDistribution(self.F),
                                           A, B, pi)
        if cov_type == 'diag': self.ml.setDiagonalCovariance(1)

        X_train = util.convert_sequence(X)  # training input
        X_train = X_train.tolist()
        if self.verbose: print "training data size: ", np.shape(X_train)

        if self.verbose:
            print 'Run Baum Welch method with (samples, length)', np.shape(X_train)
        final_seq = ghmm.SequenceSet(self.F, X_train)
        ret = self.ml.baumWelch(final_seq, 10000)  # , fixedTrans=fixed_trans

        if np.isnan(ret):
            print 'Baum Welch return:', ret
            return 'Failure'
        print 'Baum Welch return:', ret / float(nData)

        [self.A, self.B, self.pi] = self.ml.asMatrices()
        self.A = np.array(self.A)
        self.B = np.array(self.B)

        param_dict['A']  = self.A
        param_dict['B']  = self.B
        param_dict['pi'] = self.pi

        try:
            [out_a_num, vec_num, mat_num, u_denom] = self.ml.getBaumWelchParams()
            param_dict['out_a_num'] = out_a_num
            param_dict['vec_num']   = vec_num
            param_dict['mat_num']   = mat_num
            param_dict['u_denom']   = u_denom
        except:
            print "Install new ghmm!!"

        if ml_pkl is not None:
            ut.save_pickle(param_dict, ml_pkl)

        return ret / float(nData)
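# End-to-end training sketch (synthetic data, illustrative names): xData is
# dimension x sample x length, cov_mult has nEmissionDim**2 entries, and fit()
# returns the normalized Baum-Welch score, or 'Failure' on NaN:
#
#   xData = np.random.normal(loc=1.0, size=(2, 10, 100))
#   ret = model.fit(xData, cov_mult=[10.0] * 4, ml_pkl='ml_temp.pkl', use_pkl=False)
#   if ret == 'Failure': print 'training failed; try scaling xData up'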