def fit_hmm_learn(seqs, n_states, axis):
    """Fit an hmmlearn ``GaussianHMM`` to ``seqs`` and scatter-plot the decoded states.

    Args:
        seqs: Observation sequences, either a list of equally shaped numpy
            arrays or a single array indexable as
            ``[sequence, step, feature]``. Features 0 and 1 are plotted.
        n_states: Number of hidden states of the model.
        axis: Object providing ``set_title`` and ``scatter`` (presumably a
            matplotlib ``Axes`` -- confirm against the caller).

    Returns:
        The decoded state labels of all observations, or the sentinel tuple
        ``(inf, -inf, None, None)`` when ``seqs`` is empty or holds fewer
        observations than ``n_states``.
    """
    not_enough_data = float('inf'), float('-inf'), None, None
    # np.concatenate raises on an empty sequence, so bail out before calling it.
    if len(seqs) == 0:
        return not_enough_data

    samples = np.concatenate(seqs)
    lengths = np.array([len(s) for s in seqs])
    if len(samples) < n_states:
        return not_enough_data

    hmm = GaussianHMM(n_components=n_states)
    hmm.fit(samples, lengths)
    ll = hmm.score(samples, lengths)
    _, labels = hmm.decode(samples, lengths)

    axis.set_title("HMM Learn (ll=%0.2f)" % ll)

    # Wrap around the palette so models with more than four states still plot.
    palette = ['orange', 'blue', 'green', 'red']
    colors = [palette[label % len(palette)] for label in labels]

    # A plain list cannot be indexed with a tuple; convert once so that both
    # list and array inputs are accepted.
    points = np.asarray(seqs)
    marker_groups = (
        (slice(None, 100), '^'),
        (slice(100, 200), 'o'),
        (slice(200, None), 's'),
    )
    for rows, marker in marker_groups:
        axis.scatter(points[rows, :, 0], points[rows, :, 1],
                     color=colors[rows], marker=marker)
    return labels
def test_GaussHMM_decode(cases: str) -> None:
    """Check ``GaussHMM.decode`` against hmmlearn's ``GaussianHMM.decode``.

    For every trial a random data set is generated, the reference model is
    fitted on it, and a ``GaussHMM`` is built from the fitted parameters.
    Both the summed log-probability and the concatenated state sequence
    must agree with the reference to four decimals.

    Args:
        cases: Trial bound, convertible with ``int``. The trial counter
            starts at 1, so ``int(cases) - 1`` trials are executed.
    """
    np.random.seed(12346)
    n_cases = int(cases)
    decimals = 4
    em_iterations = 100
    em_tol = 1e-3

    for _trial in range(1, n_cases):
        n_samples = np.random.randint(10, 50)
        hidden_states = np.random.randint(3, 6)
        n_features = np.random.randint(4, 9)

        sequences = []
        lengths = []
        for _sample in range(n_samples):
            seq_length = np.random.randint(4, 9)
            sequences.append(np.random.rand(seq_length, n_features))
            lengths.append(seq_length)

        # Reference model: fitted on all sequences stacked into one array.
        stacked = np.concatenate(sequences)
        hmm_gold = GaussianHMM(
            n_components=hidden_states,
            covariance_type='full',
            algorithm='viterbi',
            n_iter=em_iterations,
            tol=em_tol,
        )
        hmm_gold.fit(stacked, lengths)

        # Model under test: initialised from the reference parameters.
        hmm_mine = GaussHMM(
            hidden_states=hidden_states,
            A=hmm_gold.transmat_,
            n_features=hmm_gold.n_features,
            means=hmm_gold.means_,
            covar=hmm_gold.covars_,
            pi=hmm_gold.startprob_,
            tol=em_tol,
            max_iter=em_iterations,
        )

        gold_logprob, gold_state_seq = hmm_gold.decode(stacked, lengths)

        # The model under test decodes one sequence at a time.
        decoded = [hmm_mine.decode(seq) for seq in sequences]
        mine_logprob = sum([logprob for logprob, _ in decoded])
        mine_state_seq = np.concatenate([states for _, states in decoded])

        assert_almost_equal(mine_logprob, gold_logprob, decimal=decimals)
        assert_almost_equal(mine_state_seq, gold_state_seq, decimal=decimals)

    print('Successfully testing the decode function in Gaussian HMM!')
def fit_and_apply_hmm(normal, infected, chosen, data):
    """Train a Gaussian HMM on ``chosen`` and score every listed host with it.

    Args:
        normal: Iterable of IPs of normal hosts.
        infected: Iterable of IPs of infected hosts.
        chosen: Flow data used to train the model (passed to ``get_windows``).
        data: Flow table with ``src_ip`` and ``dst_ip`` columns.

    Returns:
        ``(hosts_log_likelihood, modeled_log_likelihood)``: a dict mapping each
        IP to the first element of ``hmm.decode`` on that host's windows (0
        when the host has no more than ``win`` flows), and the same quantity
        for the training data.
    """
    # Sliding-window size and number of HMM components.
    win, components = 4, 5
    # To search for the best pair instead (slow), replace the line above with:
    # win, components = find_optimal_params(chosen)

    training_windows = get_windows(chosen, win)

    # Learn a Gaussian HMM with `components` states from the chosen host data.
    hmm = GaussianHMM(n_components=components)
    hmm.fit(training_windows)

    # Log-likelihood of the very data the model was trained on.
    modeled_log_likelihood = hmm.decode(training_windows)[0]

    hosts_log_likelihood = {}
    # Normal hosts first, then infected ones; both are handled identically.
    for host_group in (normal, infected):
        for ip in host_group:
            # Flows in which this host is either the source or the destination.
            flows = data[(data['src_ip'] == ip) | (data['dst_ip'] == ip)]
            if len(flows) - win > 0:
                # Enough flows to build at least one window sequence.
                host_windows = get_windows(flows, win)
                hosts_log_likelihood[ip] = hmm.decode(host_windows)[0]
            else:
                hosts_log_likelihood[ip] = 0

    return hosts_log_likelihood, modeled_log_likelihood
def HHM_stock(stock, startdate, enddate, predict_startdate, predict_enddate,
              hmmcomponents=4, cov_type='full'):
    """Fit a Gaussian HMM on one date range of a stock and decode another range.

    ``get_price`` and ``hhm_state2read`` are resolved from the enclosing
    module; their exact contracts are not visible here.

    Args:
        stock: Security identifier forwarded to ``get_price``.
        startdate: Start of the training range.
        enddate: End of the training range.
        predict_startdate: Start of the range to decode.
        predict_enddate: End of the range to decode.
        hmmcomponents: Number of hidden states.
        cov_type: ``covariance_type`` passed to ``GaussianHMM``.

    Returns:
        ``(hmm, hhm_pred)``: the fitted model and a DataFrame indexed by date
        with the columns ``close``, ``state``, ``score`` (posterior
        probability of the decoded state) and ``action`` (the label that
        ``hhm_state2read`` assigns to the decoded state).
    """
    import warnings

    import numpy as np
    import pandas as pd
    from hmmlearn.hmm import GaussianHMM

    def get_hmm_feature(stock, startdate, enddate):
        """Return ``(close, dates, feature_matrix)`` with the first five rows dropped."""
        df = get_price(stock, start_date=startdate, end_date=enddate,
                       frequency='1d',
                       fields=['close', 'money', 'volume', 'high', 'low', 'open'],
                       skip_paused=True)
        close = df['close']
        money = df['money']
        # The first five rows are dropped so that the 5-day lookback features
        # are defined for every remaining row.
        high = df['high'][5:]
        low = df['low'][5:]
        closeidx = close[5:]
        datelist = pd.to_datetime(close.index[5:])

        log_close = np.log(np.array(close))
        logreturn = (log_close[1:] - log_close[:-1])[4:]
        logreturn5 = log_close[5:] - log_close[:-5]
        rangereturn = np.log(np.array(high)) - np.log(np.array(low))

        # ``pd.rolling_mean`` was removed from pandas; ``Series.rolling`` is
        # the equivalent (same window of 4, same default min_periods).
        money_ma5 = money.rolling(4).mean()
        money_ma5_rate = (np.log(np.array(money[5:]))
                          - np.log(np.array(money_ma5[4:-1])))

        features = np.column_stack(
            [logreturn, rangereturn, logreturn5, money_ma5_rate])
        return (closeidx, datelist, features)

    _, _, data_fit = get_hmm_feature(stock, startdate, enddate)
    closeidx_pred, datelist_pred, data_predict = get_hmm_feature(
        stock, predict_startdate, predict_enddate)

    # NOTE(review): this silences warnings for the whole process, not only
    # for the fit below; kept because callers may rely on it.
    warnings.filterwarnings("ignore")

    hmm = GaussianHMM(n_components=hmmcomponents, covariance_type=cov_type,
                      n_iter=5000).fit(data_fit)
    hidden_state_meaning = hhm_state2read(hmm)

    _, predict_states_sequence = hmm.decode(data_predict)
    predict_all_scores_sequence = hmm.predict_proba(data_predict)
    # Posterior probability of the state that was actually decoded per row.
    predict_states_score_sequence = [
        row_scores[state]
        for row_scores, state in zip(predict_all_scores_sequence,
                                     predict_states_sequence)
    ]

    hhm_pred = pd.DataFrame(
        {
            'close': closeidx_pred,
            'state': predict_states_sequence,
            'score': predict_states_score_sequence,
            'action': [hidden_state_meaning[s] for s in predict_states_sequence],
        },
        index=datelist_pred,
    )
    return (hmm, hhm_pred)
def find_optimal_params(chosen):
    """Grid-search the window size and component count for the HMM.

    Every combination of 2-6 components and window sizes 2-10 is fitted on
    windows of ``chosen``; the pair with the highest first element of
    ``hmm.decode`` wins (the earliest pair is kept on ties).

    Returns:
        ``(optimal_win, optimal_components)``.
    """
    best_ll, best_win, best_comp = -math.inf, 0, 0

    for comp in range(2, 7):
        for win in range(2, 11):
            windows = get_windows(chosen, win)
            model = GaussianHMM(n_components=comp)
            model.fit(windows)
            ll = model.decode(windows)[0]
            # Only a strictly better configuration replaces the current best.
            if not ll > best_ll:
                continue
            best_ll, best_win, best_comp = ll, win, comp

    print('Profiling: Optimal HMM values: Window=%d, Components=%d' % (best_win, best_comp))
    return best_win, best_comp
class HMMAnomalyDetector(AnomalyDetector):
    """Anomaly detector backed by an hmmlearn ``GaussianHMM``."""

    def __init__(self, n_components=4, model=None):
        """Store the number of hidden states and register the ``'hmm'`` abbreviation."""
        super().__init__(model=model, abbreviation='hmm')
        self.n_components = n_components

    def fit(self, traces, trace_lens):
        """Fit a diagonal-covariance HMM on the stacked ``traces``.

        ``trace_lens`` is forwarded to ``GaussianHMM.fit`` as the sequence
        lengths.
        """
        # Imported lazily, as in the original implementation.
        from hmmlearn.hmm import GaussianHMM

        self.model = GaussianHMM(
            n_components=self.n_components,
            covariance_type="diag",
            n_iter=100,
        )
        self.model.fit(traces, trace_lens)

    def predict(self, traces, trace_lens):
        """Return one decode log-probability per trace as a numpy array."""
        # Cut the stacked traces back into individual sequences.
        boundaries = np.cumsum(trace_lens)[:-1]
        sequences = np.split(traces, boundaries)
        return np.array([self.model.decode(seq)[0] for seq in sequences])
# print("Emission Matrix: ") # for s in hmm.states: # print("Means") # print(list(s.parameters())[0]) # print("Variance") # print(1/list(s.parameters())[1]) means = torch.stack([list(s.parameters())[0] for s in states]) means = means.detach().numpy() precs = torch.stack([list(s.parameters())[1] for s in states]) # precs = precs.detach().numpy() std = (1 / precs.sqrt()).detach().numpy() # print('std', std) y_pred, _ = hmm.decode(X) y_pred = y_pred.squeeze(1) # END FIT plt.subplot(len(datasets), 2, plot_num) if i_dataset == 0: plt.title('torchmm', size=18) colors = np.array( list( islice( cycle([ '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3', '#999999', '#e41a1c', '#dede00' ]), int(max(y_pred) + 1)))) # add black color for outliers (if any)
def predict_states(X, group_id, empirical_states):
    """Decode a two-state Gaussian HMM on ``X`` and pad the result with a forecast.

    Args:
        X: Observation matrix handed to the HMM.
        group_id: Determines the output length, ``(group_id + 1) * 10``.
        empirical_states: Reference 0/1 states; must be broadcastable against
            the decoded state sequence.

    Returns:
        A list of 0/1 states: the decoded sequence (with labels swapped when
        that matches ``empirical_states`` better) followed by the forecast
        state repeated until the list holds ``(group_id + 1) * 10`` entries.
        If the decoded sequence is already longer, nothing is appended.
    """
    max_state_number = (group_id + 1) * 10
    n_components = 2
    # Number of fits to try; only the best-scoring one is kept.
    n_restarts = 1

    model = GaussianHMM(n_components, covariance_type="diag", n_iter=1000)

    best_score = None
    best_states = None
    transmat = None
    for _ in range(n_restarts):
        # NOTE(review): ``fit([X])`` follows the legacy list-of-sequences
        # signature; newer hmmlearn releases expect ``fit(X)`` -- confirm the
        # installed version before changing.
        model.fit([X])
        score = model.decode(X)[0]
        if best_score is None or best_score < score:
            best_score = score
            best_states = model.predict(X)
            transmat = model.transmat_

    # HMM state labels are arbitrary: keep whichever labelling (as decoded or
    # with 0/1 swapped) has the smaller squared distance to the reference.
    states = best_states.tolist()
    swapped = [0 if s == 1 else 1 for s in states]
    reference = np.array(empirical_states)
    difference = np.sum(np.power(np.subtract(np.array(states), reference), 2))
    difference_swapped = np.sum(
        np.power(np.subtract(np.array(swapped), reference), 2))
    if difference_swapped < difference:
        states = swapped

    # NOTE(review): the forecast always uses the transition row of model
    # state 1, regardless of the last decoded state or of the label swap
    # above -- confirm this is intended.
    future_states_proba = np.dot([0, 1], transmat)
    future_state = 1 if future_states_proba[1] > future_states_proba[0] else 0

    # A negative repeat count yields an empty list, so no clamping is needed.
    return states + [future_state] * (max_state_number - len(states))
def build_model(data, columns):
    """Fit a 3-state Gaussian HMM on sliding windows of the selected columns.

    Args:
        data: Table supporting ``data[columns]`` (a pandas DataFrame).
        columns: Column labels to use as features.

    Returns:
        ``(model, states)``: the fitted ``GaussianHMM`` and the state sequence
        decoded for the training windows.
    """
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` returns
    # the same array and exists in every pandas version.
    features_train = np.asarray(data[columns].values, dtype=np.float32)
    discrete_features_train = sliding_window(features_train)
    model = GaussianHMM(n_components=3)
    model.fit(discrete_features_train)
    return (model, model.decode(discrete_features_train)[1])
def runHmm(patient_record, date_list, group_id, empirical_states):
    """Build a feature matrix from a patient record and decode it with a 2-state HMM.

    Args:
        patient_record: Mapping from date to a mapping of values; the values
            of each date fill one 20-column row.
        date_list: Dates to read from ``patient_record``, in row order.
        group_id: Determines the output length, ``(group_id + 1) * 10``.
        empirical_states: Reference 0/1 states; must be broadcastable against
            the decoded state sequence.

    Returns:
        A list of 0/1 states: the decoded sequence (with labels swapped when
        that matches ``empirical_states`` better) followed by the forecast
        state repeated until the list holds ``(group_id + 1) * 10`` entries.
        If the decoded sequence is already longer, nothing is appended.
    """
    max_state_number = (group_id + 1) * 10

    # The matrix always has at least two rows; rows without data stay zero,
    # so an empty or single-entry record still yields a trainable array. The
    # former ``X.shape[0] == 0`` / ``== 1`` branches could never run.
    X = np.zeros(shape=(max(len(patient_record), 2), 20))
    for index, date in enumerate(date_list):
        # ``.items()`` exists on Python 2 and 3 dicts and on pandas Series,
        # whereas ``.iteritems()`` does not exist on Python 3 dicts.
        X[index] = np.array(
            [value for _key, value in patient_record[date].items()])

    print("fitting to HMM and decoding ...")
    n_components = 2
    # Number of fits to try; only the best-scoring one is kept.
    n_restarts = 1

    model = GaussianHMM(n_components, covariance_type="diag", n_iter=1000)

    best_score = None
    best_states = None
    transmat = None
    for _ in range(n_restarts):
        # NOTE(review): ``fit([X])`` follows the legacy list-of-sequences
        # signature; newer hmmlearn releases expect ``fit(X)`` -- confirm the
        # installed version before changing.
        model.fit([X])
        score = model.decode(X)[0]
        if best_score is None or best_score < score:
            best_score = score
            best_states = model.predict(X)
            transmat = model.transmat_

    # HMM state labels are arbitrary: keep whichever labelling (as decoded or
    # with 0/1 swapped) has the smaller squared distance to the reference.
    states = best_states.tolist()
    swapped = [0 if s == 1 else 1 for s in states]
    reference = np.array(empirical_states)
    difference = np.sum(np.power(np.subtract(np.array(states), reference), 2))
    difference_swapped = np.sum(
        np.power(np.subtract(np.array(swapped), reference), 2))
    if difference_swapped < difference:
        states = swapped

    # NOTE(review): the forecast always uses the transition row of model
    # state 1, regardless of the last decoded state or of the label swap
    # above -- confirm this is intended.
    future_states_proba = np.dot([0, 1], transmat)
    future_state = 1 if future_states_proba[1] > future_states_proba[0] else 0

    # A negative repeat count yields an empty list, so no clamping is needed.
    return states + [future_state] * (max_state_number - len(states))
def predict_states(X, group_id, empirical_states):
    """Decode a two-state Gaussian HMM on ``X`` and pad the result with a forecast.

    Args:
        X: Observation matrix handed to the HMM.
        group_id: Determines the output length, ``(group_id + 1) * 10``.
        empirical_states: Reference 0/1 states; must be broadcastable against
            the decoded state sequence.

    Returns:
        A list of 0/1 states: the decoded sequence (with labels swapped when
        that matches ``empirical_states`` better) followed by the forecast
        state repeated until the list holds ``(group_id + 1) * 10`` entries.
        If the decoded sequence is already longer, nothing is appended.
    """
    max_state_number = (group_id + 1) * 10
    n_components = 2
    # Number of fits to try; only the best-scoring one is kept.
    n_restarts = 1

    model = GaussianHMM(n_components, covariance_type="diag", n_iter=1000)

    best_score = None
    best_states = None
    transmat = None
    for _ in range(n_restarts):
        # NOTE(review): ``fit([X])`` follows the legacy list-of-sequences
        # signature; newer hmmlearn releases expect ``fit(X)`` -- confirm the
        # installed version before changing.
        model.fit([X])
        score = model.decode(X)[0]
        if best_score is None or best_score < score:
            best_score = score
            best_states = model.predict(X)
            transmat = model.transmat_

    # HMM state labels are arbitrary: keep whichever labelling (as decoded or
    # with 0/1 swapped) has the smaller squared distance to the reference.
    states = best_states.tolist()
    swapped = [0 if s == 1 else 1 for s in states]
    reference = np.array(empirical_states)
    difference = np.sum(np.power(np.subtract(np.array(states), reference), 2))
    difference_swapped = np.sum(
        np.power(np.subtract(np.array(swapped), reference), 2))
    if difference_swapped < difference:
        states = swapped

    # NOTE(review): the forecast always uses the transition row of model
    # state 1, regardless of the last decoded state or of the label swap
    # above -- confirm this is intended.
    future_states_proba = np.dot([0, 1], transmat)
    future_state = 1 if future_states_proba[1] > future_states_proba[0] else 0

    # A negative repeat count yields an empty list, so no clamping is needed.
    return states + [future_state] * (max_state_number - len(states))
def runHmm(patient_record, date_list, group_id, empirical_states):
    """Build a feature matrix from a patient record and decode it with a 2-state HMM.

    Args:
        patient_record: Mapping from date to a mapping of values; the values
            of each date fill one 20-column row.
        date_list: Dates to read from ``patient_record``, in row order.
        group_id: Determines the output length, ``(group_id + 1) * 10``.
        empirical_states: Reference 0/1 states; must be broadcastable against
            the decoded state sequence.

    Returns:
        A list of 0/1 states: the decoded sequence (with labels swapped when
        that matches ``empirical_states`` better) followed by the forecast
        state repeated until the list holds ``(group_id + 1) * 10`` entries.
        If the decoded sequence is already longer, nothing is appended.
    """
    max_state_number = (group_id + 1) * 10

    # The matrix always has at least two rows; rows without data stay zero,
    # so an empty or single-entry record still yields a trainable array. The
    # former ``X.shape[0] == 0`` / ``== 1`` branches could never run.
    X = np.zeros(shape=(max(len(patient_record), 2), 20))
    for index, date in enumerate(date_list):
        # ``.items()`` exists on Python 2 and 3 dicts and on pandas Series,
        # whereas ``.iteritems()`` does not exist on Python 3 dicts.
        X[index] = np.array(
            [value for _key, value in patient_record[date].items()])

    print("fitting to HMM and decoding ...")
    n_components = 2
    # Number of fits to try; only the best-scoring one is kept.
    n_restarts = 1

    model = GaussianHMM(n_components, covariance_type="diag", n_iter=1000)

    best_score = None
    best_states = None
    transmat = None
    for _ in range(n_restarts):
        # NOTE(review): ``fit([X])`` follows the legacy list-of-sequences
        # signature; newer hmmlearn releases expect ``fit(X)`` -- confirm the
        # installed version before changing.
        model.fit([X])
        score = model.decode(X)[0]
        if best_score is None or best_score < score:
            best_score = score
            best_states = model.predict(X)
            transmat = model.transmat_

    # HMM state labels are arbitrary: keep whichever labelling (as decoded or
    # with 0/1 swapped) has the smaller squared distance to the reference.
    states = best_states.tolist()
    swapped = [0 if s == 1 else 1 for s in states]
    reference = np.array(empirical_states)
    difference = np.sum(np.power(np.subtract(np.array(states), reference), 2))
    difference_swapped = np.sum(
        np.power(np.subtract(np.array(swapped), reference), 2))
    if difference_swapped < difference:
        states = swapped

    # NOTE(review): the forecast always uses the transition row of model
    # state 1, regardless of the last decoded state or of the label swap
    # above -- confirm this is intended.
    future_states_proba = np.dot([0, 1], transmat)
    future_state = 1 if future_states_proba[1] > future_states_proba[0] else 0

    # A negative repeat count yields an empty list, so no clamping is needed.
    return states + [future_state] * (max_state_number - len(states))
#X1 = [[0.5], [1.0], [-1.0], [0.42], [0.24]] #X2 = [[0.5], [1.0], [-1.0], [0.42], [0.24]] #X = np.concatenate([X1, X2]) #lengths = [len(X1), len(X2)] modelFor0 = GaussianHMM(n_components=2, n_iter=100).fit(streamTrain0, lengthsTrain0) #modelFor1 = GaussianHMM(n_components=16, n_iter=100).fit(streamTrain1, lengthsTrain1) # modelFor0 = GaussianHMM(n_components=2, n_iter=200).fit(trainingDataSet0[0]) predictTraining = np.zeros(shape=(trainingDataSet.shape[0], 1)) results0 = np.zeros(shape=(trainingDataSet.shape[0], 1)) # results1 = np.zeros(shape=(trainingDataSet.shape[0],1)) for i in range(0, trainingDataSet.shape[0]): predict0Results0 = modelFor0.decode(trainingDataSet[i].reshape( trainingDataSet[i].shape[0], 1), algorithm='viterbi')[0] #predict0Results1 = modelFor1.score(trainingDataSet[i].reshape(trainingDataSet[i].shape[0], 1)) results0[i] = predict0Results0 # if predict0Results0 > predict0Results1: # results0[i] = 0 # else: # results0[i] = 1 # print(predict0[63]) shapeTra = trainingDataSet.shape[0] shapeTra0 = trainingDataSet0[0].shape[0] shapeTra1 = trainingDataSet1[0].shape[0] modelFor1 = GaussianHMM(n_components=2,
def MyGaussianHMM(): from hmmlearn.hmm import GaussianHMM df = pd.read_csv( "/home/ray/Documents/suibe/2017/建模/Modeling_Preparation/dataset/SZIndex.csv", header=-1) df.head() X = np.array(df.iloc[:, 0:5]) # 一、未知模型情况下,解决问题3 model = GaussianHMM(n_components=6, covariance_type="diag", n_iter=1000) # 方差矩阵为对角阵 """ 参数解释: covariance_type: "spherical" :主对角元素均为1,其余元素为0,独立同分布 (数据不足时,难以进行参数估计) "diag" :主对角元素不为0,其余为0 (一般情况,折中) "full" :所有元素均不为0 (数据足够进行参数估计时) """ model.fit(X) print "隐含状态为: ", model.predict(X) # 列出每一天的隐含状态 print "特征数目 %s" % model.n_features print "隐状态数目 %s" % model.n_components print "起始概率 :", model.startprob_ print "隐状态转移矩阵", model.transmat_ ## 每个隐含层对应的特征概率空间假设为正态分布,则可以得到一个model.n_components行model.n_features列的均值矩阵 print "混淆矩阵:均值部分", model.means_ print "混淆矩阵:方差部分", model.covars_ ## 绘图 hidden_states = model.predict(X) tradeDate = df.iloc[:, 5].values closeIndex = df.iloc[:, 6].values plt.figure(figsize=(15, 8)) for i in range(model.n_components): idx = (hidden_states == i) plt.plot_date(pd.to_datetime(tradeDate[idx]), closeIndex[idx], '.', label='%dth hidden state' % i, lw=1) plt.legend() plt.grid(1) plt.show() # 二、已知模型情况下,解决问题1,2 ## 沿用上述模型 ### 问题1 print "某天出现该观测的概率为: %s" % np.exp(model.score(X[0])) ### 问题2 log_prob, state = model.decode(X[:10], algorithm="viterbi") print "只根据前十天,推断出最有可能的隐含状态序列为:", state ## 自己输入模型参数 ### 一个2特征,4隐状态情况 startprob = np.array([0.6, 0.3, 0.1, 0.0]) # The transition matrix, note that there are no transitions possible # between component 1 and 3 transmat = np.array([[0.7, 0.2, 0.0, 0.1], [0.3, 0.5, 0.2, 0.0], [0.0, 0.3, 0.5, 0.2], [0.2, 0.0, 0.2, 0.6]]) # The means of each component means = np.array([[0.0, 0.0], [0.0, 11.0], [9.0, 10.0], [11.0, -1.0]]) # The covariance of each component covars = .5 * np.tile(np.identity(2), (4, 1, 1)) model2 = GaussianHMM(n_components=4, covariance_type="full", n_iter=1000) model2.startprob_ = startprob model2.transmat_ = transmat model2.means_ = means model2.covars_ = covars