def select(self):
    """Pick the GaussianHMM with the highest DIC-style score for this word.

    For every state count between ``self.min_n_components`` and
    ``self.max_n_components`` a diagonal-covariance GaussianHMM is fitted on
    ``self.X`` / ``self.lengths``.  A candidate's score is the log-likelihood
    of this word's data minus the mean log-likelihood the same model assigns
    to every other word in ``self.hwords``.

    :return: the best-scoring fitted model, or None when no candidate could
        be fitted and scored
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    best_score = float("-inf")
    best_model = None
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
            this_word_score = model.score(self.X, self.lengths)
            other_scores = []
            for word in self.hwords:
                if word != self.this_word:
                    X, lengths = self.hwords[word]
                    other_scores.append(model.score(X, lengths))
        except Exception:
            # Fitting or scoring failed for this state count; try the next one.
            # (Exception rather than a bare except so Ctrl-C still works.)
            continue

        # equation from udacity forum: https://discussions.udacity.com/t/how-to-start-coding-the-selectors/476905/10
        # With no competing words there is nothing to discriminate against,
        # so the score falls back to the plain log-likelihood instead of
        # dividing by zero.
        if other_scores:
            score = this_word_score - sum(other_scores) / len(other_scores)
        else:
            score = this_word_score

        if score > best_score:
            best_score = score
            best_model = model
    return best_model
def mainHMM(filePrefix):
    """Fit a 4-state GaussianHMM on one route and print the decoded test states.

    Train/test observations and their sequence lengths come from
    ``loadOneRoute(filePrefix)``; only the first five feature columns are
    used for fitting and decoding.  Everything is reported on stdout and
    nothing is returned.
    """
    X_train, length_train, X_test, length_test = loadOneRoute(filePrefix)

    # Run Gaussian HMM
    print("fitting to HMM and decoding ...")
    model = GaussianHMM(n_components=4, covariance_type="diag", n_iter=2000).fit(
        X_train[:, 0:5], length_train)
    hidden_states = model.predict(X_test[:, 0:5], length_test)
    print("done")

    # Preview the first 80 decoded states, 20 per line.
    for start in range(0, 80, 20):
        print(hidden_states[start:start + 20])

    # Print trained parameters
    print("Transition matrix")
    print(model.transmat_)
    print("Start Prob")
    print(model.startprob_)
    print("Means and vars of each hidden state")
    for i in range(model.n_components):
        print("{0}th hidden state".format(i))
        print("mean = ", model.means_[i])
        print("var = ", np.diag(model.covars_[i]))

    # Full decoded sequence as a column vector, one row per test sample.
    print(np.array(hidden_states).reshape((sum(length_test), 1)))
def addModel(self, nom, data, nbEtats, n_iter, startprob_prior=None, transmat_prior=None):
    '''
    Train a model and add it to tabModels.

    Parameters:
    nom = name of the model
    data = three-dimensional array representing a cluster made of movements,
           each of which is made of positions
    nbEtats = number of hidden states for each model
    n_iter = number of iterations for the Baum-Welch algorithm
    startprob_prior = prior for the initial state distribution
    transmat_prior = prior for the state transition matrix

    tabModels is a list of lists shaped [name, model, model, ...]: the new
    model is appended to the first entry already carrying this name, or a
    new [name, model] entry is created when there is none.
    '''
    hmm = GaussianHMM(nbEtats, covariance_type="diag", n_iter=n_iter,
                      startprob_prior=startprob_prior, transmat_prior=transmat_prior)
    hmm.fit(data)
    verif_set_transMat(hmm)

    for entry in self.tabModels:
        if entry[0] == nom:
            entry.append(hmm)
            return

    # No entry with this name yet (this also covers an empty tabModels).
    self.tabModels.append([nom, hmm])
def fit_batch(traj_data, n_components=2, subsample_factor=1,
              features=['speed', 'rotation'], **kwargs):
    '''
    Fits model to concatenated traj_data

    Args:
        traj_data - list of paths of training dataset (trajectory csv)
        n_components - number of hidden states
        subsample_factor - subsample factor to apply to all files
        features - columns to fit model to
        **kwargs passed to GaussianHMM

    Returns:
        model - fitted model
    '''
    # Concatenate data
    feature_list = []
    lengths_list = []
    for path in traj_data:
        X, seq_lengths = features_from_csv(path, features=features,
                                           subsample_factor=subsample_factor)
        feature_list.append(X)
        lengths_list.append(seq_lengths)

    # print() with a single argument behaves the same on Python 2 and 3.
    print('Concatenating features...')
    X = np.vstack(feature_list)
    lengths = np.hstack(lengths_list)

    # Fit HMM
    print('Fitting model...')
    model = GaussianHMM(n_components, **kwargs)
    model.fit(X, lengths=lengths)

    return model
def select(self):
    """ select the best model for self.this_word based on
    BIC score for n between self.min_n_components and self.max_n_components

    Side effect: ``self.max_n_components`` is capped at the length of the
    shortest entry in ``self.sequences`` before the search starts.  When no
    candidate can be fitted and scored, the model returned by
    ``self.base_model(self.n_constant)`` is used instead.

    :return: GaussianHMM object
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    shortest = min(len(seq) for seq in self.sequences)
    self.max_n_components = min(self.max_n_components, shortest)

    # Fallback, replaced as soon as any candidate yields a finite BIC.
    hmm_model = self.base_model(self.n_constant)
    record = float("inf")

    for num in range(self.min_n_components, self.max_n_components + 1):
        try:
            model = GaussianHMM(n_components=num, n_iter=1000).fit(self.X, self.lengths)
            logL = model.score(self.X, self.lengths)
            # p is the number of free parameters, N is the number of data points
            p = num * num + 2 * num * len(self.X[0]) - 1
            BIC = -2 * logL + p * np.log(len(self.X))
        except Exception:
            # This state count could not be fitted/scored; skip it.
            # (Exception rather than a bare except so Ctrl-C still works.)
            continue

        # Lower BIC is better.
        if BIC < record:
            record = BIC
            hmm_model = model
    return hmm_model
def select(self):
    """Select the model for self.this_word with the highest DIC score.

    Candidates range over ``self.min_n_components`` ..
    ``self.max_n_components`` (the latter is first capped, as a side effect,
    at the length of the shortest entry in ``self.sequences``).  Each
    candidate is fitted on ``self.X`` / ``self.lengths`` and scored against
    every entry of ``self.hwords``.

    :return: the best candidate, or ``self.base_model(self.n_constant)`` when
        no candidate could be fitted and scored
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    shortest = min(len(seq) for seq in self.sequences)
    self.max_n_components = min(self.max_n_components, shortest)

    hmm_model = self.base_model(self.n_constant)
    record = float("-inf")

    for num in range(self.min_n_components, self.max_n_components + 1):
        try:
            model = GaussianHMM(n_components=num, n_iter=1000).fit(self.X, self.lengths)
            logL = model.score(self.X, self.lengths)
            total = 0
            for word in self.hwords:
                X, lengths = self.hwords[word]
                total += model.score(X, lengths)
        except Exception:
            # Fit or scoring failed for this state count; skip it.
            # (Exception rather than a bare except so Ctrl-C still works.)
            continue

        # total sums over every word in hwords; subtracting logL leaves the
        # competing words (presumably hwords also holds this word's own data
        # -- verify against the caller).
        rivals = len(self.hwords) - 1
        if rivals > 0:
            DIC = logL - (total - logL) / rivals
        else:
            # No competing words: avoid dividing by zero and rank candidates
            # by plain log-likelihood.
            DIC = logL

        if DIC > record:
            record = DIC
            hmm_model = model
    return hmm_model
def fit(self):
    """Fit one GaussianHMM on all demonstrations and segment each of them.

    The demonstrations in ``self._demonstrations`` are stacked along axis 0
    and fitted as separate sequences (their row counts are passed as
    lengths).  Each demonstration is then decoded with the fitted model and
    the decoded state path is handed to ``self.findTransitions``.

    Results are stored in ``self.segmentation`` (one entry per demonstration)
    and ``self.model`` (the shared fitted model, repeated once per
    demonstration).
    """
    if self.verbose:
        print("[Clustering] Clearing old model and segmentation")

    self.segmentation = []
    self.model = []

    g = GaussianHMM(n_components=self.n_components)

    # Stack once instead of growing the array pairwise in a loop.
    lens = [np.shape(demo)[0] for demo in self._demonstrations]
    all_demos = np.concatenate(self._demonstrations)
    g.fit(all_demos, lens)

    new_segments = []
    new_model = []
    for d in self._demonstrations:
        new_segments.append(self.findTransitions(g.predict(d)))
        # NOTE(review): kept parallel to new_segments (one model entry per
        # demonstration) -- confirm against the consumers of self.model.
        new_model.append(g)

    self.segmentation = new_segments
    self.model = new_model
def fit_HMM(self, error_metric):
    """Fit full-covariance GaussianHMMs with 2, 3 and 4 states and keep the best.

    Each candidate is fitted on ``self.X_train`` and evaluated with
    ``error_metric(candidate, self.X_test)``.  When ``error_metric`` is
    ``HMM_MAD`` the smallest value wins; for any other metric the largest
    value wins.  The winner is stored in ``self.model`` and its state count
    in ``self.n_states``.
    """
    print("Looking for optimal number of states and fitting HMM")

    lower_is_better = error_metric == HMM_MAD
    best_guess = None
    best_model = None
    opt_n_states = None

    for n_states in range(2, 5):
        candidate = GaussianHMM(n_components=n_states, covariance_type="full", n_iter=1000)
        candidate.fit(self.X_train)
        error = error_metric(candidate, self.X_test)

        if best_model is None:
            # The first candidate is always the initial best.
            improved = True
        elif lower_is_better:
            improved = error < best_guess
        else:
            improved = error > best_guess

        if improved:
            best_guess = error
            best_model = candidate
            opt_n_states = n_states

    self.model = best_model
    self.n_states = opt_n_states
    print("Done. Lowest error of {} achieved with {} states".format(best_guess, opt_n_states))
def select(self):
    """ select the best model for self.this_word based on
    BIC score for n between self.min_n_components and self.max_n_components

    :return: GaussianHMM object, or None when no candidate could be fitted
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    best_score = float('inf')
    best_model = None
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
            logL = model.score(self.X, self.lengths)
            # equation from udacity forum: https://discussions.udacity.com/t/how-to-start-coding-the-selectors/476905/10
            p = n ** 2 + 2 * n * len(self.X[0]) - 1
            N = len(self.X)
            score = -2 * logL + p * np.log(N)
        except Exception:
            # This state count could not be fitted/scored; skip it.
            # (Exception rather than a bare except so Ctrl-C still works.)
            continue

        # Lower BIC is better.
        if score < best_score:
            best_score = score
            best_model = model
    return best_model
def main(args):
    """Fit a 3-state left-to-right GaussianHMM and write the decoded states.

    Observations come from ``loadDiffRows(args.diffFile)``.  The decoded
    state of every bin is written to ``args.outFile`` as a tab-separated
    table with the columns ``exp``, ``bin``, ``reads`` and ``state``; ``exp``
    is the part of the output file name before the first underscore.
    """
    x, X = loadDiffRows(args.diffFile)

    # init_params omits 't' so that fit() keeps the hand-written transition
    # matrix as its starting point.  With the default ("stmc") fit()
    # re-initialises transmat_ and the left-to-right structure set below is
    # silently discarded.
    model = GaussianHMM(n_components=3, covariance_type="diag",
                        n_iter=100000000000, init_params="smc")
    model.transmat_ = numpy.array([[0.5, 0.5, 0.0],
                                   [0.0, 0.5, 0.5],
                                   [0.0, 0.0, 1.0]])
    model.fit(X)
    print(model.transmat_)

    # Defensive: force the forbidden transitions to exactly zero.
    model.transmat_[0][2] = 0.
    model.transmat_[1][0] = 0.
    model.transmat_[2][0] = 0.
    model.transmat_[2][1] = 0.

    exp = args.outFile.split('/')[-1].split('_')[0]
    with open(args.outFile, 'w') as fout:
        print('exp\tbin\treads\tstate', file=fout)
        for seq in X:
            hiddenStates = model.predict(seq)
            for idx, (r, h) in enumerate(zip(x, hiddenStates)):
                print('\t'.join((exp, str(idx), str(r), str(h))), file=fout)
def select(self):
    """Select a model by cross-validated log-likelihood, or by BIC for small data.

    With three or more sequences, every state count in
    ``[min_n_components, max_n_components]`` is evaluated with 3-fold
    cross-validation and the state count with the highest mean held-out
    log-likelihood wins.  With fewer than three sequences the candidates are
    fitted on all data and ranked by BIC (lower is better).

    NOTE: in the cross-validated branch the model returned is the one fitted
    on the last training fold of the winning state count, not a refit on all
    data.

    :return: GaussianHMM object, or None when no candidate could be fitted
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_model = None

    if len(self.sequences) >= 3:
        best_score = float('-inf')
        n_splits = min(3, len(self.sequences))
        splits = KFold(n_splits)
        for n in range(self.min_n_components, self.max_n_components + 1):
            # Scores are collected per state count; carrying them over from
            # previous state counts would blend unrelated models into the mean.
            fold_scores = []
            try:
                for train_index, test_index in splits.split(self.sequences):
                    # used forum code to get train/test X,Lengths respectively: https://discussions.udacity.com/t/selectorcv-crashes/400125
                    train_X, train_lengths = combine_sequences(train_index, self.sequences)
                    test_X, test_lengths = combine_sequences(test_index, self.sequences)
                    model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(train_X, train_lengths)
                    fold_scores.append(model.score(test_X, test_lengths))
            except Exception:
                # Any failing fold disqualifies this state count.
                continue

            # used average score from udacity forum: https://discussions.udacity.com/t/my-selectorcv-class/349110
            average_score = sum(fold_scores) / len(fold_scores)
            if average_score > best_score:
                best_score = average_score
                best_model = model
    else:
        # for models with length less than 3
        best_bic = float('inf')
        for n in range(self.min_n_components, self.max_n_components + 1):
            try:
                model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                    random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                logL = model.score(self.X, self.lengths)
                # equation from udacity forum: https://discussions.udacity.com/t/how-to-start-coding-the-selectors/476905/10
                p = n ** 2 + 2 * n * len(self.X[0]) - 1
                N = len(self.X)
                bic = -2 * logL + p * np.log(N)
            except Exception:
                continue
            if bic < best_bic:
                best_bic = bic
                best_model = model

    return best_model
def fit_hmm(df, n_components, features=['speed', 'rotation'], **kwargs):
    '''
    Fit a Gaussian HMM to features extracted from a dataframe.

    Args:
        df - dataframe containing positional data to be processed
        n_components - number of hidden states
        features - features to use in model fitting
        **kwargs passed to GaussianHMM

    Returns:
        the fitted model
    '''
    # get_features yields the observation matrix and the per-sequence lengths.
    observations, seq_lengths = get_features(df, features=features)

    hmm = GaussianHMM(n_components, **kwargs)
    hmm.fit(observations, lengths=seq_lengths)
    return hmm
def setup():
    """Train one single-state GaussianHMM per command word.

    For each of go/back/right/left/stop the patterns are read from
    ``<word>.bin`` and a diagonal-covariance GaussianHMM is fitted on them;
    the model is stored as ``self.<word>_model``.

    NOTE(review): ``self`` is not a parameter of this function; presumably it
    is bound in an enclosing scope -- confirm.
    """
    def load_patterns(file):
        """Parse a file of whitespace-separated numbers into a 2-D array.

        Every line becomes one row.  Raises ValueError when the file holds
        no lines or when rows differ in length.
        """
        # The context manager closes the file even when parsing fails.
        with open(file, 'rb') as f:
            # A real list is built because map() is lazy on Python 3 and
            # cannot be reshaped.
            rows = [[float(token) for token in line.split()] for line in f]
        if not rows:
            raise ValueError("no patterns found in %r" % (file,))
        # One stack at the end instead of re-stacking after every line.
        return np.vstack(rows)

    hidden = 1
    for word in ('go', 'back', 'right', 'left', 'stop'):
        model = GaussianHMM(n_components=hidden, covariance_type="diag", n_iter=10000).fit(
            load_patterns(word + '.bin'))
        setattr(self, word + '_model', model)
def train(self, data, n_components):
    """Fit a diagonal-covariance GaussianHMM on a 1-D series and decode it.

    ``data`` is reshaped into a single-feature column.  The fitted model is
    stored in ``self.model``, the raw series in ``self.data`` and the decoded
    state sequence in ``self.hidden_states``.
    """
    # Wrapped in a 1-tuple so that tuple-valued data still formats correctly.
    print("Training Data: %s" % (data,))
    self.data = data
    self.model = GaussianHMM(n_components, algorithm='viterbi', covariance_type='diag')
    X = np.reshape(data, (len(data), 1))
    # NOTE(review): fit() receives a one-element list of sequences while
    # predict() receives the bare array -- confirm this matches the
    # installed hmmlearn API.
    self.model = self.model.fit([X])

    self.hidden_states = self.model.predict(X)
    # The original format string had no placeholder, so the % operator
    # raised TypeError instead of printing the states.
    print("Sequence of States: %s" % (self.hidden_states,))
class HMM:
    """Thin wrapper around a GaussianHMM for 1-D series.

    ``train`` fits the model and decodes the training series, ``eval`` prints
    the log-likelihood of a new series, ``draw``/``plot`` visualise data.
    """

    # Every attribute assigned in train() must be listed here: on Python 3
    # a class with __slots__ rejects assignments to unlisted names.
    __slots__ = [
        "model",
        "data",
        "hidden_states",
    ]

    def __init__(self):
        pass

    def draw(self, data):
        """Plot a raw series as a red line."""
        figure()
        plot(range(len(data)), data, alpha=0.8, color='red')
        show()

    def train(self, data, n_components):
        """Fit a diagonal-covariance GaussianHMM on ``data`` and decode it.

        Stores the model, the raw series and the decoded state sequence on
        the instance.
        """
        # Wrapped in a 1-tuple so that tuple-valued data still formats.
        print("Training Data: %s" % (data,))
        self.data = data
        self.model = GaussianHMM(n_components, algorithm='viterbi', covariance_type='diag')
        X = np.reshape(data, (len(data), 1))
        # NOTE(review): fit() receives a one-element list of sequences while
        # predict() receives the bare array -- confirm this matches the
        # installed hmmlearn API.
        self.model = self.model.fit([X])

        self.hidden_states = self.model.predict(X)
        # The original format string had no placeholder and raised TypeError.
        print("Sequence of States: %s" % (self.hidden_states,))

    def eval(self, obs):
        """Print the log-likelihood of ``obs`` under the trained model."""
        print("Testing Data: %s" % (obs,))
        X = np.reshape(obs, (len(obs), 1))
        print("Eval: %s" % str(self.model.score(X)))

    def plot(self):
        """Scatter the training series, one colour/label per decoded state."""
        fig = figure(facecolor="white")
        ax = fig.add_subplot(111)

        for i in range(self.model.n_components):
            # use fancy indexing to plot data in each state
            idx = (self.hidden_states == i)
            ax.plot(np.array(range(len(self.data)))[idx], np.array(self.data)[idx],
                    '.', label="State %d" % (i + 1))

        ax.legend()
        show()
def test_backward_with_hmmlearn(self):
    """Check that fhmmc's backward pass with one chain matches hmmlearn's.

    A 3-state GaussianHMM is fitted on ten noisy copies of a three-step
    sequence; the backward lattice of the first sequence is then computed by
    both implementations and compared with ``np.allclose``.
    """
    r = np.random.randn
    # range() instead of xrange() so the test also runs on Python 3.
    obs = [np.array([[-600 + r(), 100 + r()],
                     [-300 + r(), 200 + r()],
                     [0 + r(), 300 + r()]]) for _ in range(10)]
    hmm = GaussianHMM(n_components=3)
    hmm.fit(obs)
    n_states = hmm.n_components

    # Calculate bwdlattice using hmmlearn algorithm
    framelogprob = hmm._compute_log_likelihood(obs[0])
    start = timeit.default_timer()
    bwdlattice1 = hmm._do_backward_pass(framelogprob)
    print('hmmlearn took %fs' % (timeit.default_timer() - start))

    # Calculate bwdlattice using fhmm algorithm with #chains = 1. This should yield the exact same results
    start = timeit.default_timer()
    bwdlattice2 = np.zeros(bwdlattice1.shape)
    fhmmc._backward(obs[0].shape[0], 1, n_states,
                    [(x,) for x in range(n_states)],
                    hmm._log_startprob.reshape(1, n_states),
                    hmm._log_transmat.reshape(1, n_states, n_states),
                    framelogprob, bwdlattice2)
    print('fhmm took %fs' % (timeit.default_timer() - start))
    self.assertTrue(np.allclose(bwdlattice1, bwdlattice2))
class HmmClassifier():
    """Classify an input sequence by the reference sequence whose HMM fits it best."""

    def __init__(self, referenceSeqs, inputSeq):
        """Store the reference sequences, the input sequence and the HMM to reuse."""
        self.referenceSeqs = referenceSeqs
        self.inputSeq = inputSeq

        # feel free to change this model
        self.model = GaussianHMM(n_components=2, covariance_type="full", n_iter=2000)

    def predict(self):
        """Return the index of the reference sequence that best explains the input.

        The shared model is refitted on each reference sequence in turn and
        the input sequence is scored under it; the index of the highest
        log-likelihood wins (the first one on ties).

        Raises ValueError when there are no reference sequences.
        """
        probs = []
        for referenceSeq in self.referenceSeqs:
            self.model.fit(referenceSeq)
            # The decoded state path of the reference was computed but never
            # used, so that extra Viterbi pass has been dropped.
            probs.append(self.model.score(self.inputSeq))

        # return the index of the max prob
        return probs.index(max(probs))
def calculate_hmm_g(training_set, test_set, taxonomy, cursor, connection, settings):
    """Assemble a full-covariance GaussianHMM from training statistics and run predictions.

    The model is not fitted with EM: start probabilities, transition matrix,
    means and covariances are computed by project helpers from the training
    set and assigned to the model directly.  The assembled model is then
    handed to ``da_predictions`` together with the features extracted from
    the test set.
    """
    da_id_taxonomy = find_da_id(taxonomy, cursor)

    states, start_probability, transition_probability = \
        start_transition_probability_extraction(training_set, taxonomy)

    feature_list = extract_features_training_set_gaus(training_set, taxonomy, settings)
    # Feature dimensionality is read off the first sample of the first state.
    n_features = len(feature_list[states[0]][0])

    state_means = calculate_means(states, feature_list, n_features)
    state_covars = calculate_covariance(states, feature_list, n_features)
    # state_covars = diag_cov(states, feature_list, n_features, state_means)

    hmm = GaussianHMM(n_components=len(states), covariance_type='full')
    hmm.startprob_ = start_probability
    hmm.transmat_ = transition_probability
    hmm.means_ = state_means
    hmm.covars_ = state_covars

    test_seq, con_pathes = extract_features_test_set_gaus(test_set, taxonomy, settings)
    da_predictions(test_seq, hmm, con_pathes, states, da_id_taxonomy, taxonomy,
                   cursor, connection)
def __init__(self, n_components=1, covariance_type='diag', min_covar=1e-3,
             startprob_prior=1.0, transmat_prior=1.0, means_prior=0,
             means_weight=0, covars_prior=1e-2, covars_weight=1,
             algorithm="viterbi", random_state=None, n_iter=5, tol=1e-2,
             verbose=False, params="stmc", init_params="stmc",
             states_prior=None, fp_state=None):
    """Initialise the underlying GaussianHMM and record two extra settings.

    All arguments except ``states_prior`` and ``fp_state`` are forwarded to
    ``GaussianHMM.__init__``.  The covariance/mean related arguments are
    additionally re-assigned on the instance afterwards, and
    ``states_prior`` / ``fp_state`` are stored as plain attributes.
    """
    base_kwargs = dict(
        n_components=n_components,
        covariance_type=covariance_type,
        min_covar=min_covar,
        startprob_prior=startprob_prior,
        transmat_prior=transmat_prior,
        means_prior=means_prior,
        means_weight=means_weight,
        covars_prior=covars_prior,
        covars_weight=covars_weight,
        algorithm=algorithm,
        random_state=random_state,
        n_iter=n_iter,
        tol=tol,
        verbose=verbose,
        params=params,
        init_params=init_params,
    )
    GaussianHMM.__init__(self, **base_kwargs)

    # Assigned after the base initialiser, in this exact order.
    own_attributes = (
        ("covariance_type", covariance_type),
        ("min_covar", min_covar),
        ("means_prior", means_prior),
        ("means_weight", means_weight),
        ("covars_prior", covars_prior),
        ("covars_weight", covars_weight),
        ("states_prior", states_prior),
        ("fp_state", fp_state),
    )
    for attr_name, attr_value in own_attributes:
        setattr(self, attr_name, attr_value)
def select(self):
    """Select the state count with the best cross-validated log-likelihood.

    ``self.max_n_components`` is first capped (side effect) at the length of
    the shortest entry in ``self.sequences``.  With a single sequence no
    cross-validation is possible and ``self.base_model(self.n_constant)`` is
    returned.  Otherwise each state count is evaluated with 2-fold (two
    sequences) or 3-fold cross-validation; folds that fail to fit or score
    are ignored.  The winning state count is refitted with
    ``self.base_model``.

    :return: the refitted best model, or ``self.base_model(self.n_constant)``
        when no state count could be evaluated or the refit returned None
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    min_seq = min(len(seq) for seq in self.sequences)
    self.max_n_components = min(self.max_n_components, min_seq)
    hmm_model = self.base_model(self.n_constant)

    n_sequences = len(self.sequences)
    if n_sequences == 1:
        return hmm_model
    # random_state is deliberately not passed: without shuffling it has no
    # effect, and recent scikit-learn versions reject the combination.
    split_method = KFold(n_splits=2 if n_sequences == 2 else 3)

    record = float("-inf")
    best_num = None
    for num in range(self.min_n_components, self.max_n_components + 1):
        fold_scores = []
        for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
            train_X, train_lengths = combine_sequences(cv_train_idx, self.sequences)
            try:
                model = GaussianHMM(n_components=num, n_iter=1000).fit(train_X, train_lengths)
                test_X, test_lengths = combine_sequences(cv_test_idx, self.sequences)
                fold_scores.append(model.score(test_X, test_lengths))
            except Exception:
                # This fold could not be fitted/scored; use the others.
                continue

        # The original never incremented its fold counter, so this branch
        # was unreachable and the fallback model was always returned.
        if fold_scores:
            mean_logL = sum(fold_scores) / len(fold_scores)
            if mean_logL > record:
                record = mean_logL
                best_num = num

    if best_num is not None:
        refit = self.base_model(best_num)
        if refit is not None:
            hmm_model = refit
    return hmm_model
def calculate_weights(self, date, amount):
    """Compute the cash amount and per-asset weights for ``date``.

    On the first call (``self.stacked`` is False) the closing prices of every
    asset are loaded for all trading dates between ``self.start_date`` and
    ``date``; later calls only append the closing price of ``date``.  For
    each asset a 2-state GaussianHMM is fitted on the day-to-day price
    differences; when the last decoded state is state 1, the asset's weight
    is that state's self-transition probability, otherwise 0.

    Returns ``self.weight``: the remaining money first, followed by one
    weight per asset in ``self.asset_codes`` order.
    """
    def closing_price(day, ticker):
        # Closing price of the first matching row.
        return StockData.objects.filter(dt=day, ticker=ticker).values("price_close")[0]['price_close']

    if self.stacked == False:
        self.trainingDates.extend(
            entry['dt'] for entry in self.tradingDates
            if entry.get('dt') >= self.start_date and entry.get('dt') <= date
        )
        for assetCode in self.asset_codes:
            self.historical_Data[assetCode] = [
                closing_price(day, assetCode) for day in self.trainingDates
            ]
        self.stacked = True
    else:
        for assetCode in self.asset_codes:
            self.historical_Data[assetCode].append(closing_price(date, assetCode))

    target = {'money': amount}
    for assetCode in self.asset_codes:
        close_v = np.array(self.historical_Data[assetCode])
        X = np.column_stack([np.diff(close_v)])
        model = GaussianHMM(n_components=2, covariance_type="diag", n_iter=1000).fit(X)
        hidden_states = model.predict(X)

        stableProb = model.transmat_[1][1] if hidden_states[-1] == 1 else 0
        target[assetCode] = stableProb
        # Reserve money in proportion to the weight and the latest close.
        target['money'] -= stableProb * close_v[-1]

    self.weight = [target['money']] + [target[assetCode] for assetCode in self.asset_codes]
    return self.weight
def hmmtest(trade_data, test_data):
    """Fit a 4-state full-covariance GaussianHMM on ``test_data`` and plot it.

    Prints the mean and the covariance diagonal of every hidden state, hands
    the model, the decoded states and ``trade_data`` to ``plotHmmState`` and
    returns the fitted model.
    """
    # NOTE(review): rows with an infinite 'Strategy_Gross_Return_RDP_5' are
    # filtered out here, but the result is discarded right away and the
    # unfiltered frame is what gets fitted -- confirm which one is intended.
    _without_inf = test_data[test_data['Strategy_Gross_Return_RDP_5'] != float("inf")]
    observations = test_data

    # make an HMM instance and execute fit
    hmm_settings = dict(n_components=4, covariance_type='full', n_iter=1000)
    model = GaussianHMM(**hmm_settings).fit(observations)

    # predict the optimal sequence of internal hidden state
    hidden_states = model.predict(observations)

    # print trained parameters and plot
    print("means and vars of each hidden state")
    for state in range(model.n_components):
        print("%dth hidden state" % state)
        print("mean = ", model.means_[state])
        print("var = ", np.diag(model.covars_[state]))

    plotHmmState(model, hidden_states, trade_data)
    return model
def bench_gaussian_hmm(size):
    """Time sampling from, fitting and decoding a Gaussian HMM.

    A generator model with means 42 and 24 and unit covariances produces
    ``size`` samples; a fresh 2-state model is then fitted on them and used
    to decode the same samples.  Each phase runs inside ``timed_step``.
    """
    banner = "benchmarking Gaussian HMM on a sample of size {0}".format(size)
    print(banner.center(36, " "))

    generator = GaussianHMM()
    generator.means_ = [[42], [24]]
    generator.covars_ = [[1], [1]]

    with timed_step("generating sample"):
        sample, _states = generator.sample(size)

    with timed_step("fitting"):
        fitted = GaussianHMM(n_components=2).fit([sample])

    with timed_step("estimating states"):
        fitted.predict(sample)
def __create_model__(self, num_states, measurements, measurement_lengths):
    """Fit and return a diagonal-covariance GaussianHMM with ``num_states`` states.

    DeprecationWarning and RuntimeWarning are silenced process-wide first.
    Any exception raised while fitting is re-raised to the caller; in
    verbose mode the traceback and a failure message are printed before
    that.
    """
    # with warnings.catch_warnings():
    for silenced in (DeprecationWarning, RuntimeWarning):
        warnings.filterwarnings("ignore", category=silenced)

    try:
        estimator = GaussianHMM(n_components=num_states, covariance_type="diag",
                                n_iter=1000, random_state=self.random_seed,
                                verbose=False)
        hmm_model = estimator.fit(measurements, measurement_lengths)
        if self.verbose:
            print("model created for {} with {} states".format(self.this_word, num_states))
        return hmm_model
    except Exception:
        if self.verbose:
            traceback.print_exc()
            print("failure on {} with {} states".format(self.this_word, num_states))
        raise
def base_model(self, num_states):
    """Fit a diagonal-covariance GaussianHMM with ``num_states`` states.

    The model is fitted on ``self.X`` / ``self.lengths``.  DeprecationWarning
    is silenced process-wide first.

    :return: the fitted model, or None when fitting raised an exception
    """
    # with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    # warnings.filterwarnings("ignore", category=RuntimeWarning)
    try:
        hmm_model = GaussianHMM(n_components=num_states, covariance_type="diag", n_iter=1000,
                                random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
    except Exception:
        # Best effort: report and return None.  Exception (not a bare
        # except) so KeyboardInterrupt/SystemExit still propagate.
        if self.verbose:
            print("failure on {} with {} states".format(self.this_word, num_states))
        return None

    if self.verbose:
        print("model created for {} with {} states".format(self.this_word, num_states))
    return hmm_model
def cmodel(company, dt1, dt2, num_of_states):
    """Fit a GaussianHMM on daily price differences and estimate regime persistence.

    Quotes for ``company`` between ``dt1`` and ``dt2`` are fetched, the
    day-to-day difference of the value at index 2 of each quote is used as
    the single feature, and a full-covariance GaussianHMM with
    ``num_of_states`` states is fitted on it.  If state posteriors cannot be
    computed for that model, a diagonal-covariance model is fitted instead.
    The model is pickled to ``./sims_final/<company>_<n>_states_model_final.pkl``.

    Returns ``(expected_days, tr_mls)``: the self-transition probability of
    the most likely last state and the expected number of days spent in it
    (``1 / (1 - tr_mls)``).  Both are 1 when ``num_of_states`` is 1.
    """
    quotes = quotes_historical_yahoo_ochl(company, dt1, dt2)  # Here we set the time range

    # Unpack the close values and take their day-to-day difference.
    close_v = np.array([q[2] for q in quotes])
    X = np.column_stack([np.diff(close_v)])

    # Create HMM instance and fit
    model = GaussianHMM(n_components=num_of_states, covariance_type="full", n_iter=1000).fit(X)

    expected_days = 1
    tr_mls = 1
    if num_of_states > 1:
        # Identify the most likely last hidden state
        try:
            hidden_probs = model.predict_proba(X)
        except Exception:
            # Fall back to a diagonal-covariance model.  Exception (not a
            # bare except) so KeyboardInterrupt still propagates.
            model = GaussianHMM(n_components=num_of_states, covariance_type="diag", n_iter=1000).fit(X)
            hidden_probs = model.predict_proba(X)
        mls = hidden_probs[-1].argmax()

        # self transition probability for the most likely last hidden state
        tr_mls = model.transmat_[mls][mls]

        # we make use of the geometric series formula to calculate the number
        # of days expected to stay at the current state
        expected_days = 1.0 / (1 - tr_mls)

    # we save the model for future use
    fname = str(company) + "_" + str(num_of_states) + "_states_model_final.pkl"
    joblib.dump(model, os.path.join('./sims_final', fname))

    return expected_days, tr_mls
def averageLL(self, n):
    """Fit ``n``-state GaussianHMMs and report the best log-likelihood found.

    With fewer than two sequences a single model is fitted and scored on all
    data.  Otherwise the sequences are split with KFold (three splits, or
    two when only two sequences exist); a model is trained on every training
    fold and scored on the matching test fold.

    NOTE(review): despite the name, the value returned in the
    cross-validated case is the best single-fold log-likelihood together
    with that fold's model, not an average over folds -- confirm intent.

    :return: ``(logL, model)``; ``(-inf, None)`` when nothing could be fitted
    """
    def fit_and_score(train_X, train_lengths, test_X, test_lengths):
        # Train on one set and return (test log-likelihood, model).
        model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                            random_state=self.random_state, verbose=False).fit(train_X, train_lengths)
        return model.score(test_X, test_lengths), model

    if len(self.sequences) < 2:
        # Too few sequences to split: train and score on the same data.
        try:
            return fit_and_score(self.X, self.lengths, self.X, self.lengths)
        except Exception:
            return float("-inf"), None

    best_mod = None
    bestLogL = float("-inf")
    split_method = KFold(n_splits=min(3, len(self.sequences)))
    for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
        Xtrain, lengths_train = combine_sequences(cv_train_idx, self.sequences)
        Xtest, lengths_test = combine_sequences(cv_test_idx, self.sequences)
        try:
            logL, mod_n = fit_and_score(Xtrain, lengths_train, Xtest, lengths_test)
        except Exception:
            # Model creation or scoring failed for this fold; try the next.
            continue
        if logL > bestLogL:
            bestLogL = logL
            best_mod = mod_n

    return bestLogL, best_mod
def fit_and_apply_hmm(normal, infected, chosen, data):
    """Train a GaussianHMM on the chosen host and score every other host with it.

    Sliding windows of size 4 are built from ``chosen`` and a 5-state
    GaussianHMM is fitted on them.  For every IP in ``normal`` and then in
    ``infected``, the flows where the IP is source or destination are
    windowed and decoded with the model; the first element returned by
    ``decode`` is stored for that IP.  Hosts with too few flows to build a
    window get 0.

    Returns ``(hosts_log_likelihood, modeled_log_likelihood)``: the per-IP
    dict and the value obtained for the training host itself.
    """
    # define sliding window size and number of components
    win, components = 4, 5
    # uncomment the next line to find the optimal window size and number of components
    # it takes some time though...
    # win, components = find_optimal_params(chosen)

    win_data = get_windows(chosen, win)

    # learn a Gaussian Hidden Markov Model from the chosen host data
    hmm = GaussianHMM(n_components=components)
    hmm.fit(win_data)

    # store the log-likelihood of the host that trained the model
    modeled_log_likelihood = hmm.decode(win_data)[0]

    hosts_log_likelihood = {}
    # Normal IPs first, then infected ones: an IP present in both groups
    # ends up with the value computed in the infected pass.
    for group in (normal, infected):
        for ip in group:
            # get the flows of that host only
            host_data = data[(data['src_ip'] == ip) | (data['dst_ip'] == ip)]
            if len(host_data) - win > 0:
                # enough flows for at least one window
                host_windows = get_windows(host_data, win)
                hosts_log_likelihood[ip] = hmm.decode(host_windows)[0]
            else:
                hosts_log_likelihood[ip] = 0

    return hosts_log_likelihood, modeled_log_likelihood
def select(self):
    """Select the state count with the best (cross-validated) log-likelihood.

    With three or more sequences every state count is evaluated with 3-fold
    cross-validation and ranked by its mean held-out log-likelihood; a state
    count is skipped as soon as one fold fails.  With fewer than three
    sequences the candidates are fitted and scored on all data.  If no
    candidate succeeds, 3 states are used.

    :return: ``self.base_model(best_n_components)``
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    n_splits = 3
    use_cv = len(self.sequences) >= n_splits
    # random_state is deliberately not passed to KFold: without shuffling it
    # has no effect, and recent scikit-learn versions reject the combination.
    split_method = KFold(n_splits=n_splits) if use_cv else None

    best_score, best_n_components = None, None
    for n_components in range(self.min_n_components, self.max_n_components + 1):
        if not use_cv:
            try:
                model = GaussianHMM(n_components=n_components, n_iter=1000).fit(self.X, self.lengths)
                score = model.score(self.X, self.lengths)
            except Exception:
                continue
        else:
            scores = []
            for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                X_train, lengths_train = combine_sequences(cv_train_idx, self.sequences)
                X_test, lengths_test = combine_sequences(cv_test_idx, self.sequences)
                try:
                    model = GaussianHMM(n_components=n_components, n_iter=1000).fit(X_train, lengths_train)
                    scores.append(model.score(X_test, lengths_test))
                except Exception:
                    break
            if len(scores) != n_splits:
                # Skip state counts for which any fold failed.
                continue
            score = np.average(scores)

        if best_score is None or score > best_score:
            best_score, best_n_components = score, n_components

    if best_score is None:
        best_n_components = 3
    return self.base_model(best_n_components)
def get_model(self):
    """Fit the scaler -> PCA -> GaussianHMM pipeline and tabulate its states.

    The pipeline is fitted on the 'return' column plus ``self.features`` of
    ``self.train`` and stored in ``self.pipe_pca``.  For each of the three
    hidden states, the first component of its mean and the first diagonal
    entry of its covariance are collected into ``self.results`` (indexed by
    state), after which ``self.get_renamed_states()`` is called.
    """
    self.pipe_pca = make_pipeline(
        StandardScaler(),
        PrincipalComponentAnalysis(n_components=3),
        GaussianHMM(n_components=3, covariance_type='full', random_state=7),
    )
    training_columns = ['return'] + self.features
    self.pipe_pca.fit(self.train[training_columns])

    # The HMM is the third and last pipeline step.
    hmm = self.pipe_pca.steps[2][1]

    rows = [
        [state, hmm.means_[state][0], np.diag(hmm.covars_[state])[0]]
        for state in range(3)
    ]
    table = pd.DataFrame(rows, columns=['state', 'train_mean', 'train_var'])
    self.results = table.set_index('state')
    self.get_renamed_states()
def select(self):
    """Select the state count with the best cross-validated log-likelihood.

    Every state count from ``self.min_n_components`` to
    ``self.max_n_components`` is evaluated.  With more than one sequence the
    score is the mean held-out log-likelihood over ``min(3, n_sequences)``
    folds; with a single sequence the model is fitted and scored on all
    data.  State counts that raise ValueError are skipped.

    :return: ``self.base_model(n)`` for the best state count; when no state
        count could be scored, ``self.base_model(self.min_n_components)``
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    def new_model():
        return GaussianHMM(n_components=nStates, covariance_type="diag", n_iter=1000,
                           random_state=self.random_state, verbose=False)

    isSplit = len(self.sequences) != 1
    if isSplit:
        nFolds = min(3, len(self.sequences))
        split_method = KFold(n_splits=nFolds)

    best_score = float('-inf')
    best_nStates = None
    # The original range started at max_n_components, so only the largest
    # state count was ever evaluated.
    for nStates in range(self.min_n_components, self.max_n_components + 1):
        try:
            if isSplit:
                cv_score = 0
                for train_idx, test_idx in split_method.split(self.sequences):
                    train_X, train_lengths = combine_sequences(train_idx, self.sequences)
                    test_X, test_lengths = combine_sequences(test_idx, self.sequences)
                    cv_model = new_model().fit(train_X, train_lengths)
                    cv_score += cv_model.score(test_X, test_lengths)
                avg_score = cv_score / nFolds
            else:
                cv_model = new_model().fit(self.X, self.lengths)
                avg_score = cv_model.score(self.X, self.lengths)
        except ValueError:
            # One failing state count must not end the whole search.
            logging.debug("CV ValueError on {} with {} states".format(self.this_word, nStates))
            continue

        if avg_score > best_score:
            best_score = avg_score
            best_nStates = nStates
            logging.debug("CV better score for {} with {} states".format(self.this_word, nStates))

    if best_nStates is None:
        # Nothing could be scored; fall back to the smallest model.
        best_nStates = self.min_n_components
    return self.base_model(best_nStates)
def select(self):
    """Select the model for self.this_word with the highest DIC score.

    For every state count a diagonal-covariance GaussianHMM is fitted on
    this word's data.  Its DIC is the log-likelihood of this word minus the
    mean log-likelihood that the same model assigns to every other word in
    ``self.words`` (data looked up in ``self.hwords``).  Words that cannot
    be scored are left out of the mean.

    :return: the best fitted model, or None when no candidate could be fitted
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    max_dic_score = float("-inf")
    best_model = None
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
            log_l = model.score(self.X, self.lengths)
        except Exception:
            continue

        anti_probabilities = []
        for word in self.words:
            # Compare by value: identity comparison of strings is unreliable.
            if word == self.this_word:
                continue
            try:
                x, lengths = self.hwords[word]
                # DIC asks how well THIS word's model explains the competing
                # words, so the candidate itself is scored here (no separate
                # per-word model is trained).
                anti_probabilities.append(model.score(x, lengths))
            except Exception:
                continue

        if anti_probabilities:
            dic_score = log_l - np.mean(anti_probabilities)
        else:
            # No competitor could be scored: avoid the NaN mean of an empty
            # list and rank by plain log-likelihood.
            dic_score = log_l

        if dic_score > max_dic_score:
            max_dic_score = dic_score
            best_model = model
    return best_model
def select(self):
    """Select the state count with the best cross-validated log-likelihood.

    Cross-validation only runs when there are more than two sequences: each
    state count is then scored by its mean held-out log-likelihood over
    3 folds (folds that fail to fit or score are ignored) and the winner is
    refitted with ``self.base_model``.

    :return: the refitted best model, or ``self.base_model(self.n_constant)``
        when cross-validation was not possible or produced no model
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    best_n = None
    best_score = float("-inf")

    if len(self.sequences) > 2:
        split_method = KFold(n_splits=min(3, len(self.sequences)))
        for n in range(self.min_n_components, self.max_n_components + 1):
            # One list per state count; resetting it per fold would reduce
            # the "mean" to the score of the last fold.
            logL = []
            for train_index, test_index in split_method.split(self.sequences):
                X_train, train_length = combine_sequences(train_index, self.sequences)
                X_test, test_length = combine_sequences(test_index, self.sequences)
                try:
                    hmm_model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                            random_state=self.random_state, verbose=False).fit(
                        X_train, train_length)
                    # The original scored an undefined `_model`; the
                    # resulting NameError was swallowed, so no fold was ever
                    # scored.
                    logL.append(hmm_model.score(X_test, test_length))
                except Exception:
                    continue

            if logL and np.mean(logL) > best_score:
                best_score = np.mean(logL)
                best_n = n

    best_model = self.base_model(best_n) if best_n is not None else None
    if best_model is None:
        best_model = self.base_model(self.n_constant)
    return best_model
def UpdateHMM(beginDate, endDate):
    """Fit a 3-state GaussianHMM on index and margin-trading features.

    Daily data for index '000001' and margin-trading totals for the XSHE and
    XSHG exchanges are fetched through ``DataAPI`` for the given date range.
    Five features are stacked column-wise: 1-day log return, 5-day log
    return, log high/low spread, turnover volume and the log change of the
    combined margin-trading value.  The first five days are dropped from
    every feature.

    Returns the fitted GaussianHMM.
    """
    # beginDate = '20110401'
    # endDate = '20140401'

    def margin_totals(exchange):
        # Margin-trading totals of one exchange for the requested range.
        return DataAPI.FstTotalGet(exchangeCD=exchange, beginDate=beginDate,
                                   endDate=endDate, field=['tradeVal'], pandas="1")

    # Daily index quotes
    data = DataAPI.MktIdxdGet(ticker='000001', beginDate=beginDate, endDate=endDate,
                              field=['tradeDate', 'closeIndex', 'lowestIndex',
                                     'highestIndex', 'turnoverVol'],
                              pandas="1")
    shenzhen = margin_totals(u"XSHE")   # Shenzhen exchange margin-trading data
    shanghai = margin_totals(u"XSHG")
    tradeVal = shenzhen + shanghai      # combined margin-trading data

    tradeDate = pd.to_datetime(data['tradeDate'][5:])   # list of dates (not used below)
    volume = data['turnoverVol'][5:]                    # turnover volume
    closeIndex = data['closeIndex']                     # closing prices

    # log spread between the day's high and low
    high_log = np.log(np.array(data['highestIndex']))
    low_log = np.log(np.array(data['lowestIndex']))
    deltaIndex = (high_log - low_log)[5:]

    # 1-day log return
    logReturn1 = np.array(np.diff(np.log(closeIndex)))[4:]
    # 5-day log return
    logReturn5 = np.log(np.array(closeIndex[5:])) - np.log(np.array(closeIndex[:-5]))
    # log change of the margin-trading value
    logReturnFst = np.array(np.diff(np.log(tradeVal['tradeVal'])))[4:]
    closeIndex = closeIndex[5:]

    # Combine the feature arrays into one 2-D array
    X = np.column_stack([logReturn1, logReturn5, deltaIndex, volume, logReturnFst])

    # Make an HMM instance and execute fit
    hmm = GaussianHMM(n_components=3, covariance_type="diag", n_iter=800)
    return hmm.fit([X])
def __init__(self, feature_metadata, minfraction, scale, kernelfn, tau=None,
             sigma=None, ncomponents=1, n_iter=1):
    """Create the case bases, bookkeeping containers and HMM.

    :param feature_metadata: feature description handed to both case bases.
        Its 'delta_state' entry is deleted in place after the basis case
        base has been built.
    :param minfraction: stored as ``self._minfraction``.
    :param scale: stored as ``self._scale``.
    :param kernelfn: stored as ``self._kernelfn``.
    :param tau: forwarded to the retention method of the basis case base.
    :param sigma: forwarded to the retention method of the basis case base.
    :param ncomponents: number of components of the GaussianHMM.
    :param n_iter: ``n_iter`` of the GaussianHMM.
    """
    super(CasmlApproximator, self).__init__()

    self._kernelfn = kernelfn
    self._scale = scale
    self._minfraction = minfraction
    self._new_sequence = True

    # All existing CasmlApproximations created by this CasmlApproximator.
    # The keys serve as both queries and bases (queries are a superset of
    # bases), so a datum may be None if its key is only a basis, not a query.
    # type: dict[tuple[MDPState, MDPAction], Approximation]
    self._queries = weakref.WeakValueDictionary()

    # The subset of query keys that are also bases.
    # NOTE(review): the earlier comment said arrival order is preserved, but
    # a plain set keeps no order -- confirm what is intended.
    # type: set[tuple[MDPState, Hashable]]
    self._bases = set()

    # type: list[ndarray]
    self._fit_X = []

    # Case base holding the observations as c = <s, a, ds>, where
    # ds = s_{i+1} - s_i, to identify possible successor states.
    # type: CaseBase
    self._basiscb = CaseBase(feature_metadata,
                             retention_method=self._RetentionMethod,
                             retention_method_params=(tau, sigma),
                             name='basiscb')

    # The query case base is built without 'delta_state'.
    # NOTE(review): this mutates the caller's dict -- confirm that is intended.
    del feature_metadata['delta_state']

    # Invariant: contains all the keys in queries.
    # type: CaseBase
    self._querycb = CaseBase(feature_metadata, name='querycb')

    # Hidden Markov model holding the observations as
    # seq = <s_{i}, s_{i+1}> to reason on the transition probabilities of
    # successor states.
    # type: GaussianHMM
    self._hmm = GaussianHMM(ncomponents, n_iter=n_iter)

    self._not_add_bases = 0
    self._not_add_count = 0
def select(self):
    """Select the number of hidden states by Discriminative Information
    Criterion (DIC).

    For every candidate state count the model obtained from
    ``self.base_model`` is scored on this word and on every other word in
    ``self.hwords``::

        DIC = log P(this word) - mean(log P(other words))

    The mean runs over the other words the model could actually score.

    :return: a GaussianHMM refitted on ``self.X`` / ``self.lengths`` with the
        state count of highest DIC, or None when no candidate could be
        scored or the final fit fails.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    best_dic = None
    best_num_states = None
    for num_states in range(self.min_n_components,
                            self.max_n_components + 1):
        try:
            hmm_model = self.base_model(num_states)
            if hmm_model is None:
                continue
            log_p = hmm_model.score(self.X, self.lengths)
        except Exception:
            continue

        other_total = 0.0
        other_count = 0
        for word in self.hwords:
            if word == self.this_word:
                continue
            try:
                other_X, other_lengths = self.hwords[word]
                other_total += hmm_model.score(other_X, other_lengths)
            except Exception:
                # Words this model cannot score are left out of the mean.
                continue
            other_count += 1

        # Average the competitors' log-likelihood; the previous code
        # subtracted the raw sum, so the penalty grew with vocabulary size.
        penalty = other_total / other_count if other_count else 0.0
        dic = log_p - penalty
        if dic != dic:
            # NaN scores cannot be ranked.
            continue
        # Strict '>' keeps the first candidate on ties.
        if best_dic is None or dic > best_dic:
            best_dic = dic
            best_num_states = num_states

    if best_num_states is None:
        return None
    try:
        return GaussianHMM(n_components=best_num_states,
                           covariance_type="diag",
                           n_iter=1000,
                           random_state=self.random_state,
                           verbose=False).fit(self.X, self.lengths)
    except Exception:
        return None
def select(self):
    """Return the fitted GaussianHMM with the highest DIC, or None.

    DIC = log P(X_i) - 1/(M-1) * SUM(log P(X_j), j != i), where M is
    ``len(self.hwords)``.  Candidates whose training or scoring raises are
    skipped (and reported when ``self.verbose`` is set).
    """
    winner = None
    winner_dic = None

    for num_states in range(self.min_n_components,
                            self.max_n_components + 1):
        try:
            candidate = GaussianHMM(n_components=num_states,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=self.verbose)
            if self.verbose:
                print("model created for {} with {} states".format(
                    self.this_word, num_states))

            candidate.fit(self.X, self.lengths)
            own_logl = candidate.score(self.X, self.lengths)

            # Accumulated log-likelihood of every competing word.
            rivals_logl = 0
            for rival in self.hwords:
                if rival == self.this_word:
                    continue
                rivals_logl += candidate.score(*self.hwords[rival])

            weight = float(1) / (len(self.hwords) - 1)
            candidate_dic = own_logl - weight * rivals_logl

            if winner_dic is None or candidate_dic > winner_dic:
                winner = candidate
                winner_dic = candidate_dic
        except:
            if self.verbose:
                print("failure on {} with {} states".format(
                    self.this_word, num_states))

    return winner
def select(self):
    """Return the model with the lowest BIC over the candidate state counts.

    ``self.score(n)`` is expected to yield a ``(BIC, model)`` pair; any
    candidate for which it raises is ignored.

    :return: GaussianHMM object -- an unfitted default instance when no
        candidate succeeded.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    lowest_bic = float("inf")
    chosen = GaussianHMM()

    candidates = range(self.min_n_components, self.max_n_components + 1)
    for n_states in candidates:
        try:
            candidate_bic, candidate = self.score(n_states)
            if candidate_bic < lowest_bic:
                lowest_bic, chosen = candidate_bic, candidate
        except:
            pass

    return chosen
def profileFeature(data, kmeans_hour, kmeans_day, labels_hour, labels_day,
                   temp8760):
    """Build the static and dynamic feature lists of one profile.

    Static features are summary statistics of ``data`` (including mean and
    standard deviation of its FFT) followed by the two k-means inputs.
    Dynamic features are, for the hourly and then the daily labels, the
    transition matrix of a 5-state GaussianHMM fitted on the labels and the
    entropy returned by ``getEntropy``.  ``plotTempFeature`` is called before
    returning.

    :return: ``(staticFeatures, dynamicFeatures)``
    """
    staticFeatures = [data.max(), data.min(), data.median(), data.mean(),
                      data.std()]
    staticFeatures.append(np.mean(np.fft.fft(data)))
    staticFeatures.append(np.std(np.fft.fft(data)))
    staticFeatures.extend([kmeans_hour, kmeans_day])

    n_hidden_states = 5
    dynamicFeatures = []
    for labels in (labels_hour, labels_day):
        hmm = GaussianHMM(n_components=n_hidden_states)
        hmm.fit(labels.reshape(-1, 1))
        # Transition matrix of the fitted model.
        dynamicFeatures.append(hmm.transmat_)
        # Behaviour information entropy of the label sequence.
        dynamicFeatures.append(getEntropy(labels))

    plotTempFeature(data, temp8760)
    return staticFeatures, dynamicFeatures
def select(self):
    """Select a model by mean held-out log-likelihood over K folds.

    For every candidate state count a GaussianHMM is trained on each
    training fold and scored on the matching test fold.  Folds that fail to
    train or score are ignored; candidates without any scored fold are
    skipped.

    :return: the model trained on the last successfully scored fold of the
        best candidate, or None when fewer than two sequences are available
        or no candidate could be scored.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    best_score = float("-inf")
    best_model = None
    if len(self.sequences) < 2:
        return best_model

    kf = KFold(n_splits=self.splits())
    for n_components in range(self.min_n_components,
                              self.max_n_components + 1):
        fold_scores = []
        fold_model = None
        for cv_train, cv_test in kf.split(self.sequences):
            try:
                train_X, train_lengths = combine_sequences(
                    cv_train, self.sequences)
                test_X, test_lengths = combine_sequences(
                    cv_test, self.sequences)
                candidate = GaussianHMM(n_components=n_components,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(
                                            train_X, train_lengths)
                # Score the model that was just trained (the previous code
                # referenced an undefined name, so no fold was ever scored).
                fold_scores.append(candidate.score(test_X, test_lengths))
            except Exception:
                continue
            fold_model = candidate

        # An unscored candidate must not compete with a default score of 0.
        if not fold_scores:
            continue
        mean_score = sum(fold_scores) / len(fold_scores)
        if mean_score > best_score:
            best_score = mean_score
            best_model = fold_model

    return best_model
def get_model_cv(self, num_of_state, split_method=None):
    """Return the cross-validated log-likelihood for ``num_of_state`` states.

    Without a ``split_method`` (sample too small to fold) the model is
    trained and scored on all sequences and that in-sample log-likelihood is
    returned.  Otherwise the mean held-out log-likelihood over the folds
    that trained and scored successfully is returned.  ``float("-inf")`` is
    returned when nothing could be scored.
    """
    sequences = self.sequences

    def train(X, lengths):
        return GaussianHMM(n_components=num_of_state,
                           covariance_type="diag",
                           n_iter=1000,
                           random_state=self.random_state,
                           verbose=False).fit(X, lengths)

    if not split_method:
        try:
            all_X, all_lengths = sequence_2_Xlengths(sequences)
            return train(all_X, all_lengths).score(all_X, all_lengths)
        except:
            return float("-inf")

    held_out = []
    for train_idx, test_idx in split_method.split(sequences):
        try:
            fold_train = [sequences[k] for k in train_idx]
            train_X, train_lengths = sequence_2_Xlengths(fold_train)
            fold_test = [sequences[k] for k in test_idx]
            test_X, test_lengths = sequence_2_Xlengths(fold_test)
            fold_model = train(train_X, train_lengths)
            held_out.append(fold_model.score(test_X, test_lengths))
        except:
            pass

    # No fold could be scored at all.
    if len(held_out) == 0:
        return float("-inf")
    return np.mean(held_out)
def select(self):
    """Select the best model for ``self.this_word`` by BIC score.

    Candidates run from ``self.min_n_components`` to
    ``self.max_n_components`` hidden states.  With the in-sample
    log-likelihood logL, p parameters and N = ``len(self.X)``::

        BIC = -2 * logL + p * log(N)
        p   = n * n + 2 * n * n_features - 1

    (formula from http://www2.imm.dtu.dk/courses/02433/doc/ch6_slides.pdf).

    :return: GaussianHMM object -- an unfitted default instance when no
        candidate could be trained.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    lowest_bic = math.inf
    chosen = GaussianHMM()

    for n_states in range(self.min_n_components, self.max_n_components + 1):
        try:
            candidate = GaussianHMM(n_components=n_states, n_iter=100)
            candidate.fit(self.X, self.lengths)

            # In-sample log-likelihood.
            log_likelihood = candidate.score(self.X, self.lengths)

            n_features = len(self.X[0])
            num_parameters = (n_states * n_states +
                              2 * n_states * n_features - 1)
            candidate_bic = ((-2) * log_likelihood +
                             num_parameters * math.log(len(self.X)))

            # '<=' lets a later candidate win on an exact tie.
            if candidate_bic <= lowest_bic:
                chosen, lowest_bic = candidate, candidate_bic
        except:
            continue

    return chosen
def train_all(df):
    """Train one 11-state spherical GaussianHMM per gesture in ``df``.

    For each gesture the rows are taken as one sequence per speaker, three
    columns (positions 56-58) are dropped, the rest is min-max scaled per
    column and NaNs are replaced by 0.0 before fitting.

    :param df: DataFrame with at least the columns "gesture" and "speaker".
    :return: dict mapping gesture -> fitted GaussianHMM.
    """
    models = {}
    for word in df["gesture"].unique():
        word_rows = df[df["gesture"] == word]

        # One sequence length per speaker.
        # NOTE(review): presumes each speaker's rows are contiguous and in
        # the order returned by unique() -- confirm against the data.
        lengths = [
            len(word_rows[word_rows["speaker"] == speaker])
            for speaker in word_rows["speaker"].unique()
        ]

        dropped = [word_rows.columns[idx] for idx in (56, 57, 58)]
        features = word_rows.drop(columns=dropped)

        # Min-max scale per column; constant columns give NaN, set to 0.0.
        low = features.min()
        features = (features - low) / (features.max() - low)
        features = features.fillna(0.0)

        models[word] = GaussianHMM(n_components=11,
                                   covariance_type="spherical",
                                   n_iter=1000).fit(features, lengths)
    return models
def cv_model(self, num_states, training_X, training_lengths): warnings.filterwarnings("ignore", category=DeprecationWarning) #training_X, training_lengths = combine_sequences(training_fold_idx, # self.X) try: hmm_model = GaussianHMM(n_components=num_states, covariance_type="diag", n_iter=1000, random_state=self.random_state, verbose=False).fit(training_X, training_lengths) if self.verbose: print("training model created for {} with {} states based on\ dataset {}".format(self.this_word, num_states, training_X)) return hmm_model except: if self.verbose: print("model creation failed for {} with {} states based on\ dataset {}".format(self.this_word, num_states, training_X)) return None
def fit(self, tr_seqs): self.tr_seqs = tr_seqs self.n_classes = len(tr_seqs) self.models = [] for class_seqs in tr_seqs: lengths = [seq.shape[0] for seq in class_seqs] X = np.vstack(class_seqs) print X.shape, len(lengths) start_prob = np.ones(self.n_components) start_prob /= np.sum(start_prob) transmat = np.ones((self.n_components, self.n_components)) for i in range(self.n_components): transmat[i, :] /= transmat[i, :].sum() trained = False with warnings.catch_warnings(): warnings.filterwarnings('error') while not trained: try: model = GaussianHMM(n_components=self.n_components, covariance_type='diag', n_iter=50, startprob_prior=start_prob, transmat_prior=transmat)\ .fit(X, lengths) trained = True except RuntimeWarning as w: print w print start_prob print transmat print lengths print X start_prob = np.random.random(self.n_components) transmat = np.random.random( (self.n_components, self.n_components)) self.models.append(model)
def select(self):
    """Select a model by cross-validated log-likelihood.

    With at least two sequences every candidate state count is scored by
    its mean held-out log-likelihood over ``min(len(sequences), 3)``
    shuffled folds.  With fewer sequences the candidate is scored on its
    own training data.  The winner is always a model trained on the full
    ``self.X`` / ``self.lengths``.

    :return: the best GaussianHMM, or None when no candidate could be
        trained and scored.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    def fit_hmm(n, X, lengths):
        return GaussianHMM(n_components=n,
                           covariance_type="diag",
                           n_iter=1000,
                           random_state=self.random_state,
                           verbose=False).fit(X, lengths)

    max_score = None
    max_model = None
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            final_model = None
            if len(self.sequences) >= 2:
                folds = min(len(self.sequences), 3)
                # Seed the shuffle so repeated runs pick the same folds.
                split_method = KFold(shuffle=True, n_splits=folds,
                                     random_state=self.random_state)
                all_score = 0.0
                qty = 0
                for cv_train_idx, cv_test_idx in split_method.split(
                        self.sequences):
                    # Unpack (X, lengths) directly; wrapping the pair in
                    # np.asarray builds a ragged array.
                    X_train, lengths_train = combine_sequences(
                        cv_train_idx, self.sequences)
                    X_test, lengths_test = combine_sequences(
                        cv_test_idx, self.sequences)
                    model = fit_hmm(n, X_train, lengths_train)
                    all_score += model.score(X_test, lengths_test)
                    qty += 1
                score = all_score / qty
            else:
                # Too few sequences to fold: score the full-data model
                # (the previous code scored an undefined `model` here).
                final_model = fit_hmm(n, self.X, self.lengths)
                score = final_model.score(self.X, self.lengths)

            if max_score is None or max_score < score:
                if final_model is None:
                    final_model = fit_hmm(n, self.X, self.lengths)
                # Update score and model together, after the fit succeeded.
                max_score = score
                max_model = final_model
        except Exception:
            # This candidate could not be trained or scored; try the next.
            continue

    return max_model
def create_combined_hmm(model):
    """Assemble one full-covariance GaussianHMM from per-appliance HMMs.

    The start probabilities, transition matrices and flattened means of
    every entry in ``model`` are combined by ``compute_pi_fhmm``,
    ``compute_A_fhmm`` and ``compute_means_fhmm``; the results are assigned
    directly onto a new GaussianHMM (no fitting takes place).

    :param model: mapping appliance -> fitted HMM.
    :return: the combined GaussianHMM.
    """
    per_appliance = [model[appliance] for appliance in model]

    pi_combined = compute_pi_fhmm([hmm.startprob_ for hmm in per_appliance])
    A_combined = compute_A_fhmm([hmm.transmat_ for hmm in per_appliance])
    mean_combined, cov_combined = compute_means_fhmm(
        [hmm.means_.flatten().tolist() for hmm in per_appliance])

    combined_model = GaussianHMM(n_components=len(pi_combined),
                                 covariance_type='full')
    combined_model.startprob_ = pi_combined
    combined_model.transmat_ = A_combined
    combined_model.covars_ = cov_combined
    combined_model.means_ = mean_combined
    return combined_model
def select(self):
    """Select the number of hidden states by DIC and refit that model.

    For each candidate state count the base model's log-likelihood on this
    word is reduced by the mean log-likelihood over the other words it can
    score.  The state count with the largest value is refitted on
    ``self.X`` / ``self.lengths``.

    :return: the refitted GaussianHMM, or None when no candidate produced a
        score or the refit raised ValueError.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    competitors = list(self.words)
    competitors.remove(self.this_word)

    dic_values = []   # DIC of every candidate that could be scored
    state_counts = []  # matching number of hidden states

    for n_states in range(self.min_n_components,
                          self.max_n_components + 1):
        try:
            # Candidates that cannot be trained or scored are skipped.
            competitors_logl = 0
            candidate = self.base_model(num_states=n_states)
            own_logl = candidate.score(self.X, self.lengths)
            scorable = 0
            for word in competitors:
                X, lengths = self.hwords[word]
                try:
                    competitors_logl = competitors_logl + candidate.score(
                        X, lengths)
                    scorable = scorable + 1
                except:
                    print('{0} is not scorable!'.format(word))
            dic_values.append(own_logl - competitors_logl / scorable)
            state_counts.append(n_states)
        except:
            pass

    # Pick the state count with the largest DIC; max() raises ValueError
    # when no candidate was scored at all.
    try:
        best_n = state_counts[dic_values.index(max(dic_values))]
        return GaussianHMM(n_components=best_n,
                           covariance_type="diag",
                           n_iter=1000,
                           random_state=self.random_state,
                           verbose=False).fit(self.X, self.lengths)
    except ValueError:
        return None
def fit(self, data):
    """Fit one HMM per class label.

    :param data: sequence of datapoints with the dimensions
        [number of datapoints][2][1 or 2].  Index 0 of a datapoint is the
        veracity label of the rumour, index 1 its feature sequence.  Each
        feature vector has size 1 or 2, depending on whether only SDQC
        labels are used or timestamps are included as well.
    :return: ``self``, with a sub-model fitted for every label that did not
        already have one.  Empty ``data`` leaves the models untouched.
    """
    if len(data) == 0:
        return self

    # Read the feature width from the first datapoint; the previous code
    # used data[1] and failed when only one datapoint was given.
    feature_count = len(data[0][1][0])

    # Partition the feature sequences by label.
    classes = dict()
    for datapoint in data:
        classes.setdefault(datapoint[0], []).append(datapoint[1])

    # Make and fit a model for each label that has none yet.
    for veracity_label, sdqc_labels in classes.items():
        if veracity_label in self.models:
            continue

        lengths = [len(x) for x in sdqc_labels]
        thread_flat = np.array(flatten(sdqc_labels)).reshape(
            -1, feature_count)

        if self.model_type == 'gaussian':
            self.models[veracity_label] = GaussianHMM(
                n_components=self.components).fit(thread_flat,
                                                  lengths=lengths)
        elif self.model_type == 'multinomial':
            # MultinomialHMM does not support float values, so only the
            # first feature is kept as an int; timestamps are ignored.
            thread_flat = [[int(x[0])] for x in thread_flat]
            self.models[veracity_label] = MultinomialHMM(
                n_components=self.components).fit(thread_flat,
                                                  lengths=lengths)
    return self
def fitHMM(logAnnualQ_cut):
    """Fit a 2-state Gaussian HMM to every column of ``logAnnualQ_cut``.

    Only rows from index 35 onward are used for fitting.  For every column
    the states are ordered so that state 0 has the smaller mean.

    :param logAnnualQ_cut: 2-D array, one column per site.
        NOTE(review): presumably log annual flows with one row per year --
        confirm with the caller.
    :return: ``(mus, sigmas, P, pi)`` -- state means (2, nSites), state
        standard deviations (2, nSites), transition matrices
        (2, 2, nSites) and stationary distributions (2, nSites).
    """
    nSites = np.shape(logAnnualQ_cut)[1]
    mus = np.zeros([2, nSites])
    sigmas = np.zeros([2, nSites])
    P = np.zeros([2, 2, nSites])
    pi = np.zeros([2, nSites])

    for site in range(nSites):
        # Fit to the later part of the record only.
        record = logAnnualQ_cut[35::, site]
        hmm_model = GaussianHMM(n_components=2, n_iter=1000).fit(
            np.reshape(record, [len(record), 1]))
        n_values = hmm_model.means_.size

        # Means and standard deviations of the two state distributions.
        mus[:, site] = np.reshape(hmm_model.means_, n_values)
        state_variances = np.array([
            np.diag(hmm_model.covars_[0]),
            np.diag(hmm_model.covars_[1])
        ])
        sigmas[:, site] = np.reshape(np.sqrt(state_variances), n_values)

        # Transition probabilities.
        P[:, :, site] = hmm_model.transmat_

        # Reorder so that state 0 is the one with the smaller mean.
        if mus[0, site] > mus[1, site]:
            mus[:, site] = np.flipud(mus[:, site])
            sigmas[:, site] = np.flipud(sigmas[:, site])
            P[:, :, site] = np.fliplr(np.flipud(P[:, :, site]))

        # Stationary distribution: eigenvector of P^T whose eigenvalue is
        # closest to 1, normalised to sum to one.
        eigenvals, eigenvecs = np.linalg.eig(np.transpose(P[:, :, site]))
        unit_idx = np.argmin(np.abs(eigenvals - 1))
        unit_vec = eigenvecs[:, unit_idx]
        pi[:, site] = unit_vec / np.sum(unit_vec)

    return mus, sigmas, P, pi
def select(self):
    """Select a model by cross-validated log-likelihood.

    With at most two sequences every candidate is trained and scored on the
    full data.  Otherwise it is trained and scored over K folds and the
    mean held-out log-likelihood is used.  Folds or candidates that fail to
    train or score are ignored.

    :return: the model belonging to the best-scoring candidate (for the
        K-fold case, the one from its last successfully scored fold), or
        None when nothing could be scored.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    def fit_hmm(n, X, lengths):
        return GaussianHMM(n_components=n,
                           covariance_type='diag',
                           n_iter=1000,
                           verbose=self.verbose,
                           random_state=self.random_state).fit(X, lengths)

    best_score = float('-inf')
    best_model = None

    for n_components in range(self.min_n_components,
                              self.max_n_components + 1):
        inside_scores = []
        # Last model of THIS candidate that was both trained and scored.
        candidate = None

        if len(self.lengths) <= 2:
            # Too few sequences to fold: train and score on everything.
            try:
                model = fit_hmm(n_components, self.X, self.lengths)
                inside_scores.append(model.score(self.X, self.lengths))
                candidate = model
            except Exception:
                print('some error with ', self.this_word)
        else:
            # Cap the fold count: KFold's default n_splits can exceed the
            # number of sequences, which makes split() raise.
            kf = KFold(n_splits=min(3, len(self.sequences)))
            for train, test in kf.split(self.sequences):
                x_train, length_train = combine_sequences(
                    train, self.sequences)
                x_test, length_test = combine_sequences(
                    test, self.sequences)
                try:
                    model = fit_hmm(n_components, x_train, length_train)
                    inside_scores.append(model.score(x_test, length_test))
                    candidate = model
                except Exception:
                    continue

        # np.mean([]) is NaN; an unscored candidate cannot be ranked.
        if not inside_scores:
            continue
        average_score = np.mean(inside_scores)
        if average_score > best_score:
            best_model = candidate
            best_score = average_score

    return best_model
def select(self):
    """Select the number of hidden states by Discriminative Information
    Criterion (DIC).

    For each candidate state count a model is trained on this word and
    then scored on this word and on every other word::

        DIC = log P(this word) - mean(log P(other words))

    The mean runs over the other words the model could actually score.

    :return: the trained model with the highest DIC; when no candidate
        could be trained, ``self.base_model(self.n_constant)``.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # Default model and score used when every candidate fails.
    dic_score = float("-inf")
    saved_model = self.base_model(self.n_constant)

    other_words = [key for key in self.words if key != self.this_word]

    for i in range(self.min_n_components, self.max_n_components + 1):
        try:
            hmm_model = GaussianHMM(n_components=i,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(
                                        self.X, self.lengths)
            logL_i = hmm_model.score(self.X, self.lengths)
        except Exception:
            # This state count cannot be trained; try the next one.
            continue

        # Score the candidate on the competing words.  The previous code
        # trained a separate model on each competitor and scored that model
        # on its own data, which does not measure discrimination at all.
        logL_rest = 0.0
        scored = 0
        for key in other_words:
            X_temp, lengths_temp = self.hwords[key]
            try:
                logL_rest += hmm_model.score(X_temp, lengths_temp)
            except Exception:
                continue
            scored += 1

        # Float division; also safe when there are no competitors.
        penalty = logL_rest / scored if scored else 0.0
        dic_current = logL_i - penalty

        if dic_current > dic_score:
            saved_model = hmm_model
            dic_score = dic_current

    return saved_model
class HMMGoalModel(object):
    """GaussianHMM over trajectories that remembers the hidden state each
    training trajectory ended in.

    A new trajectory counts as a success when its final decoded state is
    one of those remembered final states.
    """

    def __init__(self, per_data, per_lens=None, n_states=None):
        """Fit the HMM and record the final state of every trajectory.

        :param per_data: array of observations; an array with more than two
            dimensions is flattened to (-1, n_features).
        :param per_lens: length of each trajectory; defaults to the length
            of every entry of ``per_data``.
        :param n_states: number of hidden states.  When None, models with
            2, 4, 6, 8 and 10 states are fitted and the one with the lowest
            ``aic`` value is kept.
        """
        if per_lens is None:
            per_lens = list(map(len, per_data))
        if len(per_data.shape) > 2:
            per_data = per_data.reshape(-1, per_data.shape[-1])

        if n_states is None:
            # Fit eagerly: a bare map() is lazy on Python 3, so the models
            # would never have been trained there.
            scored = []
            for c in (2, 4, 6, 8, 10):
                hmm = GaussianHMM(n_components=c)
                hmm.fit(per_data, per_lens)
                scored.append((aic(hmm, per_data, per_lens), hmm))
            # Compare on the score only, never on the model objects.
            _, self.hmm = min(scored, key=lambda pair: pair[0])
        else:
            self.hmm = GaussianHMM(n_components=n_states)
            self.hmm.fit(per_data, per_lens)

        ll = self.hmm.score(per_data, per_lens)
        print("Goal HMM n_components %s Log likelihood %s" %
              (self.hmm.n_components, ll))

        # Index of the last observation of every trajectory in the
        # concatenated data.
        upper_idxs = np.cumsum(per_lens) - 1
        self.final_states = np.array(
            self.hmm.predict(per_data, per_lens))[upper_idxs]
        print(self.final_states)

        self.T = int(np.mean(per_lens))
        self.n_components = self.hmm.n_components

    def is_success(self, per_trj):
        """Return True when ``per_trj`` ends in a known final state."""
        per_trj = np.array(per_trj)
        states = self.hmm.predict(per_trj)
        final_state = states[-1]
        return final_state in self.final_states

    def sample(self, t=None):
        """Sample ``t`` steps from the HMM (default: mean training length)."""
        t = self.T if t is None else t
        return self.hmm.sample(t)
def base_model(self, num_states):
    """Train a diagonal GaussianHMM with ``num_states`` states on
    ``self.X`` / ``self.lengths``.

    :return: the fitted model, or None when construction or fitting raised.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    settings = dict(n_components=num_states,
                    covariance_type="diag",
                    n_iter=1000,
                    random_state=self.random_state,
                    verbose=False)
    try:
        fitted = GaussianHMM(**settings).fit(self.X, self.lengths)
        if self.verbose:
            print("model created for {} with {} states".format(
                self.this_word, num_states))
        return fitted
    except:
        if self.verbose:
            print("failure on {} with {} states".format(
                self.this_word, num_states))
        return None
# Script: fit a 5-state Gaussian HMM on daily change rate and volume read
# from CNY.csv, then plot 500 samples generated by the model.
import numpy as np
import matplotlib.pyplot as plt
from hmmlearn.hmm import GaussianHMM

# Load the data from the input file
input_file = 'CNY.csv'
data = np.loadtxt(input_file, delimiter=',')

# Extract the required values; the last volume entry is dropped so the
# column matches the length of the np.diff output below
closing_values = np.array(data[:, 6])
volume_of_shares = np.array(data[:, 8])[:-1]

# Compute the daily percentage change of the closing value
diff_percentage = 100.0 * np.diff(closing_values) / closing_values[:-1]

# Combine the change rate with the trading volume
X = np.column_stack((diff_percentage, volume_of_shares))

# Create and train the Gaussian hidden Markov model
print(u"训练高斯隐马尔科夫模型中......")
model = GaussianHMM(n_components=5, covariance_type='diag', n_iter=1000)
model.fit(X)

# Generate data with the model
num_samples = 500
samples, _ = model.sample(num_samples)
plt.plot(np.arange(num_samples), samples[:, 0], c='black')
plt.figure()
plt.plot(np.arange(num_samples), samples[:, 1], c='red')
plt.show()
# Select the feature and activity columns of the test frame.
feature_test = test_df[feature_name]
activity_test = test_df[activity_name]

# Convert the training and test frames to plain arrays.
# NOTE(review): DataFrame.as_matrix() is not available in recent pandas
# releases -- confirm the pinned pandas version or switch to .values.
data_feature = feature_data.as_matrix()
data_label = activity_data.as_matrix()
test_feature = feature_test.as_matrix()
test_label = activity_test.as_matrix()
# Number of training rows; not read again in this part of the script.
lengths = data_feature.shape[0]

# --- Run Gaussian HMM --- #
print "fitting to HMM and decoding ..."
# --- Make an HMM instance and execute fit --- #
model = GaussianHMM(n_components=5, covariance_type="diag", n_iter=1000).fit(data_feature)

# --- Predict the optimal sequence of internal hidden state FOR DATA CSV!--- #
# --- the following is generating figure #1, and it predicts state sequence from DATA csv --- #
hidden_states = model.predict(data_feature)
time_axis = np.asarray(range(len(hidden_states)))

# --- fancy plots of different states in HMM --- #
# One subplot per hidden state, all sharing both axes.
fig1_data,axs = plt.subplots(model.n_components, sharex=True, sharey=True)
fig1_data.suptitle('Estimated State Sequence for Training Data')
colours = cm.rainbow(np.linspace(0, 1, model.n_components))
for i, (ax, colour) in enumerate(zip(axs, colours)):
    # --- Use fancy indexing to plot data in each state --- #
    # Only the samples decoded as state i; feature column 1 is plotted.
    mask = hidden_states == i
    ax.plot(time_axis[mask], data_feature[:,1][mask], ".", c=colour)
# Script: fit a 4-state Gaussian HMM on column 2 of data_hmm.txt, report
# the per-state mean and variance, then sample 1000 points from the model.
# NOTE(review): `np` is used below but numpy is not imported in the lines
# visible here -- confirm it is imported earlier in the file.
import matplotlib.pyplot as plt
from hmmlearn.hmm import GaussianHMM
from convert_to_timeseries import convert_data_to_timeseries

# Load the data from the input file
input_file = 'data_hmm.txt'
data = np.loadtxt(input_file, delimiter=',')

# Arrange the training data as a single-column matrix
X =np.column_stack([data[:, 2]])

# Create and train the Gaussian HMM model
print(u"训练高斯HMM模型")
num_components = 4
model = GaussianHMM(n_components=num_components, covariance_type='diag',n_iter=1000)
model.fit(X)

# Predict the hidden states of the HMM
hidden_states = model.predict(X)

# Report the mean and variance of each hidden state
print(u"隐藏状态的均值和方差")
for i in range(model.n_components):
    print(u"隐藏状态:{}".format(i+1))
    print(u"均值:{:.3f}".format(model.means_[i][0]))
    print(u"方差:{:.3f}".format(np.diag(model.covars_[i])[0]))

# Generate data with the model
num_samples = 1000
samples, _ = model.sample(num_samples)
class CasmlApproximator(FunctionApproximator):
    """Function approximator backed by two case bases and a Gaussian HMM.

    Observed transitions are run through a "basis" case base as cases made of
    ``state``, ``act`` and ``delta_state`` features, while every state-action
    pair that is added or queried is retained in a separate "query" case base.
    A :class:`GaussianHMM` is refitted on the observed state sequences each
    time a basis is added and is used by :class:`Approximation` to score
    candidate successor states.

    Parameters
    ----------
    feature_metadata : dict
        Metadata handed to both :class:`CaseBase` instances. NOTE: the
        ``'delta_state'`` entry is deleted from this object in place before
        the query case base is created, so the caller's object is modified.
    minfraction : float
        Threshold on the ratio between a neighbor's kernel weight and the sum
        of the kernel weights accumulated so far; neighbors below it are
        dropped from an approximation.
    scale :
        Stored as ``_scale``; not used by the code in this class body.
    kernelfn : callable
        Called with a distance and expected to return a kernel weight.
    tau : float, optional
        Forwarded to the retention method of the basis case base.
    sigma : float, optional
        Forwarded to the retention method of the basis case base.
    ncomponents : int, optional
        First positional argument of :class:`GaussianHMM`. Default is 1.
    n_iter : int, optional
        ``n_iter`` argument of :class:`GaussianHMM`. Default is 1.

    """
    class _RetentionMethod(RetentionMethod):
        """The retention method for the transition case base implementation for :class:`Casml`.

        Decides whether the new problem-solving experience is stored in
        memory, depending on the revision outcomes and the CBR policy
        regarding case retention.

        Parameters
        ----------
        owner : CaseBase
            A pointer to the owning case base.
        tau : float, optional
            The maximum permitted error when comparing most similar solution.
            Default is 0.8.
        sigma : float, optional
            The maximum permitted error when comparing actual with estimated
            transitions. Default is 0.2
        plot_retention_method : callable, optional
            Callback function plotting the retention step. Default is None.

        Notes
        -----
        The Casml retention method for the transition case base considers query cases as
        predicted correctly if both:

        1. the difference between the actual and the estimated transitions is less
           than or equal to the permitted error :math:`\\sigma`:

           .. math::

              d(\\text{case}.\\Delta_\\text{state}, T(s_{i-1}, a_{i-1})) <= \\sigma

        2. and the query case is within the maximum permitted error :math:`\\tau` of
           the most similar solution case:

           .. math::

              d(\\text{case}, 1\\text{NN}(C_T, \\text{case})) <= \\tau

        """
        def __init__(self, owner, tau=None, sigma=None, plot_retention_method=None):
            super(CasmlApproximator._RetentionMethod, self).__init__(owner, plot_retention_method, {'tau': tau, 'sigma': sigma})

            # Fall back to the documented defaults when no threshold is given.
            self._tau = tau if tau is not None else 0.8
            """:type: float"""

            self._sigma = sigma if sigma is not None else 0.2
            """:type: float"""

        def execute(self, features, matches, plot=True):
            """Execute the retention step.

            The query case is inserted into the owning case base when no
            matched case has a ``delta_state`` within ``sigma`` of the query's
            ``delta_state``, or when the distance stored in the first entry of
            ``matches['state']`` exceeds ``tau``.

            Parameters
            ----------
            features : list[tuple[str, ndarray]]
                A list of features of the form (`feature_name`, `data_points`).
            matches : dict[str, dict[int, tuple[float, ndarray]]]
                The solution identified through the similarity measure.
            plot: bool, optional
                Plot the data during the retention step.

            Returns
            -------
            int :
                The case id if the case was retained, -1 otherwise.

            """
            f = dict(features)

            do_add = True
            if matches:
                for id_, val in matches['state'].iteritems():
                    # Euclidean distance between the stored and the observed state delta.
                    delta_error = np.linalg.norm(self._owner.get_feature('delta_state', id_).value - f['delta_state'])
                    if delta_error <= self._sigma:
                        # At least one of the cases in the case base correctly estimated the query case,
                        # the query case does not add any new information, do not add.
                        do_add = False
                        break

            basis_id = -1
            # ``matches`` is only indexed when ``do_add`` is False, which can only
            # happen after the loop above has seen a non-empty ``matches``.
            # NOTE(review): presumably the first value of ``matches['state']`` is the
            # nearest neighbor -- this relies on the ordering of that mapping; confirm.
            if do_add or matches['state'].values()[0][0] > self._tau:
                basis_id = self._owner.insert(features, matches)

            if plot:
                self.plot_data(features, matches)

            return basis_id

    class Approximation(FunctionApproximator.Approximation):
        """Approximation of the successors of a single state-action pair.

        Keeps the neighboring cases retrieved from the approximator's basis
        case base together with their state deltas, and derives from them a
        mapping of successor states to ``(kernel weight, probability)`` pairs.

        Parameters
        ----------
        approximator : CasmlApproximator
            The approximator that created this approximation.
        state : MDPState
            The state being approximated.
        act : MDPAction
            The action performed in that state.
        kernelfn : callable
            Called with a distance and expected to return a kernel weight.

        """
        def __init__(self, approximator, state, act, kernelfn):
            super(CasmlApproximator.Approximation, self).__init__(state)

            self._act = act

            self._approximator = approximator
            """:type: CasmlApproximator"""

            self._kernelfn = kernelfn

            # Sum of the kernel weights of the neighbors currently in use.
            self._sum = 0.0
            # (distance, successor) pairs and, index-aligned, their state deltas.
            self._neighbors = []
            """:type: list"""
            self._deltas = []
            """:type: list"""

            self.update(state.features, act.features)

        def __del__(self):
            """Drop the query case from the query case base unless it is also a basis."""
            assert (self.state, Hashable(self._act.features)) not in self._approximator._queries

            # noinspection PyTypeChecker
            # if not next((True for elem in self._approximator._fit_X if np.all(elem == self.state.features)), False):
            if (self.state, Hashable(self._act.features)) not in self._approximator._bases:
                self._approximator._querycb.remove([('state', self.state.features), ('act', self._act.features)])

        def include(self, d, state, delta):
            """Offer a newly added basis to this approximation.

            When the candidate sorts before the current last neighbor (or there
            are no neighbors yet) it is inserted at its sorted position;
            otherwise it is appended only if its kernel weight relative to the
            current weight sum reaches the approximator's ``_minfraction``.
            Whenever the candidate is taken, the weights are recomputed and an
            ``'average_change'`` event is dispatched.

            Parameters
            ----------
            d : float
                The distance of the candidate; must be non-negative.
            state :
                The candidate stored alongside the distance as a neighbor.
            delta :
                The state delta associated with the candidate.

            """
            assert d >= 0

            val = (d, state)
            if len(self._neighbors) <= 0 or val < self._neighbors[-1]:
                # noinspection PyTypeChecker
                # if not next((True for (dist, v) in self._neighbors if dist == d and np.all(v == state)), False):
                # Remember the insertion index so the delta list stays aligned with
                # the neighbor list. ``bisect_left`` and ``insort`` (which inserts to
                # the right of equal entries) only disagree when an entry comparing
                # equal to ``val`` is already present.
                ind = bisect.bisect_left(self._neighbors, val)
                bisect.insort(self._neighbors, val)
                self._deltas.insert(ind, delta)
                self._compute_weights()
                self.dispatch('average_change')
            else:
                assert self._sum > 0.0

                w = self._kernelfn(d)
                if w / self._sum >= self._approximator._minfraction:
                    self._neighbors.append(val)
                    self._deltas.append(delta)
                    self._compute_weights()
                    self.dispatch('average_change')

        def update(self, state, act):
            """Re-retrieve the neighbors from the basis case base and recompute the weights.

            Parameters
            ----------
            state :
                The ``'state'`` feature value used for the retrieval.
            act :
                The ``'act'`` feature value used for the retrieval.

            """
            neighbors = dict(self._approximator._basiscb.retrieve([('state', state), ('act', act)]))
            if 'state' in neighbors:
                # Deltas are looked up per case id in the iteration order of
                # ``neighbors['state']`` so that they line up with ``values()``.
                self._deltas = [self._approximator._basiscb.get_feature('delta_state', id_).value for id_ in
                                neighbors['state'].iterkeys()]
                self._neighbors = neighbors['state'].values()

            self._compute_weights()

        def _compute_weights(self):
            """Rebuild ``_weights`` from the current neighbors and deltas.

            Neighbors are visited in list order. The first one is always used;
            each following one is used only while its kernel weight divided by
            the weight sum so far is at least the approximator's
            ``_minfraction``. Neighbors and deltas from the first rejected one
            onwards are discarded. Each used neighbor is stored as
            ``(kernel weight, probability)``, where the probability is the
            exponential of the HMM score of the two-step sequence
            ``<state, state + delta>``, afterwards divided by the sum of those
            probabilities.
            """
            self._weights.clear()
            self._sum = 0.0

            i = 0
            total = 0
            # calculate successor states from the current state and solution delta state
            for (d, succ), delta in zip(self._neighbors, self._deltas):
                w = self._kernelfn(d)
                if self._sum == 0.0 or w / self._sum >= self._approximator._minfraction:
                    sequence = [np.asarray(self._state.features), np.asarray(self._state.features + delta)]
                    # The score is exponentiated, i.e. treated as a log-value.
                    proba = np.exp(self._approximator._hmm.score(sequence))
                    self._weights[MDPState.create(succ)] = (w, proba)  # proba
                    self._sum += w
                    total += proba
                    i += 1
                else:
                    break

            # Drop every neighbor (and its delta) that was not used above.
            del self._neighbors[i:]
            del self._deltas[i:]

            # Normalize the probabilities; only existing keys are reassigned.
            for succ, (w, p) in self._weights.iteritems():
                self._weights[succ] = (w, p / total)  # total
            pass

            # Disabled alternative implementation, kept for reference:
            # sequences = np.zeros((len(self._neighbors), 2, len(self._state)), dtype=float)
            #
            # for i, delta in enumerate(self._deltas):
            #     sequences[i, 0] = np.array(self._state.features)
            #     sequences[i, 1] = np.asarray(self._state.features + delta)
            #
            # # use HMM to calculate probability for observing sequence <current_state, next_state>
            # # noinspection PyTypeChecker
            # weights = np.exp(self._approximator._hmm.score(sequences))
            # for (_, succ), w in zip(self._neighbors, weights):
            #     self._weights[MDPState.create(succ)] = w
            #
            # sum_ = weights.sum()
            # for (_, succ), w in zip(self._neighbors, weights):
            #     if len(weights) <= 1:
            #         w *= 0.9
            #     self._weights[MDPState.create(succ)] = w / sum_

    # -----------------------------
    # CasmlApproximator
    # -----------------------------
    def __init__(self, feature_metadata, minfraction, scale, kernelfn, tau=None, sigma=None, ncomponents=1, n_iter=1):
        super(CasmlApproximator, self).__init__()

        self._minfraction = minfraction
        self._scale = scale
        self._kernelfn = kernelfn

        # True until the first transition of an episode has been recorded.
        self._new_sequence = True

        #: Contains all the existing CasmlApproximations created by
        #: this CasmlApproximator. The keys serve as both queries and
        #: bases (queries are a superset of bases), so a datum may be
        #: None if the associated key is just a basis, not a query.
        #: Values are held weakly, so an entry disappears once its
        #: approximation is no longer referenced elsewhere.
        self._queries = weakref.WeakValueDictionary()
        """:type: dict[tuple[MDPState, MDPAction], Approximation]"""

        #: The subset of keys of queries that are also bases.
        #: NOTE(review): an earlier comment claimed the order in which the
        #: bases were received is preserved; a plain ``set`` does not
        #: guarantee that.
        self._bases = set()
        """:type: set[tuple[MDPState, Hashable]"""

        #: The observed state sequences, one inner list per episode,
        #: used to refit the hidden Markov model.
        self._fit_X = []
        """:type: list[ndarray]"""

        #: The case base maintaining the observations in the form
        #:     c = <s, a, ds>, where ds = s_{i+1} - s_i
        #: to identify possible successor states.
        self._basiscb = CaseBase(feature_metadata, retention_method=self._RetentionMethod,
                                 retention_method_params=(tau, sigma), name='basiscb')
        """:type: CaseBase"""

        # The query case base has no 'delta_state' feature. This mutates the
        # ``feature_metadata`` object passed in by the caller.
        del feature_metadata['delta_state']

        #: Invariant: contains all the keys in queries
        self._querycb = CaseBase(feature_metadata, name='querycb')
        """:type: CaseBase"""

        #: The hidden Markov model maintaining the observations in the form
        #:     seq = <s_{i}, s_{i+1}>
        #: to reason on the transition probabilities of successor states.
        self._hmm = GaussianHMM(ncomponents, n_iter=n_iter)  # , covariance_type='full'
        # self._hmm = GaussianHMM(ncomponents)
        """:type: GaussianHMM"""

        # Counters: bases skipped because the (state, action) pair was already
        # known, and bases for which the basis case base returned a negative id.
        self._not_add_bases = 0
        self._not_add_count = 0

    def initialize(self):
        """Prepare for a new episode."""
        self._new_sequence = True

    def add_basis(self, state, act, succ=None):
        """Adds a state to the set of bases used to approximate query states.

        The HMM is refitted with the new observation first. The state-action
        pair is then retained in the query case base and, unless the pair is
        already a known basis, run through the basis case base. When the basis
        case base reports a non-negative case id, the affected existing
        approximations are refreshed.

        Parameters
        ----------
        state : MDPState
            The state to add
        act : MDPAction
            The action performed in that state
        succ : MDPState, optional
            The successor state. When None, `state` itself is used as the
            successor for computing the state delta (the HMM update still
            receives None).

        Returns
        -------
        MDPState :
            The `state` that was passed in.

        """
        # update the hmm with the new sequence
        self._fit_hmm(state, succ)

        # retain the case in the query case base
        features = [('state', state.features), ('act', act.features)]
        self._querycb.retain(features)

        a = Hashable(act.features)
        if (state, a) in self._bases:
            # Already a basis: count it and skip the basis case base.
            self._not_add_bases += 1
            return state

        self._bases.add((state, a))

        # retain the case in the basis case base
        if succ is None:
            succ = state

        delta = succ - state
        features.append(('delta_state', delta))
        basis_id = self._basiscb.run(features)

        if basis_id <= -1:
            self._not_add_count += 1

        if basis_id >= 0:
            if self._querycb.similarity_uses_knn:
                # Re-run the retrieval of every live approximation.
                for c in self._querycb.itervalues():
                    try:
                        approx = self._queries[(MDPState.create(c['state'].value), Hashable(c['act'].value))]
                    except KeyError:
                        # No live approximation for this query case.
                        pass
                    else:
                        approx.update(c['state'].value, c['act'].value)
            else:
                # Only offer the new basis to the approximations of the query
                # cases retrieved for this state-action pair.
                neighbors = dict(self._querycb.retrieve([('state', state.features), ('act', act.features)]))
                for id_, (d, s) in neighbors['state'].iteritems():
                    try:
                        approx = self._queries[(MDPState.create(s), Hashable(neighbors['act'][id_][1]))]
                    except KeyError:
                        # No live approximation for this query case.
                        pass
                    else:
                        approx.include(d, state.features, delta)

        return state

    def approximate(self, state, act):
        """Approximates a given state using an Approximation.

        The state-action pair is retained in the query case base; an existing
        approximation for the pair is reused, otherwise a new one is created
        and registered.

        Parameters
        ----------
        state : MDPState
            The state to approximate.
        act : MDPAction
            The action performed in that state

        Returns
        -------
        Approximation :
            The Approximation approximating state.

        """
        self._querycb.retain([('state', state.features), ('act', act.features)])

        a = Hashable(act.features)
        try:
            approx = self._queries[(state, a)]
        except KeyError:
            approx = CasmlApproximator.Approximation(self, state, act, self._kernelfn)
            self._queries[(state, a)] = approx

        return approx

    def _fit_hmm(self, state, succ):
        """Record the observed transition and refit the HMM on all stored sequences.

        At the start of an episode a new sequence is opened and seeded with
        the features of `state`; the features of `succ` are appended to the
        current sequence when `succ` is not None.

        Parameters
        ----------
        state : MDPState
            The state the transition started in.
        succ : MDPState or None
            The successor state.

        """
        # Disabled earlier implementation, kept for reference:
        # try:
        #     x = self._hmm._fit_X.copy()
        # except AttributeError:
        #     x = np.zeros(1, dtype=np.object)
        # else:
        #     if self._new_sequence:
        #         x = self._hmm._fit_X.tolist()
        #         x.append(np.zeros(1))
        #         x = np.array(x)
        #
        # if self._new_sequence:
        #     self._new_sequence = False
        #     x[-1] = np.hstack([np.reshape(state.features, (-1, state._nfeatures)).T])
        #
        # x[-1] = np.hstack([x[-1].tolist(), np.reshape(succ.features, (-1, succ._nfeatures)).T])
        # self._hmm.fit(x, n_init=1)
        if self._new_sequence:
            self._new_sequence = False
            self._fit_X.append([])
            self._fit_X[-1].append(state.features)

        if succ is not None:
            self._fit_X[-1].append(succ.features)

        # All sequences are stacked into one array; ``lengths`` tells the HMM
        # where each individual sequence ends.
        self._hmm.fit(np.concatenate(self._fit_X), lengths=[len(x) for x in self._fit_X])
# Differencing the close values shortens the series by one element
# (``len(diff) = len(close_t) - 1``), so the dates and close values are
# shifted by one to stay aligned with it.
diff = np.diff(close_v)
dates = dates[1:]
close_v = close_v[1:]

# Stack diff and volume column-wise to form the training matrix.
X = np.column_stack((diff, volume))

###############################################################################
# Run Gaussian HMM
print("fitting to HMM and decoding ...", end="")

# Build the HMM first, then fit it to the training matrix.
model = GaussianHMM(n_components=4, covariance_type="diag", n_iter=1000)
model.fit(X)

# Decode the hidden state sequence for the training data.
hidden_states = model.predict(X)

print("done")

###############################################################################
# Print trained parameters and plot
print("Transition matrix")
print(model.transmat_)
print()

print("Means and vars of each hidden state")
for i in range(model.n_components):
    print("{0}th hidden state".format(i))