def select(self): warnings.filterwarnings("ignore", category=DeprecationWarning) # TODO implement model selection based on DIC scores # raise NotImplementedError record = float("-inf") min_seq = min([len(seq) for seq in self.sequences]) self.max_n_components = min (self.max_n_components, min_seq) hmm_model = self.base_model(self.n_constant) for num in range(self.min_n_components,self.max_n_components+1,1): #print(num) try: model = GaussianHMM(n_components= num, n_iter=1000).fit(self.X, self.lengths) logL = model.score(self.X, self.lengths) tmp = 0 for word in self.hwords: X, lengths = self.hwords[word] tmp += model.score(X,lengths) DIC = logL - (tmp-logL) /(len(self.hwords)-1) if DIC > record: record = DIC hmm_model = model except: continue # print("failure on {} with {} states".format(self.this_word, num)) return hmm_model
def select(self):
    """Pick the DIC-maximising model: this word's log-likelihood minus the
    mean log-likelihood over every competing word's data.

    :return: best GaussianHMM object, or None if nothing trains.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    top_model, top_dic = None, float("-inf")
    for state_count in range(self.min_n_components, self.max_n_components + 1):
        try:
            candidate = GaussianHMM(n_components=state_count,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(self.X, self.lengths)
            own_logL = candidate.score(self.X, self.lengths)
            # Log-likelihoods of every rival word under this model.
            rival_logLs = [candidate.score(*self.hwords[w])
                           for w in self.hwords if w != self.this_word]
            # equation from udacity forum: https://discussions.udacity.com/t/how-to-start-coding-the-selectors/476905/10
            dic = own_logL - sum(rival_logLs) / len(rival_logLs)
            if dic > top_dic:
                top_dic, top_model = dic, candidate
        except:
            continue
    return top_model
def get_best_hmm_model(X, max_states, max_iter = 10000):
    """Fit GaussianHMMs with 1..max_states components on X and return the
    one with the highest training-data log-likelihood.

    Fixes: the original called ``score`` twice per candidate and re-fitted
    the winning state count from scratch at the end; the fitted model is
    now kept directly (fitting is deterministic here since random_state is
    fixed).  If every candidate scores below the sentinel, None is returned
    instead of crashing on a 0-component fit.

    :param X: observation array accepted by GaussianHMM.fit
    :param max_states: largest component count to try (inclusive)
    :param max_iter: EM iteration cap per fit
    :return: best fitted GaussianHMM, or None
    """
    best_score = -(10 ** 10)
    best_model = None
    for state in range(1, max_states + 1):
        hmm_model = GaussianHMM(n_components=state, random_state=100,
                                covariance_type="diag", n_iter=max_iter).fit(X)
        score = hmm_model.score(X)  # score once, reuse below
        if score > best_score:
            best_score = score
            best_model = hmm_model
    return best_model
def select(self):
    """Select the best model for ``self.this_word`` based on the DIC score
    (this word's logL minus the average logL over all other words).

    Fix: ``best_n`` was initialised to ``self.random_state`` — an RNG seed,
    not a component count — so when every candidate failed, the fallback
    model was fitted with a nonsensical number of states.  It now defaults
    to ``self.min_n_components``.

    :return: GaussianHMM object, or None if even the final fit fails.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    n_components = range(self.min_n_components, self.max_n_components + 1)
    best_n = self.min_n_components  # was: self.random_state (bug)
    best_DIC = float('-inf')
    best_model = None
    for n in n_components:
        try:
            w_count = 0
            model = GaussianHMM(n, n_iter=1000).fit(self.X, self.lengths)
            original_prob = model.score(self.X, self.lengths)
            sum_prob_others = 0.0
            # Average likelihood of all competing words under this model.
            for word in self.words:
                if word == self.this_word:
                    continue
                X_other, lengths_other = self.hwords[word]
                sum_prob_others += model.score(X_other, lengths_other)
                w_count += 1
            avg_prob_others = sum_prob_others / w_count
            DIC = original_prob - avg_prob_others
            if DIC > best_DIC:
                best_DIC = DIC
                best_n = n
        except:
            pass
    # Re-fit the winning (or fallback) state count on all data.
    try:
        best_model = GaussianHMM(best_n, n_iter=1000).fit(self.X, self.lengths)
    except ValueError:
        print("length is equal. Can we process it? " + "length <= " + str(self.lengths[0]) + " X " + str(len(self.X)))
        print(self.X, self.lengths)
        best_model = None
        print("Exiting...")
    return best_model
def select(self):
    """Select the best model via K-fold cross-validation.

    Words with fewer than 3 sequences cannot be split, so they are scored
    by plain training log-likelihood instead.  The component count with the
    highest (average) score is re-fitted via ``self.base_model``; if nothing
    ever scores, 3 components is the hard-coded fallback.

    :return: GaussianHMM object from ``self.base_model``.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_score, best_n_components = None, None
    for n_components in range(self.min_n_components, self.max_n_components + 1):
        scores, n_splits = [], 3
        if (len(self.sequences) < 3):
            # Too few sequences to fold: score on the full training data.
            try:
                model = GaussianHMM(n_components=n_components, n_iter=1000).fit(self.X, self.lengths)
                logL = model.score(self.X, self.lengths)
                if (best_score is None or logL > best_score):
                    best_score, best_n_components = logL, n_components
            except Exception as e:
                # Skip cross-validation for current n_components
                continue
        else:
            split_method = KFold(random_state=self.random_state, n_splits=n_splits)
            for cv_train_idx, cv_test_idx in split_method.split(
                    self.sequences):
                X_train, lengths_train = combine_sequences(
                    cv_train_idx, self.sequences)
                X_test, lengths_test = combine_sequences(
                    cv_test_idx, self.sequences)
                try:
                    model = GaussianHMM(n_components=n_components, n_iter=1000).fit(
                        X_train, lengths_train)
                    logL = model.score(X_test, lengths_test)
                    scores.append(logL)
                except Exception as e:
                    break
            # Only average if every fold trained — partial folds are skipped.
            training_successful = len(scores) == n_splits
            if (not training_successful):
                continue
            avg = np.average(scores)
            if (best_score is None or avg > best_score):
                best_score, best_n_components = avg, n_components
    if (best_score == None):
        best_n_components = 3
    return self.base_model(best_n_components)
def select(self):
    """Select the best model via K-fold cross-validation.

    Fixes:
    * the state loop ran ``range(self.max_n_components, self.max_n_components
      + 1)`` — only a single candidate was ever tried; it now starts at
      ``self.min_n_components``;
    * a ValueError used to trigger an early ``return`` for the *failing*
      state count (and relied on possibly-unbound names); failures now just
      skip that candidate.

    :return: GaussianHMM from ``self.base_model`` for the winning count.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_score = float('-inf')
    best_nStates = self.min_n_components  # safe fallback if every fit fails
    isSplit = len(self.sequences) > 1     # single sequence cannot be folded
    if isSplit:
        nFolds = min(3, len(self.sequences))
        split_method = KFold(n_splits=nFolds)
    for nStates in range(self.min_n_components, self.max_n_components + 1):
        try:
            if isSplit:
                cv_score = 0
                for train_idx, test_idx in split_method.split(self.sequences):
                    train_X, train_lengths = combine_sequences(train_idx, self.sequences)
                    test_X, test_lengths = combine_sequences(test_idx, self.sequences)
                    cv_model = GaussianHMM(n_components=nStates,
                                           covariance_type="diag",
                                           n_iter=1000,
                                           random_state=self.random_state,
                                           verbose=False).fit(train_X, train_lengths)
                    cv_score += cv_model.score(test_X, test_lengths)
                avg_score = cv_score / nFolds
            else:
                cv_model = GaussianHMM(n_components=nStates,
                                       covariance_type="diag",
                                       n_iter=1000,
                                       random_state=self.random_state,
                                       verbose=False).fit(self.X, self.lengths)
                avg_score = cv_model.score(self.X, self.lengths)
            if avg_score > best_score:
                best_score = avg_score
                best_nStates = nStates
                logging.debug(
                    "CV better score for {} with {} states".format(
                        self.this_word, nStates))
        except ValueError:
            logging.debug("CV ValueError on {} with {} states".format(
                self.this_word, nStates))
            continue  # bug fix: was returning early from inside except
    return self.base_model(best_nStates)
def get_best_hmm_model(X, max_iter=10000, max_states=6):
    """Fit GaussianHMMs with 1..max_states components and return the one
    with the highest training log-likelihood.

    Fixes: ``score`` was called twice per candidate; the winner was wastefully
    re-fitted at the end (deterministic anyway with a fixed random_state);
    and when every ``score`` call failed, ``best_state`` stayed 0 and the
    final fit crashed — None is returned instead.

    :param X: observation array accepted by GaussianHMM.fit
    :param max_iter: EM iteration cap per fit
    :param max_states: largest component count to try (inclusive)
    :return: best fitted GaussianHMM, or None
    """
    best_score = -(10 ** 10)
    best_model = None
    for state in range(1, max_states + 1):
        hmm_model = GaussianHMM(n_components=state, random_state=100,
                                covariance_type='diag', n_iter=max_iter).fit(X)
        try:
            score = hmm_model.score(X)  # score once, reuse
        except:
            continue
        if score > best_score:
            best_score = score
            best_model = hmm_model
    return best_model
def select(self):
    """DIC selector: maximise logL(word) - (1/(M-1)) * sum logL(other words).

    :return: the DIC-best GaussianHMM, or None when nothing trains.
    """
    top_dic, top_model = None, None
    for num_states in range(self.min_n_components, self.max_n_components + 1):
        try:
            candidate = GaussianHMM(n_components=num_states,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=self.verbose)
            if self.verbose:
                print("model created for {} with {} states".format(
                    self.this_word, num_states))
            candidate.fit(self.X, self.lengths)
            own_logL = candidate.score(self.X, self.lengths)
            # Total likelihood of every competing word under this model.
            rival_total = sum(candidate.score(*self.hwords[w])
                              for w in self.hwords if w != self.this_word)
            # DIC = log(P(X(i)) - 1/(M-1) * SUM(log(P(X(all but i)))
            dic = own_logL - rival_total / (len(self.hwords) - 1)
            if top_dic is None or dic > top_dic:
                top_dic, top_model = dic, candidate
        except:
            if self.verbose:
                print("failure on {} with {} states".format(
                    self.this_word, num_states))
    return top_model
def select(self):
    """DIC selector (per-sequence-normalised other-word likelihoods).

    Fixes:
    * the inner loop reused the name ``logL``, clobbering this word's own
      score with the last *other* word's score before the DIC was formed;
    * the loop excluded ``max_n_components`` (range upper bound exclusive),
      so the largest candidate was never tried.

    :return: the DIC-best GaussianHMM, or None when nothing trains.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    this_word = self.this_word
    hmm_models = {}
    best_score = None
    best_i = None
    for i in range(self.min_n_components, self.max_n_components + 1):
        try:
            model = GaussianHMM(n_components=i, covariance_type="diag",
                                n_iter=1000, random_state=self.random_state,
                                verbose=False).fit(self.X, self.lengths)
            hmm_models[i] = model
            logL = model.score(self.X, self.lengths)
            sum_logL = 0
            num_of_logs = 0
            for word in self.hwords.keys():
                if word == this_word:
                    continue
                try:
                    X2, lengths2 = self.hwords[word]
                    # Separate name — the original overwrote logL here (bug).
                    other_logL = model.score(X2, lengths2) / len(lengths2)
                    sum_logL += other_logL
                    num_of_logs += 1
                except:
                    pass
            dic = logL
            if num_of_logs:
                dic -= sum_logL / num_of_logs
            if best_score is None or best_score < dic:
                best_score = dic
                best_i = i
        except ValueError:
            hmm_models[i] = None
    if best_i is None:
        return None
    return hmm_models[best_i]
def select(self):
    """Select the best model for ``self.this_word`` based on the BIC score
    (lower is better), for n between min_n_components and max_n_components.

    The candidate range is clamped by the shortest sequence length (this
    mutates ``self.max_n_components``, as in the original), and the
    ``n_constant`` base model is the fallback when nothing trains.

    :return: GaussianHMM object
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    shortest = min(len(seq) for seq in self.sequences)
    self.max_n_components = min(self.max_n_components, shortest)
    lowest_bic = float("inf")
    chosen = self.base_model(self.n_constant)
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            candidate = GaussianHMM(n_components=n, n_iter=1000).fit(self.X, self.lengths)
            logL = candidate.score(self.X, self.lengths)
            # p free parameters: n^2 + 2*n*n_features - 1; N = data points.
            n_features = len(self.X[0])
            p = n * n + 2 * n * n_features - 1
            bic = -2 * logL + p * np.log(len(self.X))
            if bic < lowest_bic:
                lowest_bic = bic
                chosen = candidate
        except:
            continue
    return chosen
def select(self):
    """Select the best model for ``self.this_word`` based on the BIC score
    for n between min_n_components and max_n_components (inclusive).

    BIC = -2*logL + p*log(N); lower is better.

    :return: GaussianHMM object, or None when nothing trains.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    winner, winning_bic = None, float('inf')
    for states in range(self.min_n_components, self.max_n_components + 1):
        try:
            fitted = GaussianHMM(n_components=states,
                                 covariance_type="diag",
                                 n_iter=1000,
                                 random_state=self.random_state,
                                 verbose=False).fit(self.X, self.lengths)
            log_likelihood = fitted.score(self.X, self.lengths)
            # equation from udacity forum: https://discussions.udacity.com/t/how-to-start-coding-the-selectors/476905/10
            num_features = len(self.X[0])
            free_params = states ** 2 + 2 * states * num_features - 1
            bic = -2 * log_likelihood + free_params * np.log(len(self.X))
            if bic < winning_bic:
                winning_bic, winner = bic, fitted
        except:
            continue
    return winner
def select(self):
    """Select the best model via K-fold cross-validation.

    Fix: ``scores`` was created once *outside* the component loop, so each
    candidate's "average" silently included every previous candidate's fold
    scores.  It is now reset per component count.

    :return: GaussianHMM from ``self.base_model`` for the winning count
             (defaults to ``max_n_components`` if nothing scores).
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_score = float('-inf')
    best_n = self.max_n_components
    split_method = KFold()
    for n in range(self.min_n_components, self.max_n_components + 1):
        scores = []  # fold scores for THIS candidate only (bug fix)
        try:
            for cv_train_idx, cv_test_idx in split_method.split(
                    self.sequences):
                train_X, train_lengths = combine_sequences(
                    cv_train_idx, self.sequences)
                test_X, test_lengths = combine_sequences(
                    cv_test_idx, self.sequences)
                model = GaussianHMM(n_components=n).fit(
                    train_X, train_lengths)
                scores.append(model.score(test_X, test_lengths))
            avg_score = np.mean(scores)
            if avg_score > best_score:
                best_score = avg_score
                best_n = n
        except:
            pass
    return self.base_model(best_n)
def select(self):
    """Select the best model for ``self.this_word`` based on the BIC score
    for n between min_n_components and max_n_components.

    Fix: when a candidate failed to train, the ``except`` block reset
    ``best_hmm_model`` to None, discarding a previously found best model.
    Failures now simply skip the candidate.

    :return: GaussianHMM object, or None when nothing trains.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_hmm_model = None
    feature_cnt = self.X.shape[1]
    best_b_i_c__score = float("inf")
    for num_states in range(self.min_n_components, self.max_n_components + 1):
        try:
            # train a model based on current number of components
            hmm_model = GaussianHMM(n_components=num_states,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(self.X, self.lengths)
            log_l = hmm_model.score(self.X, self.lengths)
            # p = free parameter count, N = number of data points
            p = num_states * (num_states + feature_cnt * 2 - 1)
            log_n = np.log(len(self.X))
            b_i_c__score = -2 * log_l + p * log_n
        except:
            continue  # bug fix: was wiping best_hmm_model on failure
        if best_b_i_c__score > b_i_c__score:
            best_hmm_model = hmm_model
            best_b_i_c__score = b_i_c__score
    return best_hmm_model
def select(self):
    """Select the best model for ``self.this_word`` based on the BIC score
    for n between min_n_components and max_n_components.

    Fixes:
    * the BIC penalty used ``math.log(len(self.X[0]))`` — the log of the
      *feature count* — instead of ``log(N)`` where N is the number of data
      points (``len(self.X)``);
    * the local name ``min`` shadowed the builtin; renamed ``best_bic``.

    :return: GaussianHMM re-fitted on all data with the winning state count.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_n = self.min_n_components
    best_bic = float("+inf")
    for i in range(self.min_n_components, self.max_n_components + 1):
        try:
            model = GaussianHMM(n_components=i, covariance_type="diag",
                                n_iter=1000, random_state=self.random_state,
                                verbose=False).fit(self.X, self.lengths)
            log_likelyhood = model.score(self.X, self.lengths)
            # BIC = -2*logL + p*log(N)
            p = i * i + 2 * i * len(self.X[0]) - 1
            BIC = -2 * log_likelyhood + p * math.log(len(self.X))  # was log(n_features)
            if BIC < best_bic:
                best_bic = BIC
                best_n = i
        except:
            pass
    # Re-fit the winning state count on the full training data.
    return GaussianHMM(n_components=best_n, covariance_type="diag",
                       n_iter=1000, random_state=self.random_state,
                       verbose=False).fit(self.X, self.lengths)
def base_model(self, num_states, X=None, lens=None, testX=None, testlens=None):
    """Train a diagonal-covariance GaussianHMM with ``num_states`` states.

    :param num_states: number of HMM components to fit
    :param X: training observations (defaults to ``self.X``)
    :param lens: training sequence lengths (defaults to ``self.lengths``)
    :param testX: optional held-out observations; when given, the return
                  value becomes a ``(model, test logL)`` tuple
    :param testlens: lengths for ``testX``
    :return: fitted GaussianHMM, or ``(model, score)`` when testX is given,
             or None when training fails.
    """
    # with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    # warnings.filterwarnings("ignore", category=RuntimeWarning)
    if X is None:
        X = self.X
    if lens is None:
        lens = self.lengths
    try:
        hmm_model = GaussianHMM(n_components=num_states, covariance_type="diag",
                                n_iter=1000, random_state=self.random_state,
                                verbose=False).fit(X, lens)
        if self.verbose:
            print("model created for {} with {} states".format(
                self.this_word, num_states))
        # Optional evaluation mode: also score a held-out split.
        if testX is not None:
            return hmm_model, hmm_model.score(testX, testlens)
        return hmm_model
    except:
        # Best-effort: training failures are reported (if verbose) and
        # signalled with None rather than raised.
        if self.verbose:
            print("failure on {} with {} states".format(
                self.this_word, num_states))
        return None
def select(self):
    """Select the best model for ``self.this_word`` based on the BIC score
    for n between min_n_components and max_n_components.

    Fixes:
    * ``num_states ^ 2`` is bitwise XOR in Python, not exponentiation —
      the parameter count was wrong for every candidate;
    * the penalty multiplied by ``log(n_features)`` instead of ``log(N)``
      where N is the number of data points;
    * the loop excluded ``max_n_components`` despite the docstring's
      inclusive range.

    :return: GaussianHMM object, or None when nothing trains.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_model = None
    best_bic = float('+inf')
    for num_states in range(self.min_n_components, self.max_n_components + 1):
        try:
            hmm_model = GaussianHMM(n_components=num_states,
                                    n_iter=2000).fit(self.X, self.lengths)
            likelyhood = hmm_model.score(self.X, self.lengths)
            # p = n^2 + 2*n*n_features - 1  (was n ^ 2 == XOR, a bug)
            p = num_states ** 2 + 2 * num_states * hmm_model.n_features - 1
            # BIC = -2*logL + p*log(N); N = number of data points (bug fix)
            bic = -2 * likelyhood + p * np.log(len(self.X))
            if bic < best_bic:
                best_bic, best_model = bic, hmm_model
        except Exception:
            # Failed candidates are skipped; the next count is tried.
            pass
    return best_model
def select(self):
    """Select the best model via K-fold cross-validation over a score matrix.

    Builds an (n_splits x n_candidates) matrix of held-out log-likelihoods,
    sums over folds, and refits the argmax state count on all data.  Words
    with a single sequence fall back to a fixed 3-state model.

    :return: GaussianHMM from ``self.base_model``.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    if len(self.sequences) < 2:
        return self.base_model(3)
    # Use as many folds as data allows, capped at 3.
    if len(self.sequences) == 2:
        n_splits = 2
    else:
        n_splits = 3
    split_method = KFold(n_splits)
    # Rows: folds; columns: candidate state counts.
    logL = np.zeros([n_splits, self.max_n_components + 1 - self.min_n_components])
    # Splitting self.lengths yields the same index sets as splitting
    # self.sequences (equal length); indices are applied to sequences.
    for pair_index, pairs in enumerate(split_method.split(self.lengths)):
        train, test = pairs
        train_X, train_length = combine_sequences(train, self.sequences)
        test_X, test_length = combine_sequences(test, self.sequences)
        for state_index, num_states in enumerate(range(self.min_n_components,
                                                       self.max_n_components + 1)):
            # Pre-mark as -inf so failed fits never win the argmax.
            logL[pair_index][state_index] = float('-inf')
            try:
                model = GaussianHMM(n_components=num_states,
                                    covariance_type='diag',
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(train_X, train_length)
                logL[pair_index][state_index] = model.score(test_X, test_length)
            except:
                continue
    # Column index of the best fold-summed score maps back to a state count.
    best_num_states = self.min_n_components + np.argmax(logL.sum(axis=0))
    return self.base_model(best_num_states)
def fit_hmm_learn(seqs, n_states, axis):
    """Fit a GaussianHMM to the concatenated sequences, decode the state
    labels, and scatter-plot them onto ``axis`` coloured by state.

    NOTE(review): the early exit returns a 4-tuple while the normal path
    returns only ``labels`` — callers must handle both shapes; confirm
    which contract is intended.
    NOTE(review): the scatter calls slice ``seqs[:100, :, 0]`` etc., which
    requires ``seqs`` to be a 3-D numpy array, contradicting the docstring's
    "list of numpy vectors" — verify the actual argument type at call sites.
    Colour indexing assumes n_states <= 4 (possible_colors has 4 entries).

    Seqs is a list of numpy vectors.
    """
    samples = np.concatenate(seqs)
    lengths = np.array([len(s) for s in seqs])
    # Not enough samples to support the requested state count.
    if len(samples) < n_states:
        return float('inf'), float('-inf'), None, None
    # assert len(samples) >= n_states
    hmm = GaussianHMM(n_components=n_states)
    hmm.fit(samples, lengths)
    ll = hmm.score(samples, lengths)
    # decode returns (logprob, state_sequence); keep only the labels.
    _, labels = hmm.decode(samples, lengths)
    axis.set_title("HMM Learn (ll=%0.2f)" % ll)
    # ax2.plot(means[:, 0], means[:, 1], 'ro')
    # ax2.plot(X[:, :, 0], X[:, :, 1], 'bo')
    possible_colors = ['orange', 'blue', 'green', 'red']
    colors = [possible_colors[e] for e in labels]
    # Three marker groups of 100 points each (fixed-size dataset assumed).
    axis.scatter(seqs[:100, :, 0], seqs[:100, :, 1], color=colors[:100],
                 marker='^')
    axis.scatter(seqs[100:200, :, 0], seqs[100:200, :, 1],
                 color=colors[100:200], marker='o')
    axis.scatter(seqs[200:, :, 0], seqs[200:, :, 1], color=colors[200:],
                 marker='s')
    return labels
def select(self):
    """CV selector: the state count with the best mean K-fold held-out
    log-likelihood wins and is re-fitted on the full training data.

    :return: GaussianHMM fitted on all data with the winning state count.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    folds = KFold()
    chosen_n = self.min_n_components
    chosen_score = float("-inf")
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            fold_scores = []
            for train_idx, test_idx in folds.split(self.sequences):
                # Rebuild contiguous arrays for each fold's index set.
                fit_X, fit_lengths = combine_sequences(train_idx, self.sequences)
                eval_X, eval_lengths = combine_sequences(test_idx, self.sequences)
                candidate = GaussianHMM(n, n_iter=1000).fit(fit_X, fit_lengths)
                fold_scores.append(candidate.score(eval_X, eval_lengths))
            mean_score = sum(fold_scores) / len(fold_scores)
            if mean_score > chosen_score:
                chosen_score, chosen_n = mean_score, n
        except:
            pass
    return GaussianHMM(chosen_n, n_iter=1000).fit(self.X, self.lengths)
def select(self):
    """CV selector: highest mean held-out log-likelihood wins; the winning
    candidate's last-fold model is returned.

    Note: mirrors the original's initial value — an empty list is returned
    when no candidate ever trains successfully.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    winner = []
    winner_mean = -math.inf
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            kfold = KFold(n_splits=min(3, len(self.sequences)))
            fold_logLs = []
            fitted = []
            for train_ids, test_ids in kfold.split(self.sequences):
                X_tr, len_tr = combine_sequences(train_ids, self.sequences)
                X_te, len_te = combine_sequences(test_ids, self.sequences)
                fitted = GaussianHMM(n_components=n,
                                     covariance_type="diag",
                                     n_iter=1000,
                                     random_state=self.random_state,
                                     verbose=False).fit(X_tr, len_tr)
                fold_logLs.append(fitted.score(X_te, len_te))
            mean_logL = np.mean(fold_logLs)
            if mean_logL > winner_mean:
                winner = fitted
                winner_mean = mean_logL
        except:
            pass
    return winner
def select(self):
    """CV selector: best average fold log-likelihood wins; the winning
    candidate's last-fold model is returned.

    Fixes:
    * the loop stopped one short of ``max_n_components`` (range upper bound
      is exclusive), so the largest candidate was never evaluated;
    * when every fold failed, ``np.average([])`` produced NaN — such
      candidates are now skipped explicitly.

    Words with a single sequence are skipped (no folds possible).
    :return: GaussianHMM object, or None.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_model = None
    best_score = float("-inf")
    for num_states in range(self.min_n_components, self.max_n_components + 1):
        if len(self.sequences) == 1:
            continue
        split_method = KFold(
            n_splits=len(self.sequences) if len(self.sequences) < 3 else 3)
        iter_scores = []
        for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
            try:
                X, L = combine_sequences(cv_train_idx, self.sequences)
                hmmmodel = GaussianHMM(n_components=num_states,
                                       covariance_type="diag",
                                       n_iter=1000,
                                       random_state=self.random_state,
                                       verbose=False).fit(X, L)
                X_test, L_test = combine_sequences(cv_test_idx, self.sequences)
                iter_scores.append(hmmmodel.score(X_test, L_test))
            except:
                continue
        if not iter_scores:
            continue  # all folds failed for this count
        avg_iter_score = np.average(iter_scores)
        if avg_iter_score > best_score:
            best_model = hmmmodel
            best_score = avg_iter_score
    return best_model
def select(self):
    """Select the best model for ``self.this_word`` based on the BIC score
    (BIC = -2*logL + p*log(N); lower is better) for n between
    min_n_components and max_n_components.

    :return: GaussianHMM object, or None when nothing trains.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    chosen, lowest_bic = None, float("inf")
    for states in range(self.min_n_components, self.max_n_components + 1):
        try:
            fitted = GaussianHMM(n_components=states,
                                 covariance_type="diag",
                                 n_iter=1000,
                                 random_state=self.random_state,
                                 verbose=False).fit(self.X, self.lengths)
            logL = fitted.score(self.X, self.lengths)
            # p = n^2 + 2*n*n_features - 1 free parameters.
            free_params = states * states + 2 * states * self.X.shape[1] - 1
            bic = -2 * logL + free_params * np.log(len(self.X))
            if bic < lowest_bic:
                lowest_bic, chosen = bic, fitted
        except:
            pass
    return chosen
def select(self):
    """CV selector: average K-fold held-out log-likelihood per state count;
    the winning count's last-fold model is returned.

    :return: GaussianHMM object, or None when nothing trains.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    kfold = KFold(n_splits=max(2, min(3, len(self.sequences))))
    top_score, top_model = float('-inf'), None
    for states in range(self.min_n_components, self.max_n_components + 1):
        fold_logLs = []
        try:
            for train_ids, test_ids in kfold.split(self.sequences):
                X_tr, len_tr = combine_sequences(train_ids, self.sequences)
                X_te, len_te = combine_sequences(test_ids, self.sequences)
                hmm_model = GaussianHMM(n_components=states,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(X_tr, len_tr)
                fold_logLs.append(hmm_model.score(X_te, len_te))
            mean_logL = np.mean(fold_logLs)
            if top_score is None or mean_logL > top_score:
                top_score, top_model = mean_logL, hmm_model
        except:
            continue
    return top_model
def select(self):
    """Select the best model for ``self.this_word`` based on the BIC score
    for n between min_n_components and max_n_components.

    BIC = -2*logL + p*logN where p = n^2 + 2*n*n_features - 1 and N is the
    number of data points; the lowest score wins.

    :return: GaussianHMM object, or None when nothing trains.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    lowest = float('inf')
    selected = None
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            fitted = GaussianHMM(n_components=n,
                                 covariance_type="diag",
                                 n_iter=1000,
                                 random_state=self.random_state,
                                 verbose=False).fit(self.X, self.lengths)
            log_likelihood = fitted.score(self.X, self.lengths)
            n_features = len(self.X[0])
            penalty = (n ** 2 + 2 * n * n_features - 1) * math.log(len(self.X))
            bic = -2 * log_likelihood + penalty
            if bic < lowest:
                lowest, selected = bic, fitted
        except:
            continue
    return selected
def select(self):
    """CV selector with the ``n_constant`` base model as fallback for words
    with fewer than three sequences.

    The winning state count is re-fitted on all data via ``self.base_model``.
    NOTE(review): if every fold for a count fails, ``np.average([])`` is NaN
    and the comparison is simply False — relied upon, but worth confirming.

    :return: GaussianHMM object.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    # TODO implement model selection using CV
    best_score = float("-inf")
    best_model = self.base_model(self.n_constant)
    if len(self.sequences)<3:
        # Not enough sequences to fold — keep the constant-n fallback.
        return best_model
    for n_components in range(self.min_n_components, self.max_n_components + 1):
        splits = KFold(min(3, len(self.sequences))).split(self.sequences)
        scores = []
        for train, test in splits:
            train_X, train_lengths = combine_sequences(train, self.sequences)
            try:
                hmm_model = GaussianHMM(n_components=n_components,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(train_X, train_lengths)
                test_X, test_lengths = combine_sequences(test, self.sequences)
                scores.append(hmm_model.score(test_X, test_lengths))
            except:
                # Skip folds that fail to train/score.
                pass
        if np.average(scores) > best_score:
            best_score = np.average(scores)
            # Re-fit the candidate on all data before keeping it.
            best_model = self.base_model(n_components)
    return best_model
def select(self):
    """Select the best model for ``self.this_word`` based on the BIC score
    for n between min_n_components and max_n_components.

    Candidate models are keyed by BIC in a dict and the minimum-BIC model
    is returned (None when nothing trains).

    NOTE(review): the penalty uses ``np.log(len(self.lengths))`` — the log
    of the *sequence count*, not the data-point count used by most BIC
    implementations here — confirm whether that is intentional.

    :return: GaussianHMM object, or None.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    # implement model selection based on BIC scores
    if self.verbose:
        print("="*10,"BIC","="*50);
    scores = {};
    for n in range(self.min_n_components,self.max_n_components+1):
        if self.verbose:
            print("=== n = %d" % n);
        try:
            hmm_model = GaussianHMM(n_components=n, covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(self.X, self.lengths)
            if self.verbose:
                print("model created for {} with {} states".format(self.this_word, n))
            #BIC score
            #number of features is 4 for all, i.e,
            #grnd-ry, grnd-rx, grnd-ly, grnd-lx
            #norm-rx, norm-ry, norm-lx,norm-ly
            #delta-rx, delta-ry, delta-lx, delta-ly
            #norm-polar-rr, norm-polar-rtheta, norm-polar-lr, norm-polar-ltheta
            n_features = len(self.X[0]); #was hard coded to 4
            # p = free parameters: n^2 + 2*n*n_features - 1
            p = n**2 + 2*n*n_features - 1;
            s = -2 * hmm_model.score(self.X,self.lengths) + p*np.log(len(self.lengths));
            if self.verbose:
                print("Size X,lengths is %.2f,%.2f" % (len(self.X),len(self.lengths)));
                print("Score is: %f" % s);
            # Keyed by score; min key is retrieved at the end.
            scores[s] = hmm_model;
        except:
            if self.verbose:
                print("failure on {} with {} states".format(self.this_word, n))
    return scores[min(scores.keys())] if len(scores) > 0 else None;
def select(self):
    """CV selector; words with fewer than three sequences fall back to the
    ``n_constant`` base model.

    The mean held-out log-likelihood over the successful folds decides the
    winner; the winning candidate's last fitted fold model is returned.

    :return: GaussianHMM object, or None when nothing trains.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    if len(self.sequences) < 3:
        return self.base_model(self.n_constant)
    top_mean = float("-inf")
    top_model = None
    for n in range(self.min_n_components, self.max_n_components + 1):
        folds = KFold()
        total, successes = 0, 0
        for fit_ids, eval_ids in folds.split(self.sequences):
            try:
                X, lens = combine_sequences(fit_ids, self.sequences)
                model = GaussianHMM(n_components=n,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(X, lens)
                X, lens = combine_sequences(eval_ids, self.sequences)
                total += model.score(X, lens)
                successes += 1
            except:
                continue
        mean = total / successes if successes else float("-inf")
        if mean > top_mean:
            top_mean = mean
            top_model = model
    return top_model
def select(self):
    """CV selector: best average fold log-likelihood wins; falls back to
    the ``n_constant`` base model when no candidate trains (or the word has
    fewer than three sequences).

    Fixes two NameErrors that made every fold fail silently (the ``break``
    inside ``except`` swallowed them): ``GaussianHMM(nComponents=
    n_components)`` used a nonexistent keyword and an undefined variable,
    and ``inst.random_state`` referenced an undefined ``inst``.

    :return: GaussianHMM object.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    bestScore = float("-inf")
    bestModel = None
    for nComponents in range(self.min_n_components, self.max_n_components + 1):
        scores = []
        nSplits = 3
        model = None
        if len(self.sequences) < nSplits:
            break  # too few sequences to fold; fallback below
        splitMethod = KFold(random_state=self.random_state, n_splits=nSplits)
        for cv_train_ids, cv_test_ids in splitMethod.split(self.sequences):
            X_train, lengths_train = combine_sequences(cv_train_ids, self.sequences)
            X_test, lengths_test = combine_sequences(cv_test_ids, self.sequences)
            try:
                # bug fix: was GaussianHMM(nComponents=n_components, ...,
                # random_state=inst.random_state) — two undefined names.
                model = GaussianHMM(n_components=nComponents,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(X_train, lengths_train)
                scores.append(model.score(X_test, lengths_test))
            except Exception as e:
                break
        av = np.average(scores) if len(scores) > 0 else float("-inf")
        if av > bestScore:
            bestScore, bestModel = av, model
    return bestModel if bestModel is not None else self.base_model(self.n_constant)
def select(self):
    """Select the best model for ``self.this_word`` based on the BIC score
    for n between min_n_components and max_n_components.

    Fix: the loop excluded ``max_n_components`` even though the docstring
    promises an inclusive range.

    NOTE(review): the emission-parameter count uses the diagonals of
    ``means_``/``covars_`` (length min(n_states, n_features)); the usual
    count is 2 * n_states * n_features.  Left as-is to preserve the
    original scoring — confirm intent before changing.

    :return: GaussianHMM object, or None when nothing trains.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_model = None
    best_score = float("inf")
    for num_states in range(self.min_n_components, self.max_n_components + 1):
        try:
            hmmmodel = GaussianHMM(n_components=num_states,
                                   covariance_type="diag",
                                   n_iter=1000,
                                   random_state=self.random_state,
                                   verbose=False).fit(self.X, self.lengths)
            logL = hmmmodel.score(self.X, self.lengths)
            # Free parameters: initial state probs + transitions + emissions.
            initialStateProbs = num_states
            transitionProbs = num_states * (num_states - 1)
            emissionProbs = len(np.diagonal(hmmmodel.means_)) + len(
                np.diagonal(hmmmodel.covars_))
            p = initialStateProbs + transitionProbs + emissionProbs
            BIC_Score = -2 * logL + p * np.log(len(self.X))
            if BIC_Score < best_score:
                best_score = BIC_Score
                best_model = hmmmodel
        except:
            continue
    return best_model
def select(self):
    """CV selector with the ``n_constant`` base model as fallback for words
    with fewer than three sequences (or when every candidate fails).

    :return: GaussianHMM object.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    chosen = self.base_model(self.n_constant)
    chosen_logL = float('-inf')
    if len(self.sequences) < 3:
        return chosen
    kfold = KFold()
    for states in range(self.min_n_components, self.max_n_components + 1):
        try:
            fold_logLs = []
            for fit_ids, eval_ids in kfold.split(self.sequences):
                X_fit, len_fit = combine_sequences(fit_ids, self.sequences)
                X_eval, len_eval = combine_sequences(eval_ids, self.sequences)
                candidate = GaussianHMM(n_components=states,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(X_fit, len_fit)
                fold_logLs.append(candidate.score(X_eval, len_eval))
            mean_logL = sum(fold_logLs) / len(fold_logLs)
            if mean_logL > chosen_logL:
                chosen = candidate
                chosen_logL = mean_logL
        except:
            continue
    return chosen
def cv_loop(num_components):
    """CV loop helper function.

    Returns the mean K-fold held-out log-likelihood for a candidate state
    count, or -inf when no fold succeeds.

    NOTE(review): references ``self`` without taking it as a parameter —
    this must be a closure nested inside a selector method; it will not
    work as a free function.
    """
    logLs = []
    # I thought I needed to do something like this (as it was failing for FISH) but I confirmed it using the forums: https://discussions.udacity.com/t/selectorcv-fails-to-train-fish/338796
    split_method = KFold(n_splits=min(3, len(self.sequences)))
    # for each fold
    for cv_train_idx, cv_test_idx in split_method.split(
            self.sequences):
        try:
            # we get X and lengths for both train and test set
            X_train, lengths_train = combine_sequences(
                cv_train_idx, self.sequences)
            X_test, lengths_test = combine_sequences(
                cv_test_idx, self.sequences)
            # we train the model
            current_model = GaussianHMM(n_components=num_components,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(
                                            X_train, lengths_train)
            # and we append the logL to our list
            logLs.append(current_model.score(X_test, lengths_test))
        except:
            # copied from the function above (base_model)
            if self.verbose:
                print(
                    "failure on {} with {} states, continuing".format(
                        self.this_word, num_components))
            continue
    # if we found at least one logL we return the average
    if len(logLs) > 0:
        return (sum(logLs) / len(logLs))
    else:
        return float('-Inf')
def select(self):
    """Select the best model for ``self.this_word`` based on the BIC score
    for n between min_n_components and max_n_components.

    Fixes:
    * the loop excluded ``max_n_components`` despite the docstring's
      inclusive range;
    * ``logN`` was computed from ``len(self.lengths)`` (the number of
      sequences) while the adjacent comment says N is the number of data
      points — it now uses ``len(self.X)`` like the other BIC selectors.

    :return: GaussianHMM object, or None when nothing trains.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best = (None, float('inf'))  # Tuple (model, BIC score)
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            # Train HMM
            model = GaussianHMM(n_components=n, n_iter=1000,
                                random_state=self.random_state).fit(self.X, self.lengths)
            logL = model.score(self.X, self.lengths)
            logN = np.log(len(self.X))  # N is number of data points
            p = n ** 2 + 2 * n * model.n_features - 1  # p is number of parameters
            # Calculate BIC (Bayesian Information Criteria) score
            score = -2 * logL + p * logN
            # If BIC score is better than previous best, store model and the score
            if score < best[1]:
                best = model, score
        except:
            pass
    return best[0]
def select(self):
    """CV selector using up to 5 folds; the state count with the highest
    mean held-out log-likelihood wins and its last-fold model is returned.

    :return: GaussianHMM object, or None when nothing trains.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    folds = KFold(n_splits=max(2, min(5, len(self.sequences))))
    best_avg = None
    winner = None
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            per_fold = []
            for fit_ids, eval_ids in folds.split(self.sequences):
                fit_X, fit_len = combine_sequences(fit_ids, self.sequences)
                eval_X, eval_len = combine_sequences(eval_ids, self.sequences)
                hmm_model = GaussianHMM(n_components=n,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(fit_X, fit_len)
                per_fold.append(hmm_model.score(eval_X, eval_len))
            avg = np.mean(per_fold)
            if best_avg is None or avg > best_avg:
                best_avg = avg
                winner = hmm_model
        except:
            continue
    return winner
def run_model(self, n_components, X_train, lengths_train, X_test, lengths_test):
    """Fit a diagonal-covariance GaussianHMM on the training split and
    return its log-likelihood on the test split.

    :param n_components: number of HMM states
    :param X_train: training observations
    :param lengths_train: per-sequence lengths for X_train
    :param X_test: held-out observations
    :param lengths_test: per-sequence lengths for X_test
    :return: held-out log-likelihood (float)
    """
    fitted = GaussianHMM(n_components=n_components,
                         covariance_type="diag",
                         n_iter=1000,
                         random_state=self.random_state,
                         verbose=False).fit(X_train, lengths_train)
    return fitted.score(X_test, lengths_test)
def select(self):
    """Select a model by k-fold cross-validation when at least three
    sequences exist; otherwise fall back to BIC on the full data.

    :return: best GaussianHMM object, or None if nothing could be trained
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_score = float('-inf')
    best_model = None

    if len(self.sequences) >= 3:
        n_splits = min(3, len(self.sequences))
        splits = KFold(n_splits)
        for n in range(self.min_n_components, self.max_n_components + 1):
            # BUGFIX: the fold accumulators must reset for every n.  They
            # were previously initialized once outside this loop, so the
            # "average" mixed fold scores of different state counts.
            sum_score = 0.0
            counter = 0.0
            try:
                for train_index, test_index in splits.split(self.sequences):
                    train_X, train_lengths = combine_sequences(train_index, self.sequences)
                    test_X, test_lengths = combine_sequences(test_index, self.sequences)
                    model = GaussianHMM(n_components=n, covariance_type="diag",
                                        n_iter=1000, random_state=self.random_state,
                                        verbose=False).fit(train_X, train_lengths)
                    sum_score += model.score(test_X, test_lengths)
                    counter += 1
                # Average CV log likelihood for this state count.
                average_score = sum_score / counter
                if average_score > best_score:
                    best_score = average_score
                    best_model = model
            except Exception:
                continue
    else:
        # Too few sequences to fold: pick the lowest BIC on the full data.
        best_score_1 = float('inf')
        best_model = None
        for n in range(self.min_n_components, self.max_n_components + 1):
            try:
                model = GaussianHMM(n_components=n, covariance_type="diag",
                                    n_iter=1000, random_state=self.random_state,
                                    verbose=False).fit(self.X, self.lengths)
                logL = model.score(self.X, self.lengths)
                p = n ** 2 + 2 * n * len(self.X[0]) - 1  # free parameters
                N = len(self.X)                          # number of data points
                score_1 = -2 * logL + p * np.log(N)      # BIC: lower is better
                if score_1 < best_score_1:
                    best_score_1 = score_1
                    best_model = model
            except Exception:
                continue
    return best_model
def select(self):
    """Choose the state count by k-fold cross-validation (when at least
    two sequences exist) and return a model with that count retrained on
    all of the data.

    :return: best GaussianHMM object, or None if nothing could be trained
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    max_score = None
    max_model = None
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            all_score = 0.0
            qty = 0
            final_model = None
            if len(self.sequences) >= 2:
                # Generate K folds (at most 3)
                folds = min(len(self.sequences), 3)
                split_method = KFold(shuffle=True, n_splits=folds)
                for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                    # BUGFIX: dropped the np.asarray() wrappers — wrapping
                    # the (X, lengths) tuple in an array was pointless and
                    # raises on ragged data with modern numpy.
                    X_train, lengths_train = combine_sequences(cv_train_idx, self.sequences)
                    X_test, lengths_test = combine_sequences(cv_test_idx, self.sequences)
                    # Fit on the train fold, score on the test fold.
                    model = GaussianHMM(n_components=n, covariance_type="diag",
                                        n_iter=1000, random_state=self.random_state,
                                        verbose=False).fit(X_train, lengths_train)
                    all_score += model.score(X_test, lengths_test)
                    qty += 1
                score = all_score / qty
            else:
                # Too few sequences to fold: fit and score on everything.
                final_model = GaussianHMM(n_components=n, covariance_type="diag",
                                          n_iter=1000, random_state=self.random_state,
                                          verbose=False).fit(self.X, self.lengths)
                # BUGFIX: this branch previously scored the undefined name
                # `model`, raising NameError that the bare except swallowed,
                # so small-sample words never produced a model.
                score = final_model.score(self.X, self.lengths)
            # Keep the best score, retraining the winner on the full data.
            if max_score is None or max_score < score:
                max_score = score
                if final_model is None:
                    final_model = GaussianHMM(n_components=n, covariance_type="diag",
                                              n_iter=1000, random_state=self.random_state,
                                              verbose=False).fit(self.X, self.lengths)
                max_model = final_model
        except Exception:
            pass
    return max_model
class HMM:
    """Thin wrapper around a diagonal-covariance GaussianHMM for 1-D data:
    train, evaluate, and plot the decoded state sequence."""

    # BUGFIX: the original slots listed only "model", but train() also
    # assigns self.data and self.hidden_states, raising AttributeError.
    __slots__ = ["model", "data", "hidden_states"]

    def __init__(self):
        pass

    def draw(self, data):
        """Plot the raw series."""
        figure()
        plot(range(len(data)), data, alpha=0.8, color='red')
        show()

    def train(self, data, n_components):
        """Fit the HMM on `data` (a 1-D sequence) and store the decoded
        Viterbi state path in self.hidden_states."""
        print("Training Data: %s" % data)
        self.data = data
        self.model = GaussianHMM(n_components, algorithm='viterbi',
                                 covariance_type='diag')
        X = np.reshape(data, (len(data), 1))
        # NOTE(review): fit([X]) is the pre-0.2 hmmlearn API; newer versions
        # expect fit(X, lengths) — confirm the installed hmmlearn version.
        self.model = self.model.fit([X])
        self.hidden_states = self.model.predict(X)
        # BUGFIX: the format string lacked a %s conversion, so the old
        # line raised TypeError at runtime.
        print("Sequence of States: %s" % self.hidden_states)

    def eval(self, obs):
        """Print the log likelihood of `obs` under the trained model."""
        print("Testing Data: %s" % obs)
        X = np.reshape(obs, (len(obs), 1))
        print("Eval: %s" % str(self.model.score(X)))

    def plot(self):
        """Scatter the training data, colored by decoded hidden state."""
        fig = figure(facecolor="white")
        ax = fig.add_subplot(111)
        for i in range(self.model.n_components):
            # use fancy indexing to plot data in each state
            idx = (self.hidden_states == i)
            ax.plot(np.array(range(len(self.data)))[idx],
                    np.array(self.data)[idx],
                    '.', label="State %d" % (i + 1))
        ax.legend()
        show()
class HmmClassifier():
    """Classify inputSeq by which reference sequence's fitted HMM assigns
    it the highest log likelihood."""

    def __init__(self, referenceSeqs, inputSeq):
        self.referenceSeqs = referenceSeqs
        self.inputSeq = inputSeq
        # feel free to change this model
        self.model = GaussianHMM(n_components=2, covariance_type="full",
                                 n_iter=2000)

    def predict(self):
        """Return the index of the reference sequence whose fitted model
        scores self.inputSeq highest."""
        probs = []
        for referenceSeq in self.referenceSeqs:
            # Refit the shared model on each reference in turn.
            self.model.fit(referenceSeq)
            # (Removed an unused predict() call that only spent a Viterbi
            # decode per reference without affecting the result.)
            prob = self.model.score(self.inputSeq)
            probs.append(prob)
        # return the index of the max prob
        return probs.index(max(probs))
def select(self):
    """Select the best model by k-fold cross-validated average log
    likelihood, falling back to the n_constant base model for a single
    sequence or when every CV attempt fails.

    :return: GaussianHMM object
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    record = float("-inf")
    # Cap the state count by the shortest sequence length.
    min_seq = min(len(seq) for seq in self.sequences)
    self.max_n_components = min(self.max_n_components, min_seq)
    hmm_model = self.base_model(self.n_constant)

    if len(self.sequences) == 1:
        # A single sequence cannot be folded.
        return hmm_model
    elif len(self.sequences) == 2:
        split_method = KFold(n_splits=2)
    else:
        # BUGFIX: random_state without shuffle=True is meaningless and
        # raises ValueError in modern scikit-learn, so it is not passed.
        split_method = KFold(n_splits=3)

    for num in range(self.min_n_components, self.max_n_components + 1):
        logL = 0
        cnt = 0
        for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
            X, lengths = combine_sequences(cv_train_idx, self.sequences)
            try:
                model = GaussianHMM(n_components=num, n_iter=1000).fit(X, lengths)
                X, lengths = combine_sequences(cv_test_idx, self.sequences)
                logL += model.score(X, lengths)
                # BUGFIX: cnt was never incremented, so the `cnt > 0`
                # guard below could never pass and the CV result was
                # always discarded in favor of the base model.
                cnt += 1
            except Exception:
                continue
        if cnt > 0 and logL / cnt > record:
            # BUGFIX: store the average (not the raw sum) so state counts
            # with different numbers of successful folds compare fairly.
            record = logL / cnt
            hmm_model = model
    return hmm_model
class HMM(object):
    """Match a feature pattern against five pre-trained gesture HMMs
    (go / back / right / left / stop), each loaded from a .bin file."""

    def __init__(self):
        def load_patterns(file):
            """Read whitespace-separated float rows from `file` and stack
            them into a single 2-D array (one row per line)."""
            # BUGFIX: removed the dead `sizes`/`counter` bookkeeping, which
            # referenced an undefined name `words` and raised NameError on
            # the very first load.  The file is now closed via `with`.
            with open(file, 'rb') as f:
                data = f.readlines()
            stack = []
            for i in range(np.shape(data)[0]):
                # list() keeps this working on Python 3, where map() is lazy.
                row = np.reshape(list(map(float, data[i].split())), (1, -1))
                if i == 0:
                    stack = row
                else:
                    stack = np.vstack((stack, row))
            return stack

        hidden = 1  # one hidden state per gesture model
        self.go_model = GaussianHMM(n_components=hidden, covariance_type="diag",
                                    n_iter=10000).fit(load_patterns('go.bin'))
        self.back_model = GaussianHMM(n_components=hidden, covariance_type="diag",
                                      n_iter=10000).fit(load_patterns('back.bin'))
        self.right_model = GaussianHMM(n_components=hidden, covariance_type="diag",
                                       n_iter=10000).fit(load_patterns('right.bin'))
        self.left_model = GaussianHMM(n_components=hidden, covariance_type="diag",
                                      n_iter=10000).fit(load_patterns('left.bin'))
        self.stop_model = GaussianHMM(n_components=hidden, covariance_type="diag",
                                      n_iter=10000).fit(load_patterns('stop.bin'))
        self.number_of_components = 5

    def match(self, pattern):
        """Return the index of the best-matching model
        (0=go, 1=back, 2=right, 3=left, 4=stop), or -1 when no model
        clears the confidence threshold."""
        obs = np.reshape(pattern, (1, -1))
        models = (self.go_model, self.back_model, self.right_model,
                  self.left_model, self.stop_model)
        probabilities = np.zeros(5)
        for i, m in enumerate(models):
            probabilities[i] = m.score(obs)
        # Scores are log likelihoods; the original compares magnitudes.
        probabilities = abs(probabilities)
        index, error = min(enumerate(probabilities), key=lambda x: x[1])
        if error < 9500:  # empirical confidence threshold
            return index
        return -1
# Score every recorded test word against each trained per-word HMM and
# report the recognition accuracy for this test folder. (Python 2 syntax.)
tot_words = len(correct_answers)
right = 0.0
threshold = 1.5  # tolerated word-length deviation, in standard deviations
for i in xrange(tot_words):
    try:
        (rate,sig) = wav.read('Test/'+test_folder+"/word" + str(i) + ".wav")
        features = get_features(sig)
        word_len = len(features)
        ans = -1        # index of the best-scoring model so far
        j = -1
        max_ans = -1e9  # best log likelihood so far
        for model in models:
            j = j+1
            # Only score models whose typical word length is compatible
            # with this utterance (cheap pre-filter before model.score).
            if math.fabs(word_len - means[j]) <= threshold * std_devs[j]:
                temp = model.score(features)
                if temp>max_ans:
                    max_ans = temp
                    ans = j
        #print max_ans
        print str(i+1)+". Detected word: "+spoken[ans]
        if spoken[ans] == correct_answers[i][0]:
            right = right + 1
    except:
        # NOTE(review): any failure (e.g. missing .wav) aborts the whole
        # evaluation loop, not just this word — confirm that is intended.
        break
cur_accuracy = (right/tot_words)*100
print "Accuracy = "+str(cur_accuracy)+"%"
if cur_accuracy > accuracy:
    # NOTE(review): the body of this conditional continues beyond this chunk.
# Fit a 3-state Gaussian HMM to S&P 500 return data and inspect the
# resulting market regimes.
spx_price = spx_price['Close']
spx_price.name = 'SPX Index'
# NOTE(review): this computes p[t-1] / p[t] - 1, which is the *inverse* of
# the conventional simple return p[t] / p[t-1] - 1 (i.e. pct_change()) —
# confirm the sign convention is intentional before interpreting regimes.
spx_ret = spx_price.shift(1)/ spx_price[1:] - 1
spx_ret.dropna(inplace=True)
#spx_ret = spx_ret * 1000.0
rets = np.column_stack([spx_ret])

# Create the Gaussian Hidden markov Model and fit it
# to the SPY returns data, outputting a score
hmm_model = GaussianHMM(
    n_components=3,         # number of states
    covariance_type="full", # full covariance matrix vs diagonal
    n_iter=1000             # number of iterations
).fit(rets)
print("Model Score:", hmm_model.score(rets))

# Plot the in sample hidden states closing values
# Predict the hidden states array
hidden_states = hmm_model.predict(rets)
# NOTE(review): with 3 states this prints the mean state label, not the
# share of any single state — verify the intended metric.
print('Percentage of hidden state 1 = %f' % (sum(hidden_states)/len(hidden_states)))
print("Transition matrix")
print(hmm_model.transmat_)
print("Means and vars of each hidden state")
for i in range(hmm_model.n_components):
    # 0 is down, 1 is up
    print("{0}th hidden state".format(i))
    print("mean = ", hmm_model.means_[i])
    print("var = ", np.diag(hmm_model.covars_[i]))
# Train one HMM per group: concatenate each group's smoothed parameter
# traces, fit, and persist the data, lengths, and model. (Python 2 syntax.)
for grp, files in filesorter.groupby('group'):
    list_of_datasets = []
    lengths = []
    for fn in files.index:
        try:
            # NOTE(review): DataFrame.ix is deprecated/removed in modern
            # pandas — .loc is the replacement; confirm pandas version.
            fbf = pd.read_pickle(files.ix[fn]['filepath'])
            # One column per smoothed parameter, one row per time step.
            x_ = np.column_stack(fbf[ i +'_smoothed'] for i in parameters)
            list_of_datasets.append(x_)
            lengths.append(len(x_))
        except:
            print 'failed to complete: ', grp, fn
    X = np.concatenate(list_of_datasets)
    # Persist the stacked observations and per-file lengths for reuse below.
    np.save(DATADIR + 'HMM_JAR/' + grp +'_X.npy', X)
    np.save(DATADIR + 'HMM_JAR/' + grp +'_lengths.npy', lengths)
    model = GaussianHMM(n_components=n_components, covariance_type="diag", n_iter=1000).fit(X, lengths)
    likelihoods.append(model.score(X))
    joblib.dump(model, DATADIR + 'HMM_JAR/' + grp +'_model.pkl')
    groups.append(grp)

# MAKE ONE MODEL PER TREATMENT:
# Pool the saved per-group arrays whose filenames match each treatment.
for grp in treatments:
    list_of_datasets = []
    for data in glob.glob( DATADIR + 'HMM_JAR/*'+ grp +'_X.npy'):
        list_of_datasets.append(np.load(data))
    X = np.concatenate(list_of_datasets)
    lengths = []
    for l in glob.glob( DATADIR + 'HMM_JAR/*'+ grp +'_lengths.npy'):
        lengths.append(np.load(l))
    # NOTE(review): the treatment-level model fit appears to continue
    # beyond this chunk.
    lengths = np.concatenate(lengths)