def select(self):
    """Select the best model by Discriminative Information Criterion (DIC).

    For every state count in [min_n_components, max_n_components] a
    diagonal-covariance GaussianHMM is fitted on ``self.X`` / ``self.lengths``
    and scored as::

        DIC = logL(this word) - mean(logL(other words))

    The mean runs over the entries of ``self.hwords`` (other than
    ``self.this_word``) that the candidate managed to score.  When none could
    be scored the penalty term is dropped, so DIC equals the word's own
    log-likelihood.

    Returns:
        The fitted model with the highest DIC, or None when no candidate
        could be fitted and scored.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    best_dic = float("-inf")
    best_model = None

    for n_states in range(self.min_n_components, self.max_n_components + 1):
        try:
            candidate = GaussianHMM(n_components=n_states,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(self.X, self.lengths)
            own_log_l = candidate.score(self.X, self.lengths)
        except Exception:
            # This state count cannot be fitted/scored; try the next one.
            continue

        others_total = 0
        others_scored = 0
        for word in self.hwords:
            if word == self.this_word:
                continue
            other_X, other_lengths = self.hwords[word]
            try:
                other_log_l = candidate.score(other_X, other_lengths)
            except Exception:
                # Words the candidate cannot score do not enter the average.
                continue
            others_total += other_log_l
            others_scored += 1

        # Explicit zero-competitor case: avoids a division by zero and keeps
        # DIC equal to the plain log-likelihood.
        if others_scored:
            penalty = (1.0 / others_scored) * others_total
        else:
            penalty = 0.0
        dic = own_log_l - penalty

        if dic > best_dic:
            best_dic = dic
            best_model = candidate

    return best_model
def hmmmodel(seq):
    """Fit a 2-state GaussianHMM on ``seq`` and decode it.

    Returns:
        A ``(model, hidden_states)`` pair: the fitted model and the state
        sequence it predicts for ``seq``.
    """
    fitted = GaussianHMM(n_components=2, n_iter=1000)
    fitted.fit(seq)
    return fitted, fitted.predict(seq)
import numpy as np
from hmmlearn.hmm import GaussianHMM

from convert_to_timeseries import convert_data_to_timeseries

# Load data from input file
input_file = 'data_hmm.txt'
data = np.loadtxt(input_file, delimiter=',')

# Arrange data for training: third column only, as an (n, 1) matrix
X = np.column_stack([data[:, 2]])

# Create and train Gaussian HMM.
# print() is called with a single pre-formatted string so the output is the
# same on Python 2 and Python 3.
print("\nTraining HMM....")

num_components = 4
model = GaussianHMM(n_components=num_components,
                    covariance_type="diag",
                    n_iter=1000)
model.fit(X)

# Predict the hidden states of HMM
hidden_states = model.predict(X)

print("\nMeans and variances of hidden states:")
for i in range(model.n_components):
    print("\nHidden state %s" % (i + 1))
    print("Mean = %s" % round(model.means_[i][0], 3))
    print("Variance = %s" % round(np.diag(model.covars_[i])[0], 3))

# Generate data using model
num_samples = 1000
samples, _ = model.sample(num_samples)
a = line.split() b = a[9:10] # 这是选取需要读取的位数 train_date.append(b) # 将其添加在列表之中 line = f.readline() f.close() train_y = torch.unsqueeze(trainData[:, 8], 1) # print(train_x) # print(train_y) # 建立神经网络,该网络有两个隐藏层,激活函数使用ReLU() if __name__ == "__main__": # 迭代次数 train_x = features_init(train_x) model = GaussianHMM(n_components=3, covariance_type='diag', n_iter=1000).fit(train_x) # import pickle # output = open('./model/modelWorkTime_HMMOPS.pth', 'wb') # s = pickle.dump(model, output) # output.close() # test_x = Variable(torch.FloatTensor(predictData)) # # 为了归一化测试数据,需要载入历史数据 # trainData = txt_to_numpy(args.train_dir, batch_n, input_data + 1) # # trainData = np.load(args.train_dir) # trainData = torch.tensor(trainData, dtype=torch.float32) # trainData = Variable(trainData, requires_grad=False) # train_x = trainData[:, :8] # import pickle # # input = open("./model/modelWorkTime_HMMOPS.pth", 'rb')
print(x)

import matplotlib.pyplot as plt

# Third column of the CSV, kept as an (n, 1) observation matrix.
data = np.loadtxt('datasets/data_1D.txt', delimiter=',')
X = np.column_stack([data[:, 2]])

# Show the raw training series.
plt.plot(np.arange(X.shape[0]), X[:, 0], c='black')
plt.title('Training data')
plt.show()

from hmmlearn.hmm import GaussianHMM

num_components = 10
hmm = GaussianHMM(
    n_components=num_components,
    covariance_type='diag',
    n_iter=1000,
)
print('Training the Hidden Markov Model...')
hmm.fit(X)

# Report the first-feature mean and variance of every hidden state.
print('Means and variances:')
for i in range(hmm.n_components):
    state_mean = hmm.means_[i][0]
    state_variance = np.diag(hmm.covars_[i])[0]
    print('\nHidden state', i + 1)
    print('Mean =', round(state_mean, 2))
    print('Variance =', round(state_variance, 2))

# Draw a synthetic series from the fitted model and plot it.
num_samples = 1200
generated_data, _ = hmm.sample(num_samples)
plt.plot(np.arange(num_samples), generated_data[:, 0], c='black')
plt.title('Generated data')
def __init__(self, n_components):
    """Create the wrapped GaussianHMM.

    Args:
        n_components: number of hidden states passed to GaussianHMM.
    """
    hmm_options = {
        'n_components': n_components,
        'algorithm': 'map',
        'covariance_type': 'diag',
    }
    self.model = GaussianHMM(**hmm_options)
# # plt.subplot(411) # # plt.plot(data_T[0]) # # plt.subplot(412) # # plt.plot(data_T[1]) # # plt.subplot(413) # # plt.plot(data_T[2]) # # plt.subplot(414) # # plt.plot(data_T[3]) # # plt.show() with warnings.catch_warnings(): warnings.simplefilter("ignore") leng = [] for i in range(int(len(data) / 20)): leng.append(20) print(len(leng)) model = GaussianHMM(n_components=n, covariance_type="diag").fit(data, lengths=leng) joblib.dump(model, modelname) # model=joblib.load("a1/model3.pkl") f = model.n_features a = model.transmat_ pi = model.startprob_ mean = model.means_ cov = model.covars_ print(f) print(a) print(pi) print(mean) print(cov) with warnings.catch_warnings(): warnings.simplefilter("ignore")
x3 = np.concatenate((x3, data), axis=0) x3_lengths.append(data.shape[0]) for file in train_data4: data = np.loadtxt(file)[:, usedJoints] - 100 x4 = np.concatenate((x4, data), axis=0) x4_lengths.append(data.shape[0]) for file in train_data5: data = np.loadtxt(file)[:, usedJoints] - 100 x5 = np.concatenate((x5, data), axis=0) x5_lengths.append(data.shape[0]) #train Gaussian HMMs & define parameters for each gesture model1 = GaussianHMM(n_components=1, covariance_type='full', verbose=False).fit(x1[1:], x1_lengths) #4 model2 = GaussianHMM(n_components=3, covariance_type='diag', verbose=False).fit(x2[1:], x2_lengths) #3 model3 = GaussianHMM(n_components=4, covariance_type='diag', verbose=False).fit(x3[1:], x3_lengths) #4 model4 = GaussianHMM(n_components=1, covariance_type='full', verbose=False).fit(x4[1:], x4_lengths) #4 model5 = GaussianHMM(n_components=1, covariance_type='full', verbose=False).fit(x5[1:], x5_lengths) #9 #load test data
# Hides deprecation warnings for sklearn
warnings.filterwarnings('ignore')

csv_filepath = "/Users/xuhuili/Desktop/ST451_Bayesian_Machine_Learning/Project/data/VOO.csv"
pickle_path = "/Users/xuhuili/Desktop/ST451_Bayesian_Machine_Learning/Project/model/hmm_model_voo.pkl"
# csv_filepath = "/Users/xuhuili/Desktop/ST451_Bayesian_Machine_Learning/Project/data/UPRO.csv"
# pickle_path = "/Users/xuhuili/Desktop/ST451_Bayesian_Machine_Learning/Project/model/hmm_model_upro.pkl"

# Training period bounds passed to obtain_prices_df: 2011-04-29 to 2019-04-29
start_date = datetime.datetime(2011, 4, 29)
end_date = datetime.datetime(2019, 4, 29)
asset = obtain_prices_df(csv_filepath, start_date, end_date)
rets = np.column_stack([asset["Returns"]])

# Shows the histogram plot for the returns
_ = plt.hist(rets)
plt.show()

# Create the Gaussian Hidden Markov Model and fit it
# to the asset returns data, outputting a score
hmm_model = GaussianHMM(n_components=2,
                        covariance_type="full",
                        n_iter=1000).fit(rets)
print('Model Score: ', hmm_model.score(rets))

# Plot the in-sample hidden states closing values
plot_in_sample_hidden_states(hmm_model, asset)

print('Picking HMM model...')
# The context manager closes (and flushes) the pickle file even if dump fails.
with open(pickle_path, "wb") as pickle_file:
    pickle.dump(hmm_model, pickle_file)
print("...HMM model pickled.")
# 06-07
# 08-09
# 09-10
# 10-11
# 12-13
# 13-14
# 14-15

# Desired number of hidden states.
"Número de estados deseados"
Nc = 3

# Train the HMM and estimate the most likely state series.
" Se entrena el HMM y se estima la serie de estados probables"
wind_leap = wind.reshape(-1, 1)
model = GaussianHMM(
    n_components=Nc,
    covariance_type="diag",
    n_iter=1000,
).fit(wind_leap)
hidden_states = model.predict(wind_leap)

# State matrix: 27 rows of 120 states each, labels shifted to 1..Nc.
" Matriz de estados, donde cada fila es un año de estados"
state_matrix = np.reshape(hidden_states, (27, 120)) + 1

# Swap labels 1 and 3 (label 2 is left untouched).  The masks are taken
# before any assignment so the two relabelings cannot interfere.
was_three = state_matrix == 3
was_one = state_matrix == 1
state_matrix[was_three] = 1
state_matrix[was_one] = 3
# Two states
return 0 start = datetime.datetime(2013, 1, 1) end = pd.datetime.today() df = web.DataReader("GOOGL", 'google', start, end) datestart = '20130101' dateend = '20160101' # dates, close_v, volume_v, high_v, open_v, low_v = get_value_by_dates(df, datestart, dateend) # X = np.column_stack([close_v, volume_v, high_v, open_v, low_v]) X, dates, close_v, volume_v, high_v, open_v, low_v = get_value_by_dates( df, datestart, dateend) model = GaussianHMM(n_components=100, covariance_type="tied", n_iter=100, init_params='m', verbose=True).fit(X) hidden_states = model.predict(X) print(hidden_states) # print("Transition matrix") # print(model.transmat_) # print() print("Means and vars of each hidden state") for i in range(model.n_components): print("{0}th hidden state".format(i)) print("mean = ", model.means_[i]) print("var = ", np.diag(model.covars_[i])) print()
def clustering_for_features_selection(start_date, end_date):
    """Scan cluster counts, score an HMM for each, and keep the best.

    For every cluster count in ``range(10, n_feature_columns, 50)`` the
    features are built with ``make_features_for_tuning``, a GaussianHMM is
    fitted on the training part and the mean absolute error of
    ``validate_model`` against the target price is recorded.  Whenever a new
    lowest MAE is found, the selected feature names and the pickled model are
    written to disk.  Finally the MAE curve is saved as a PNG and shown.

    Args:
        start_date: first date passed to ``create_all_features``.
        end_date: last date passed to ``create_all_features``.
    """
    all_features_df, gold_price = create_all_features(start_date,
                                                      end_date,
                                                      is_training=False)
    n_components = 3  # TODO tuning
    input_days = 3  # TODO tuning
    n_clusters_list = list(range(10, len(all_features_df.columns), 50))
    print(n_clusters_list)

    mae_results = []
    best_mae = None  # lowest MAE seen so far
    # The results log is closed even when an iteration raises.
    with open('features/clustering_features_selection_results.txt',
              'w',
              encoding='utf-8') as results_file:
        for n_cluster in n_clusters_list:
            (training_x, test_x, past_price, target_price,
             selected_features_name_list) = make_features_for_tuning(
                 all_features_df, gold_price, n_cluster, input_days)

            model = GaussianHMM(n_components)
            model.fit(training_x)
            predict = validate_model(model, test_x, past_price)
            res_mae = mean_absolute_error(target_price, predict)

            if best_mae is None or best_mae > res_mae:
                best_mae = res_mae
                # Save features
                with open('features/clustering_selected_features.txt',
                          'w',
                          encoding='utf-8') as f:
                    f.write('{}, {}\n'.format(n_cluster, res_mae))
                    f.write(', '.join(selected_features_name_list))

                # Save model
                # TODO: fix pickle file name
                filename = 'model_kmeans_clustering_best.pkl'
                with open(filename, 'wb') as model_file:
                    pickle.dump(model, model_file)
                print('saved {}'.format(filename))

            mae_results.append(res_mae)
            print('mae for {} clusters with {}: {}'.format(
                n_cluster, len(selected_features_name_list), res_mae))
            results_file.write('mae for {} clusters: {}\n'.format(
                n_cluster, res_mae))

    plt.plot(n_clusters_list, mae_results, 'b-')
    plt.grid(which='both')
    plt.xticks(list(range(10, max(n_clusters_list), 50)))
    plt.yticks(list(range(0, int(max(mae_results)), 5)))
    plt.ylabel('MAE')
    plt.xlabel('number of clusters')
    # Save before show(): once the window is closed the current figure is
    # gone and savefig() would write an empty image.
    plt.savefig('features/clustering_features_selection_results.png')
    plt.show()
def select(self):
    """Select a model by cross-validated log-likelihood.

    Strategy by number of training sequences:

    * more than two: KFold cross-validation, averaging the held-out score;
    * exactly two: fit on each sequence, score on the other, average both;
    * exactly one: score ``self.base_model(n)`` on the training data itself.

    A failure for one state count is reported (when ``self.verbose``) and the
    search continues with the next count.

    Returns:
        The model for the best-scoring state count; falls back to
        ``self.base_model(self.min_n_components)`` when nothing scores.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    best_model = self.base_model(self.min_n_components)
    word_sequences = self.sequences
    n_sequences = len(word_sequences)
    # -inf rather than a large negative constant: long sequences can score
    # below any fixed sentinel.
    best_value = float("-inf")
    candidates = range(self.min_n_components, self.max_n_components + 1)

    def new_hmm(n_states):
        return GaussianHMM(n_components=n_states,
                           covariance_type="diag",
                           n_iter=1000,
                           random_state=self.random_state,
                           verbose=False)

    def report_failure(n_states):
        if self.verbose:
            print("failure on {} with {} states".format(
                self.this_word, n_states))

    if n_sequences > 2:
        split_method = KFold()
        for n_states in candidates:
            try:
                fold_scores = []
                for cv_train_idx, cv_test_idx in split_method.split(
                        word_sequences):
                    X_train, lengths_train = combine_sequences(
                        cv_train_idx, word_sequences)
                    X_test, lengths_test = combine_sequences(
                        cv_test_idx, word_sequences)
                    fold_model = new_hmm(n_states).fit(X_train, lengths_train)
                    fold_scores.append(fold_model.score(X_test, lengths_test))
                average_log = sum(fold_scores) / len(fold_scores)
                if best_value < average_log:
                    best_value = average_log
                    best_model = self.base_model(n_states)
            except Exception:
                report_failure(n_states)
        return best_model

    if n_sequences == 2:
        X_train, lengths_train = combine_sequences([0], word_sequences)
        X_test, lengths_test = combine_sequences([1], word_sequences)
        for n_states in candidates:
            try:
                forward = new_hmm(n_states).fit(X_train, lengths_train)
                backward = new_hmm(n_states).fit(X_test, lengths_test)
                average_log = (forward.score(X_test, lengths_test) +
                               backward.score(X_train, lengths_train)) / 2
                if average_log > best_value:
                    best_value = average_log
                    best_model = new_hmm(n_states).fit(self.X, self.lengths)
            except Exception:
                report_failure(n_states)
        return best_model

    if n_sequences == 1:
        for n_states in candidates:
            try:
                model = self.base_model(n_states)
                average_log = model.score(self.X, self.lengths)
                if best_value < average_log:
                    best_model = model
                    best_value = average_log
            except Exception:
                report_failure(n_states)
        return best_model

    return best_model
def select(self):
    """Select a model by K-fold cross-validated log-likelihood.

    Uses 3 folds, or 2 when only two sequences exist.  For each state count
    in [min_n_components, max_n_components] a diagonal-covariance GaussianHMM
    is fitted on every training split and scored on the matching test split;
    the scores of the folds that succeeded are averaged.  State counts for
    which no fold succeeded are skipped.

    Returns:
        The model from the last successful fold of the best state count, or
        None when fewer than two sequences are available (a message is
        printed) or no state count could be evaluated.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # K-fold needs at least two samples.
    if len(self.lengths) < 2:
        print("Number of samples is less than minimal number of kfolds")
        return None

    split_method = KFold(n_splits=min(len(self.lengths), 3))
    all_lengths = np.array(self.lengths)

    best_avg_log_l = float("-inf")
    best_model = None

    for n_states in range(self.min_n_components, self.max_n_components + 1):
        fold_scores = []
        last_scored_model = None

        for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
            train_frames, test_frames = [], []
            for idx in cv_train_idx:
                train_frames.extend(self.sequences[idx])
            for idx in cv_test_idx:
                test_frames.extend(self.sequences[idx])
            X_train, X_test = np.array(train_frames), np.array(test_frames)
            len_train = all_lengths[cv_train_idx]
            len_test = all_lengths[cv_test_idx]

            try:
                fold_model = GaussianHMM(n_components=n_states,
                                         covariance_type="diag",
                                         n_iter=1000,
                                         random_state=self.random_state,
                                         verbose=False).fit(
                                             X_train, len_train)
                fold_scores.append(fold_model.score(X_test, len_test))
            except Exception:
                # A failed fold contributes nothing to the average.
                continue
            last_scored_model = fold_model

        if not fold_scores:
            # Nothing to average; do not let an empty result look like a
            # perfect score of 0.
            continue

        # Divide by the number of folds that actually produced a score.
        avg_log_l = sum(fold_scores) / float(len(fold_scores))
        if avg_log_l > best_avg_log_l:
            best_avg_log_l = avg_log_l
            best_model = last_scored_model

    return best_model
matrix2 = []
lengths = []
lengths2 = []
a = 0

# Flatten the beats into one observation list, remembering each length.
for beat in beats:
    lengths.append(len(beat))
    matrix = matrix + beat

# Same flattening for the loudness values.
for beat_loudness in beat_loudnesses:
    lengths2.append(len(beat_loudness))
    matrix2 = matrix2 + beat_loudness

print("fitting to HMM and decoding ...", end="")
model = GaussianHMM(n_components=1,
                    covariance_type="spherical",
                    n_iter=1000).fit(np.atleast_2d(matrix).T, lengths)

# Read the beat times and store the difference between consecutive rows.
# The context manager closes the file even if a row fails to parse.
with open(
        '/home/ysj/Downloads/어쿠스틱 콜라보-그대와 나, 설레임 (Feat. 소울맨)_percussive_beat.csv',
        'r') as f:
    csvReader = csv.reader(f)
    test = []
    beat = 0.0
    for row in csvReader:
        test.append(float(row[0]) - float(beat))
        beat = row[0]
# Drop the first sample of each moving-average series.
ma20 = ma20[1:]
vma5 = vma5[1:]
vma10 = vma10[1:]
vma20 = vma20[1:]

# One column per feature.
X = np.column_stack([
    diff, open, high, close, low, volume,
    ma5, ma10, ma20,
    vma5, vma10, vma20,
])
print("观测值:")
print(X)

diff_v = diff.reshape(-1, 1)

# Fit a full-covariance HMM with n hidden states.
n = 4
model = GaussianHMM(
    n_components=n,
    n_iter=1000,
    covariance_type='full',
    tol=0.0001,
)
model = model.fit(X)

# Report the data shape and the fitted parameters.
print("样本量:")
print(X.shape)
print("给定的隐藏特征数目:")
print(n)
print("初始的隐藏状态概率π:")
print(model.startprob_)
print("状态转移矩阵A参数:")
print(model.transmat_)
print("估计均值:")
print(model.means_)
print("估计方差:")
print(model.covars_)
normed=1, facecolor='green', alpha=0.75) plt.show() # Observation sequences matrix A = np.column_stack([logDel, logRet_5, logVol_5]) # Rescaled observation sequences matrix rescaled_A = np.column_stack( [rescaled_boxcox_logDel, rescaled_logRet_5, rescaled_logVol_5]) # HMM modeling based on raw observation sequences model = GaussianHMM(n_components=3, covariance_type="full", n_iter=2000).fit([A]) hidden_states = model.predict(A) hidden_states # Plot the hidden states plt.figure(figsize=(25, 18)) for i in range(model.n_components): pos = (hidden_states == i) plt.plot_date(Date[pos], close[pos], 'o', label='hidden state %d' % i, lw=2) plt.legend(loc="left") # Trading test according to the hidden states
def _rows_between(frame, start, end):
    """Return the rows of frame whose 'update_date' lies in [start, end]."""
    dates = frame['update_date']
    return frame.loc[(dates >= start) & (dates <= end), :]


def _finite_or_zero(values):
    """Return values as a float array with NaN and +/-inf replaced by 0."""
    values = np.asarray(values, dtype=float)
    return np.where(np.isfinite(values), values, 0.0)


def _hmm_features(frame):
    """Build the three HMM feature columns for frame.

    Returns (r_1, r_5, r_range): the 1-step log return of 'close', the 5-step
    log return of 'close' and log(high) - log(low), all trimmed so that they
    line up with frame rows 5 onward.
    """
    log_close = np.log(np.asarray(frame['close'], dtype=float))
    log_high = np.log(np.asarray(frame['high'], dtype=float))
    log_low = np.log(np.asarray(frame['low'], dtype=float))
    r_5 = _finite_or_zero(log_close[5:] - log_close[:-5])
    r_1 = _finite_or_zero((log_close[1:] - log_close[:-1])[4:])
    r_range = _finite_or_zero((log_high - log_low)[5:])
    return r_1, r_5, r_range


def hmm_weight(df, data_raw, day, n_components, plot=False):
    """Fit an HMM on the training window and label the test window.

    The train/test date bounds come from ``train_test(day, df)``.  A
    diagonal-covariance GaussianHMM is fitted on the training features; each
    hidden state gets the score
    ``(mean(next_r1) * 252 - 0.03) / (std(next_r1) * sqrt(252))`` computed
    from the training rows assigned to it, where ``next_r1`` is the 1-step
    return shifted one row forward.  The test rows are then decoded and
    tagged with their state's score.

    Args:
        df: frame with 'update_date', 'open', 'high', 'low', 'close' used
            for the features.
        data_raw: frame with the same columns, used only for plotting.
        day: forwarded to ``train_test``.
        n_components: number of hidden states.
        plot: when truthy, draw the diagnostic figures.

    Returns:
        ``(prediction, highest, lowest)`` where ``prediction`` is a DataFrame
        with 'update_date', 'state' and 'expected_sharpe', and ``highest`` /
        ``lowest`` are ``(state, score)`` pairs with the largest / smallest
        non-NaN score.  ``(None, None, None)`` when either window is empty or
        has fewer rows than ``n_components``.
    """
    tr_start, tr_end, te_start, te_end = train_test(day, df)

    col_list = ['update_date', 'open', 'high', 'low', 'close']
    df = df.loc[:, col_list].dropna(axis=0)
    data_raw = data_raw.loc[:, col_list].dropna(axis=0)

    train_df = _rows_between(df, tr_start, tr_end)
    test_df = _rows_between(df, te_start, te_end)
    train_close = _rows_between(data_raw, tr_start, tr_end)
    test_close = _rows_between(data_raw, te_start, te_end)

    if len(train_df) == 0 or len(test_df) == 0:
        return None, None, None

    r_1, r_5, r_range = _hmm_features(train_df)
    r_1_test, r_5_test, r_range_test = _hmm_features(test_df)

    # 1-step return shifted one row forward; the last row has none, so 0.
    r_1_no_lag = np.append(r_1[1:], 0)

    date_list = train_df['update_date'][5:]
    date_list_test = test_df['update_date'][5:]

    X = np.column_stack([r_1, r_5, r_range])
    X_test = np.column_stack([r_1_test, r_5_test, r_range_test])

    if X.shape[0] < n_components or X_test.shape[0] < n_components:
        return None, None, None

    hmm = GaussianHMM(n_components=n_components,
                      covariance_type='diag',
                      n_iter=2000).fit(X)
    latent_states_sequence_train = hmm.predict(X)

    if plot:
        import matplotlib.pyplot as plt
        import seaborn as sns
        sns.set_style('white')
        plt.figure(figsize=(15, 8))

    mean_return_dict = {}
    for i in range(hmm.n_components):
        state = (latent_states_sequence_train == i)
        sharpe = (np.mean(r_1_no_lag[state]) * 252 - 0.03) / (
            np.std(r_1_no_lag[state]) * np.sqrt(252))
        mean_return_dict[i] = sharpe
        if plot:
            # NOTE(review): `state` has len(train_df) - 5 entries while
            # train_close['close'] is not trimmed - confirm the alignment.
            plt.plot(date_list[state],
                     train_close['close'][state],
                     'o',
                     label='latent state %d: %s' % (i, sharpe),
                     lw=5)
            plt.legend()
            plt.grid(1)
    if plot:
        plt.show()

    latent_states_sequence_test = hmm.predict(X_test)

    # Rank the states by score, ignoring states whose score is NaN.
    pair_sorted = sorted(
        [(state_id, score) for state_id, score in mean_return_dict.items()
         if not np.isnan(score)],
        key=lambda item: item[1])
    highest = pair_sorted[-1]
    lowest = pair_sorted[0]

    prediction = pd.DataFrame()
    prediction['update_date'] = date_list_test
    prediction['state'] = latent_states_sequence_test
    prediction['expected_sharpe'] = prediction['state'].apply(
        lambda x: mean_return_dict[x])

    if plot:
        sns.set_style('white')
        plt.figure(figsize=(8, 4))
        for i in range(hmm.n_components):
            state = (latent_states_sequence_test == i)
            plt.plot(date_list_test[state],
                     test_close['close'][state],
                     'o',
                     label='latent state %d: %s' % (i, mean_return_dict[i]),
                     lw=5)
            plt.grid(1)
            plt.legend()
        plt.show()

        sns.set_style('white')
        plt.figure(figsize=(15, 10))
        new_frame = copy.deepcopy(prediction)
        new_frame.index = [new_frame['update_date']]
        new_frame['expected_return'] = new_frame['expected_sharpe'].apply(
            lambda x: 30 if x > 0 else -30)
        test_close.index = [test_close['update_date']]
        test_close['close'] = test_close['close'] - 420
        test_close = test_close[np.min(new_frame['update_date']):np.max(
            new_frame['update_date'])]
        plt.plot(test_close['close'], 'o-', color='red')
        plt.bar(new_frame.index,
                new_frame['expected_return'],
                align='edge',
                alpha=0.5,
                color='yellow')
        plt.show()

    return prediction, highest, lowest
def select(self):
    """Return the fold model with the highest held-out log-likelihood.

    Uses KFold with 2 splits when there are at most two sequences and 3
    splits otherwise.  For each state count in
    [min_n_components, max_n_components] a GaussianHMM is fitted on every
    training split and scored on the matching test split; the single best
    (model, score) pair over all state counts and folds wins.  Folds that
    fail to fit or score are skipped.

    Returns:
        The best fold model, or an unfitted default ``GaussianHMM()`` when
        the data cannot be split or no fold could be scored.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    best_score = -math.inf
    # Fallback returned when nothing can be scored.
    best_model = GaussianHMM()

    # FIXME This could be achieved more eloquently: rough adjustment for the
    # scenarios encountered within the Recognizer data.
    splits = 2 if len(self.sequences) <= 2 else 3
    split_method = KFold(n_splits=splits)

    # The folds do not depend on the state count, so compute them once.  A
    # failing split (e.g. fewer sequences than folds) means no candidate can
    # be evaluated at all.
    try:
        folds = list(split_method.split(self.sequences))
    except Exception:
        return best_model

    for num_hidden_states in range(self.min_n_components,
                                   self.max_n_components + 1):
        for cv_train_idx, cv_test_idx in folds:
            try:
                # Fold indices must be turned into data with the provided
                # helper.
                X_train, X_train_lengths = combine_sequences(
                    cv_train_idx, self.sequences)
                X_test, X_test_lengths = combine_sequences(
                    cv_test_idx, self.sequences)

                model = GaussianHMM(n_components=num_hidden_states,
                                    n_iter=1000)
                model.fit(X_train, X_train_lengths)
                logL = model.score(X_test, X_test_lengths)
            except Exception:
                # This fold cannot be fitted/scored; move on to the next.
                continue

            if logL >= best_score:
                best_model, best_score = model, logL

    return best_model
import numpy as np
from hmmlearn.hmm import GaussianHMM

action_kfs = np.load('/home/user/Desktop/action_kf.npy')

# Drop the first entry of the last axis and flatten to rows of 7 features;
# the rows are treated as three sequences of six frames each.
observations = action_kfs[:, :, 1:].reshape(-1, 7)
sequence_lengths = [6] * 3

for s in [2, 3, 4, 5]:
    hmm = GaussianHMM(s)
    hmm.fit(observations, sequence_lengths)
    # Single-argument print() gives the same output on Python 2 and 3.
    print(hmm.score(observations, sequence_lengths))
    print([np.linalg.norm(c) for c in hmm.covars_])
tradeDate = pd.to_datetime(data['tradeDate'][5:]) #日期列表 volume = data['turnoverVol'][5:] #2 成交量数据 closeIndex = data['closeIndex'] # 3 收盘价数据 deltaIndex = np.log(np.array(data['highestIndex'])) - np.log( np.array(data['lowestIndex'])) #3 当日对数高低价差 deltaIndex = deltaIndex[5:] logReturn1 = np.array(np.diff(np.log(closeIndex))) #4 对数收益率 logReturn1 = logReturn1[4:] logReturn5 = np.log(np.array(closeIndex[5:])) - np.log( np.array(closeIndex[:-5])) # 5日 对数收益差 logReturnFst = np.array(np.diff(np.log(tradeVal['tradeVal'])))[4:] closeIndex = closeIndex[5:] X = np.column_stack([logReturn1, logReturn5, deltaIndex, volume, logReturnFst]) # 将几个array合成一个2Darray # Make an HMM instance and execute fit model = GaussianHMM(n_components=3, covariance_type="diag", n_iter=1000).fit([X]) # Predict the optimal sequence of internal hidden state hidden_states = model.predict(X) print hidden_states res = pd.DataFrame({ 'tradeDate': tradeDate, 'logReturn1': logReturn1, 'logReturn5': logReturn5, 'volume': volume, 'hidden_states': hidden_states }).set_index('tradeDate') for i in range(model.n_components): idx = (hidden_states == i) idx = np.append(0, idx[:-1]) #获得状态结果后第二天进行买入操作 #fast factor backtest df = res.logReturn1
x1 = np.zeros((1, dimensions)) x1_lengths = [] test_data = sorted(glob.glob(path1)) #!!!!!!!!!! select path !!!!!!!!! for i in range(16): data = np.loadtxt(test_data[i], delimiter=' ')[:, usedJoints] - 100 x1_lengths = [] x1_lengths.append(data.shape[0]) #different topology results to different results and needs different states configuration model1 = GaussianHMM( n_components=states, covariance_type='diag', ).fit(data, x1_lengths) score = 0 #loop for finding mean of the log-likelihoods in each iteration for file in test_data: data2 = np.loadtxt(file, delimiter=' ')[:, usedJoints] - 100 score_mode1 = model1.score(data2) score += score_mode1 / 16 scoreList2.append(score) winner_prob = max(scoreList2) index = scoreList2.index(max(scoreList2)) best_iter = test_data[index] stateList.append(states) winnerList.append(best_iter)
if mus[0] > mus[1]: mus = np.flipud(mus) sigmas = np.flipud(sigmas) P = np.fliplr(np.flipud(P)) hidden_states = 1 - hidden_states return hidden_states, mus, sigmas, P, logProb, samples ''' # %% Q = data.iloc[10, 6] # hidden_states, mus, sigmas, P, logProb, samples = fitHMM(Q, 100) model = GaussianHMM(n_components=4, n_iter=500).fit(np.reshape(Q, [len(Q), 1])) hidden_states = model.predict(np.reshape(Q, [len(Q), 1])) # find parameters of Gaussian HMM mus1 = np.array(model.means_) sigmas = np.array( np.sqrt( np.array([ np.diag(model.covars_[0]), np.diag(model.covars_[1]), np.diag(model.covars_[2]), np.diag(model.covars_[3]) ]))) P = np.array(model.transmat_) # %%
import warnings

# Silence the two warning categories before hmmlearn is imported.
for ignored_category in (DeprecationWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

from hmmlearn.hmm import GaussianHMM
import numpy as np

# Samples: ten two-feature observations.
X = np.array([
    [-1.03573482, -1.03573482],
    [6.62721065, 11.62721065],
    [3.19196949, 8.19196949],
    [0.38798214, 0.38798214],
    [2.56845104, 7.56845104],
    [5.03699793, 10.03699793],
    [5.87873937, 10.87873937],
    [4.27000819, -1.72999181],
    [4.02692237, -1.97307763],
    [5.7222677, 10.7222677],
])

# Train a new model over the samples.
model = GaussianHMM(n_components=3, covariance_type="diag")
model = model.fit(X)

# Build a second, unfitted model and copy the trained parameters into it.
new_model = GaussianHMM(n_components=3, covariance_type="diag")
new_model.startprob_ = model.startprob_
new_model.transmat_ = model.transmat_
new_model.means_ = model.means_

# Kept for inspection: stored covariances, public covariances, parameters.
m = model._covars_
n = model.covars_
p = model.get_params()

new_model.covars_ = model._covars_

# Decode X with the copied model.
X_N = new_model.predict(X)
print(X_N)
def MyGaussianHMM():
    """Walk through GaussianHMM on the SZIndex CSV.

    Part 1 fits a 6-state diagonal-covariance model on the first five CSV
    columns, prints its parameters and plots column 6 against the dates in
    column 5, one series per decoded hidden state.  Part 2 scores the first
    observation, Viterbi-decodes the first ten observations, and builds a
    second model whose parameters are set by hand.
    """
    from hmmlearn.hmm import GaussianHMM

    def show(label, value):
        # Same output as the Python 2 statement ``print label, value``.
        print("%s %s" % (label, value))

    # header=None: the file has no header row (header=-1 is rejected by
    # current pandas).
    df = pd.read_csv(
        "/home/ray/Documents/suibe/2017/建模/Modeling_Preparation/dataset/SZIndex.csv",
        header=None)
    X = np.array(df.iloc[:, 0:5])

    # Part 1: model unknown - learn it from the data (problem 3).
    # covariance_type:
    #   "spherical": one variance per state shared by all features
    #                (use when data is scarce)
    #   "diag":      diagonal covariance matrix (usual compromise)
    #   "full":      unrestricted covariance matrix (needs enough data)
    model = GaussianHMM(n_components=6, covariance_type="diag", n_iter=1000)
    model.fit(X)

    # Hidden state of every row.
    show("隐含状态为: ", model.predict(X))
    print("特征数目 %s" % model.n_features)
    print("隐状态数目 %s" % model.n_components)
    show("起始概率 :", model.startprob_)
    show("隐状态转移矩阵", model.transmat_)
    # Emissions are Gaussian, so the means form an
    # (n_components, n_features) matrix.
    show("混淆矩阵:均值部分", model.means_)
    show("混淆矩阵:方差部分", model.covars_)

    # Plot column 6 per hidden state.
    hidden_states = model.predict(X)
    tradeDate = df.iloc[:, 5].values
    closeIndex = df.iloc[:, 6].values
    plt.figure(figsize=(15, 8))
    for i in range(model.n_components):
        idx = (hidden_states == i)
        plt.plot_date(pd.to_datetime(tradeDate[idx]),
                      closeIndex[idx],
                      '.',
                      label='%dth hidden state' % i,
                      lw=1)
        plt.legend()
        plt.grid(1)
    plt.show()

    # Part 2: model known (problems 1 and 2), reusing the fitted model.
    # Problem 1: likelihood of the first observation.  X[:1] keeps the
    # sample two-dimensional, as score() expects a (n_samples, n_features)
    # array.
    print("某天出现该观测的概率为: %s" % np.exp(model.score(X[:1])))

    # Problem 2: most likely state path for the first ten observations.
    log_prob, state = model.decode(X[:10], algorithm="viterbi")
    show("只根据前十天,推断出最有可能的隐含状态序列为:", state)

    # Hand-specified model: 2 features, 4 hidden states.
    startprob = np.array([0.6, 0.3, 0.1, 0.0])
    # The transition matrix, note that there are no transitions possible
    # between component 1 and 3
    transmat = np.array([[0.7, 0.2, 0.0, 0.1],
                         [0.3, 0.5, 0.2, 0.0],
                         [0.0, 0.3, 0.5, 0.2],
                         [0.2, 0.0, 0.2, 0.6]])
    # The means of each component
    means = np.array([[0.0, 0.0], [0.0, 11.0], [9.0, 10.0], [11.0, -1.0]])
    # The covariance of each component
    covars = .5 * np.tile(np.identity(2), (4, 1, 1))

    model2 = GaussianHMM(n_components=4, covariance_type="full", n_iter=1000)
    model2.startprob_ = startprob
    model2.transmat_ = transmat
    model2.means_ = means
    model2.covars_ = covars
def select(self):
    """Pick the state count with the best cross-validated log-likelihood.

    The word's sequences are split into at most three folds. For every
    candidate state count between ``self.min_n_components`` and
    ``self.max_n_components`` a ``GaussianHMM`` is trained on the training
    folds and scored on the held-out fold; the candidate with the highest
    average held-out log-likelihood wins. The search stops at the first
    candidate that fails to train or score, keeping the best one seen so far.

    :return: the result of ``self.base_model(n)`` for the chosen ``n``. With
        fewer than two sequences no split is possible and ``n`` is 3; 3 is
        also used when no candidate could be scored.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    word_sequences = self.sequences
    best_num_components = 3
    # Use 3 folds unless there are fewer sequences than that.
    kfold_splits = min(len(word_sequences), 3)

    if kfold_splits < 2:
        # Cross-validation needs at least two sequences; use the default size.
        if self.verbose:
            print(
                "Sequences for {} is less than 2. Creating model with {} states."
                .format(self.this_word, best_num_components))
        return self.base_model(best_num_components)

    if kfold_splits < 3 and self.verbose:
        print("For {} using a kfold split of {}.".format(
            self.this_word, kfold_splits))

    # No random_state here: shuffling is off, so it would have no effect on
    # the folds, and recent scikit-learn raises ValueError when it is given
    # without shuffle=True.
    split_method = KFold(n_splits=kfold_splits)
    fold_indices = list(split_method.split(word_sequences))

    best_score = float("-inf")
    for num_components in range(self.min_n_components,
                                self.max_n_components + 1):
        scores = []
        training_failed = False
        for cv_train_idx, cv_test_idx in fold_indices:
            train_x, train_x_lengths = combine_sequences(
                cv_train_idx, word_sequences)
            test_x, test_x_lengths = combine_sequences(
                cv_test_idx, word_sequences)
            try:
                hmm_model = GaussianHMM(n_components=num_components,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(
                                            train_x, train_x_lengths)
                scores.append(hmm_model.score(test_x, test_x_lengths))
            except Exception:
                if self.verbose:
                    print("Model train error on {} with {} states".format(
                        self.this_word, num_components))
                # Discard this candidate.
                training_failed = True
                break

        if training_failed:
            # Stop increasing complexity since the current model failed.
            break

        avg = np.average(scores)
        if avg > best_score:
            best_score = avg
            best_num_components = num_components

    # Train the final model with the full set of data.
    return self.base_model(best_num_components)
end = '2016-10-21'  # backtest end date
ticker_name = '000001'
# `start` is defined earlier in the script, outside this fragment.
data_sz = DataAPI.MktIdxdGet(ticker=ticker_name, beginDate=start, endDate=end,
                             field=u"", pandas="1")
data = data_sz[['tradeDate', 'preCloseIndex', 'openIndex', 'lowestIndex',
                'highestIndex', 'closeIndex', 'turnoverVol', 'turnoverValue']]
print(data[0:5])

volume = data['turnoverVol']
close = data['closeIndex']
close2 = data['preCloseIndex']

# Daily log high/low range.
logDel = np.log(np.array(data['highestIndex'])) - np.log(np.array(data['lowestIndex']))
# One-day log return; kept for the return calculation later in the script.
logRet_1 = np.array(np.diff(np.log(close2)))
# Five-day log return of the closing index.
logRet_5 = np.log(np.array(close[5:])) - np.log(np.array(close[:-5]))
# Five-day log change of the turnover volume.
logVol_5 = np.log(np.array(volume[5:])) - np.log(np.array(volume[:-5]))

# Trim the series so they line up with the five-day features
# (logRet_1 is already one element shorter, hence the offset of 4).
logDel = logDel[5:]
logRet_1 = logRet_1[4:]
close = close[5:]
Date = pd.to_datetime(data['tradeDate'][5:])

# Observation matrix: one row per day, three feature columns.
A = np.column_stack([logDel, logRet_5, logVol_5])
print(A[0:2])  # check the layout

# Build the model.
n = 3  # number of hidden states
# fit() takes the 2-D (n_samples, n_features) array directly; wrapping it in
# a list was the legacy list-of-sequences API and no longer validates.
model = GaussianHMM(n_components=n, covariance_type="full", n_iter=2000).fit(A)
hidden_states = model.predict(A)

plt.figure(figsize=(14, 6))
for i in range(model.n_components):
    pos = (hidden_states == i)
    plt.plot_date(Date[pos], close[pos], 'o', label='hidden state %d' % i, lw=3)
# "left" is not a valid legend location string; "best" lets matplotlib choose.
plt.legend(loc="best")
# Extract dates, closing prices and volumes from `apple` as plain arrays.
close_column = apple["Close"]
dates = np.array(close_column.index.levels[1])
close_v = np.array(close_column.values)
volume = np.array(apple["Volume"].values)[1:]

# Day-over-day price change. The diff is one element shorter than the
# prices, so dates and prices drop their first entry to stay aligned
# (volume was already trimmed above).
diff = np.diff(close_v)
dates = dates[1:]
close_v = close_v[1:]

# Observation matrix: scaled price change and scaled volume, column-wise.
scaled_diff = scale(diff)
scaled_volume = scale(volume)
X = np.column_stack([scaled_diff, scaled_volume])

# Train a Gaussian HMM assuming 4 hidden states.
model = GaussianHMM(n_components=4, covariance_type="full", n_iter=20).fit(X)

# Decode the hidden state for every observation.
hidden_states = model.predict(X)

# Report the learned parameters, state by state.
print("Transition matrix: ", model.transmat_)
print("Means and vars of each hidden state")
for i, (state_mean, state_var) in enumerate(zip(model.means_, model.covars_)):
    print("{0}th hidden state".format(i))
    print("mean = ", state_mean)
    print("var = ", state_var)
    print()

# One subplot row per hidden state, sharing both axes.
fig, axs = plt.subplots(4, sharex=True, sharey=True)
# Price change between consecutive closes. The result has one element fewer
# than close_v, so dates and close_v are shifted by one to stay aligned.
diff = close_v[1:] - close_v[:-1]
dates = dates[1:]
close_v = close_v[1:]

# Training matrix: price change and volume as the two feature columns.
features = [diff, volume]
X = np.column_stack(features)
print(X[:3])

###############################################################################
# Run Gaussian HMM
print("fitting to HMM and decoding ...", end='')

# Create the HMM and train it on the observations.
model = GaussianHMM(n_components=5, covariance_type="diag", n_iter=1000)
model.fit(X)

# Most likely sequence of internal hidden states for X.
hidden_states = model.predict(X)

print("done\n")

###############################################################################
# Print the trained parameters
print("Transition matrix")
print(model.transmat_)
print()

print("means and vars of each hidden state")
for i in range(model.n_components):
    print("%dth hidden state" % i)
def select(self):
    """Select the best model for self.this_word based on the BIC score.

    Every state count n between self.min_n_components and
    self.max_n_components is fitted on self.X / self.lengths and scored with
    BIC = -2 * logL + p * log(N), where N is the number of rows of self.X and
    p the number of free parameters. With m features:

      * transition probabilities: n * (n - 1) (the last entry of each row
        follows from the others)
      * initial probabilities:    n - 1
      * Gaussian means:           n * m
      * diagonal variances:       n * m

    which gives p = n^2 + 2*n*m - 1. The lower the BIC the better.

    A candidate that fails to train or score is skipped, and the search stops
    once a candidate has more states than there are observations, so a model
    that already scored well is not thrown away by a later failure.

    :return: the fitted GaussianHMM with the lowest BIC, or None when no
        candidate could be fitted
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    candidates = range(self.min_n_components, self.max_n_components + 1)
    best_score = float("Inf")
    best_num_states = 2
    best_model = None

    for num_states in candidates:
        try:
            # More states than observations cannot be trained, and every
            # later candidate is larger still.
            if num_states > sum(self.lengths):
                break

            hmm_model = GaussianHMM(n_components=num_states,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(self.X, self.lengths)
            if self.verbose:
                print("model created for {} with {} states".format(self.this_word, num_states))

            # Log-likelihood of the training data under this model.
            logL = hmm_model.score(self.X, self.lengths)

            n_samples, n_features = self.X.shape
            n_params = num_states ** 2 + (2 * num_states * n_features) - 1
            BIC_score = (-2 * logL) + (n_params * math.log(n_samples))
        except Exception:
            if self.verbose:
                print("failure on {} with {} states".format(self.this_word, num_states))
            continue

        if BIC_score < best_score:
            best_score = BIC_score
            best_num_states = num_states
            # Keep the fitted model itself: refitting with the same settings
            # would only repeat the work.
            best_model = hmm_model

    if best_model is None:
        if len(candidates) > 0:
            # Candidates were tried but none could be fitted.
            return None
        # Empty search range: keep the historical fallback of a model with
        # the default number of states.
        best_model = GaussianHMM(n_components=best_num_states,
                                 covariance_type="diag",
                                 n_iter=1000,
                                 random_state=self.random_state,
                                 verbose=False).fit(self.X, self.lengths)

    if self.verbose:
        print("Best model created for {} with {} states".format(self.this_word, best_num_states))
    return best_model