예제 #1
0
    def select(self):
        """Select the model with the best DIC score for ``self.this_word``.

        For every state count in ``[self.min_n_components,
        self.max_n_components]`` a ``GaussianHMM`` is fitted on ``self.X`` /
        ``self.lengths`` and scored as::

            DIC = logL(this word) - mean(logL(every other word))

        where the mean runs over the entries of ``self.hwords`` (other than
        ``self.this_word``) that the model was able to score.  When no other
        word could be scored, DIC falls back to ``logL(this word)``.

        :return: the fitted model with the highest DIC, or ``None`` when no
            candidate could be trained.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = -float('inf')
        best_model = None

        for n_states in range(self.min_n_components,
                              self.max_n_components + 1):
            try:
                model = GaussianHMM(n_components=n_states,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(self.X, self.lengths)
                log_l_this_word = model.score(self.X, self.lengths)
            except Exception:
                # This state count cannot be trained/scored on the data;
                # try the next one.
                continue

            # Accumulate the log-likelihood of every *other* word under the
            # model trained for this word.
            other_total = 0.0
            other_count = 0
            for word, (other_X, other_lengths) in self.hwords.items():
                if word == self.this_word:
                    continue
                try:
                    other_total += model.score(other_X, other_lengths)
                except Exception:
                    # Words the model cannot score do not contribute.
                    continue
                other_count += 1

            # With no competing word scored, the penalty term vanishes
            # (this also avoids a division by zero).
            if other_count:
                dic = log_l_this_word - other_total / other_count
            else:
                dic = log_l_this_word

            # Keep only the best candidate seen so far instead of storing
            # every score.
            if dic > best_score:
                best_score = dic
                best_model = model

        return best_model
예제 #2
0
def hmmmodel(seq):
    """Fit a two-state Gaussian HMM to ``seq`` and decode it.

    Returns a ``(model, hidden_states)`` pair: the fitted model and the
    state sequence that ``predict`` assigns to the same observations.
    """
    fitted = GaussianHMM(n_components=2, n_iter=1000)
    fitted.fit(seq)
    decoded = fitted.predict(seq)
    return fitted, decoded
from hmmlearn.hmm import GaussianHMM

from convert_to_timeseries import convert_data_to_timeseries

# Load the comma-separated input file into a 2-D array
input_file = 'data_hmm.txt'
data = np.loadtxt(input_file, delimiter=',')

# Arrange data for training: only the third column (index 2) is used, as a
# single-feature observation matrix
X = np.column_stack([data[:, 2]])

# Create and train a Gaussian HMM with 4 hidden states and diagonal covariances
print "\nTraining HMM...."
num_components = 4
model = GaussianHMM(n_components=num_components,
                    covariance_type="diag",
                    n_iter=1000)
model.fit(X)

# Predict the hidden states of HMM for the training observations
hidden_states = model.predict(X)

# Report the first-feature mean and variance learned for each hidden state
# (states are printed 1-based)
print "\nMeans and variances of hidden states:"
for i in range(model.n_components):
    print "\nHidden state", i + 1
    print "Mean =", round(model.means_[i][0], 3)
    print "Variance =", round(np.diag(model.covars_[i])[0], 3)

# Generate data using model; the second value returned by sample() is discarded
num_samples = 1000
samples, _ = model.sample(num_samples)
예제 #4
0
    a = line.split()
    b = a[9:10]  # 这是选取需要读取的位数
    train_date.append(b)  # 将其添加在列表之中
    line = f.readline()
f.close()

# Column 8 of trainData with an extra dimension inserted at axis 1.
train_y = torch.unsqueeze(trainData[:, 8], 1)
# print(train_x)
# print(train_y)

# NOTE(review): the original comment here said "build a neural network with
# two hidden layers using ReLU() activations", but the code below fits a
# Gaussian HMM -- the comment appears stale.

if __name__ == "__main__":
    # Original comment: "number of iterations" (n_iter is set to 1000 below).
    train_x = features_init(train_x)
    # Fit a 3-state, diagonal-covariance Gaussian HMM on the prepared features.
    model = GaussianHMM(n_components=3, covariance_type='diag',
                        n_iter=1000).fit(train_x)
    # Disabled: persist the fitted model with pickle.
    # import pickle
    # output = open('./model/modelWorkTime_HMMOPS.pth', 'wb')
    # s = pickle.dump(model, output)
    # output.close()

    # Disabled: load the historical data and a pickled model for prediction.
    # test_x = Variable(torch.FloatTensor(predictData))
    # # The historical data must be loaded in order to normalise the test data
    # trainData = txt_to_numpy(args.train_dir, batch_n, input_data + 1)
    # # trainData = np.load(args.train_dir)
    # trainData = torch.tensor(trainData, dtype=torch.float32)
    # trainData = Variable(trainData, requires_grad=False)
    # train_x = trainData[:, :8]
    # import pickle
    #
    # input = open("./model/modelWorkTime_HMMOPS.pth", 'rb')
print(x)

import matplotlib.pyplot as plt

# Load the comma-separated dataset and keep only the third column (index 2)
# as a single-feature observation matrix.
data = np.loadtxt('datasets/data_1D.txt', delimiter=',')
X = np.column_stack([data[:, 2]])

# Plot the training series against its sample index.
plt.plot(np.arange(X.shape[0]), X[:, 0], c='black')
plt.title('Training data')
plt.show()

from hmmlearn.hmm import GaussianHMM

# Gaussian HMM with 10 hidden states and diagonal covariances.
num_components = 10
hmm = GaussianHMM(n_components=num_components,
                  covariance_type='diag',
                  n_iter=1000)

print('Training the Hidden Markov Model...')
hmm.fit(X)

# Report the first-feature mean and variance of every hidden state
# (states are printed 1-based).
print('Means and variances:')
for i in range(hmm.n_components):
    print('\nHidden state', i + 1)
    print('Mean =', round(hmm.means_[i][0], 2))
    print('Variance =', round(np.diag(hmm.covars_[i])[0], 2))

# Draw 1200 samples from the fitted model (the second value returned by
# sample() is discarded) and plot the generated series.
num_samples = 1200
generated_data, _ = hmm.sample(num_samples)
plt.plot(np.arange(num_samples), generated_data[:, 0], c='black')
plt.title('Generated data')
예제 #6
0
 def __init__(self, n_components):
     """Create the wrapped Gaussian HMM and store it on ``self.model``.

     :param n_components: number of hidden states passed to ``GaussianHMM``.
     """
     hmm_options = dict(algorithm='map', covariance_type='diag')
     self.model = GaussianHMM(n_components=n_components, **hmm_options)
예제 #7
0
    # 	# plt.subplot(411)
    # 	# plt.plot(data_T[0])
    # 	# plt.subplot(412)
    # 	# plt.plot(data_T[1])
    # 	# plt.subplot(413)
    # 	# plt.plot(data_T[2])
    # 	# plt.subplot(414)
    # 	# plt.plot(data_T[3])
    # 	# plt.show()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            leng = []
            for i in range(int(len(data) / 20)):
                leng.append(20)
            print(len(leng))
            model = GaussianHMM(n_components=n,
                                covariance_type="diag").fit(data, lengths=leng)
            joblib.dump(model, modelname)
            # model=joblib.load("a1/model3.pkl")
            f = model.n_features
            a = model.transmat_
            pi = model.startprob_
            mean = model.means_
            cov = model.covars_
            print(f)
            print(a)
            print(pi)
            print(mean)
            print(cov)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
예제 #8
0
            x3 = np.concatenate((x3, data), axis=0)
            x3_lengths.append(data.shape[0])

        for file in train_data4:
            data = np.loadtxt(file)[:, usedJoints] - 100
            x4 = np.concatenate((x4, data), axis=0)
            x4_lengths.append(data.shape[0])

        for file in train_data5:
            data = np.loadtxt(file)[:, usedJoints] - 100
            x5 = np.concatenate((x5, data), axis=0)
            x5_lengths.append(data.shape[0])

        #train Gaussian HMMs & define parameters for each gesture
        model1 = GaussianHMM(n_components=1,
                             covariance_type='full',
                             verbose=False).fit(x1[1:], x1_lengths)  #4
        model2 = GaussianHMM(n_components=3,
                             covariance_type='diag',
                             verbose=False).fit(x2[1:], x2_lengths)  #3
        model3 = GaussianHMM(n_components=4,
                             covariance_type='diag',
                             verbose=False).fit(x3[1:], x3_lengths)  #4
        model4 = GaussianHMM(n_components=1,
                             covariance_type='full',
                             verbose=False).fit(x4[1:], x4_lengths)  #4
        model5 = GaussianHMM(n_components=1,
                             covariance_type='full',
                             verbose=False).fit(x5[1:], x5_lengths)  #9

        #load test data
    # Hides deprecation warnings for sklearn
    warnings.filterwarnings('ignore')

    csv_filepath = "/Users/xuhuili/Desktop/ST451_Bayesian_Machine_Learning/Project/data/VOO.csv"
    pickle_path = "/Users/xuhuili/Desktop/ST451_Bayesian_Machine_Learning/Project/model/hmm_model_voo.pkl"
    # csv_filepath = "/Users/xuhuili/Desktop/ST451_Bayesian_Machine_Learning/Project/data/UPRO.csv"
    # pickle_path = "/Users/xuhuili/Desktop/ST451_Bayesian_Machine_Learning/Project/model/hmm_model_upro.pkl"
    # Training period: April 30th, 2011 to April 30th, 2019
    start_date = datetime.datetime(2011, 4, 29)
    end_date = datetime.datetime(2019, 4, 29)
    asset = obtain_prices_df(csv_filepath, start_date, end_date)
    rets = np.column_stack([asset["Returns"]])

    # Shows the histogram plot for the returns
    _ = plt.hist(rets)
    plt.show()

    # Create the Gaussian Hidden Markov Model and fit it
    # to the asset returns data, outputting a score
    hmm_model = GaussianHMM(n_components=2,
                            covariance_type="full",
                            n_iter=1000).fit(rets)
    print('Model Score: ', hmm_model.score(rets))

    # Plot the in-sample hidden states closing values
    plot_in_sample_hidden_states(hmm_model, asset)

    print('Picking HMM model...')
    pickle.dump(hmm_model, open(pickle_path, "wb"))
    print("...HMM model pickled.")
예제 #10
0
# 06-07

# 08-09
# 09-10
# 10-11

# 12-13
# 13-14
# 14-15

# Desired number of hidden states.
"Número de estados deseados"
Nc = 3

# Train the HMM and estimate the most likely state sequence.  The wind
# series is reshaped into a single-feature column first.
" Se entrena el HMM y se estima la serie de estados probables"
wind_leap = wind.reshape(-1, 1)
model = GaussianHMM(n_components=Nc, covariance_type="diag",
                    n_iter=1000).fit(wind_leap)
hidden_states = model.predict(wind_leap)

# State matrix of shape (27, 120); per the original note each row holds one
# year of states.  Labels are shifted from 0-based to 1-based.
" Matriz de estados, donde cada fila es un año de estados"
state_matrix = np.reshape(hidden_states, (27, 120))
state_matrix = state_matrix + 1

# Swap labels 1 and 3 in two passes, using 11 and 33 as temporary
# placeholders so the first assignment does not clobber the second.
state_matrix[state_matrix == 3] = 11
state_matrix[state_matrix == 1] = 33
# state_matrix[state_matrix == 2] = 55

state_matrix[state_matrix == 33] = 3
state_matrix[state_matrix == 11] = 1
# state_matrix[state_matrix == 55] = 5

# Dos estados
    return 0


# Download window: from the start of 2013 up to now.
start = datetime.datetime(2013, 1, 1)
# ``pd.datetime`` was only an alias of the standard ``datetime.datetime``
# class and has been removed from pandas, so the stdlib class is used
# directly.
end = datetime.datetime.today()
# NOTE(review): the 'google' data source may no longer be served by
# pandas-datareader -- confirm and switch source if the download fails.
df = web.DataReader("GOOGL", 'google', start, end)

# Date range (YYYYMMDD strings) handed to get_value_by_dates.
datestart = '20130101'
dateend = '20160101'
# dates, close_v, volume_v, high_v, open_v, low_v = get_value_by_dates(df, datestart, dateend)
# X = np.column_stack([close_v, volume_v, high_v, open_v, low_v])
X, dates, close_v, volume_v, high_v, open_v, low_v = get_value_by_dates(
    df, datestart, dateend)

# Fit a 100-state HMM with a tied covariance matrix; init_params='m' asks
# hmmlearn to initialise only the means before training.
model = GaussianHMM(n_components=100,
                    covariance_type="tied",
                    n_iter=100,
                    init_params='m',
                    verbose=True).fit(X)
hidden_states = model.predict(X)
print(hidden_states)

# print("Transition matrix")
# print(model.transmat_)
# print()

# Dump the mean vector and the covariance diagonal of every hidden state.
print("Means and vars of each hidden state")
for i in range(model.n_components):
    print("{0}th hidden state".format(i))
    print("mean = ", model.means_[i])
    print("var = ", np.diag(model.covars_[i]))
    print()
예제 #12
0
파일: tuning.py 프로젝트: ajmal017/2020-lfd
def clustering_for_features_selection(start_date, end_date):
    """Search for the cluster count whose selected features give the lowest MAE.

    For every candidate cluster count (10, 60, 110, ... below the number of
    feature columns) the features are built with
    ``make_features_for_tuning``, a ``GaussianHMM`` is fitted and validated,
    and the mean absolute error against the target price is recorded.

    Side effects:
      * ``features/clustering_features_selection_results.txt`` receives one
        line per candidate;
      * ``features/clustering_selected_features.txt`` and
        ``model_kmeans_clustering_best.pkl`` are overwritten each time a new
        best MAE is found;
      * the MAE curve is saved to
        ``features/clustering_features_selection_results.png`` and shown.

    :param start_date: forwarded to ``create_all_features``.
    :param end_date: forwarded to ``create_all_features``.
    """
    all_features_df, gold_price = create_all_features(start_date,
                                                      end_date,
                                                      is_training=False)

    n_components = 3  # TODO tuning
    input_days = 3  # TODO tuning
    n_clusters_list = list(range(10, len(all_features_df.columns), 50))
    print(n_clusters_list)

    mae_results = []
    # The context manager guarantees the results file is flushed and closed
    # even when a candidate raises.
    with open('features/clustering_features_selection_results.txt',
              'w',
              encoding='utf-8') as results_file:
        for n_cluster in n_clusters_list:
            (training_x, test_x, past_price, target_price,
             selected_features_name_list) = make_features_for_tuning(
                 all_features_df, gold_price, n_cluster, input_days)

            model = GaussianHMM(n_components)
            model.fit(training_x)

            predict = validate_model(model, test_x, past_price)
            res_mae = mean_absolute_error(target_price, predict)

            # New best so far: persist the feature list and the model.
            if not mae_results or min(mae_results) > res_mae:
                with open('features/clustering_selected_features.txt',
                          'w',
                          encoding='utf-8') as f:
                    f.write('{}, {}\n'.format(n_cluster, res_mae))
                    f.write(', '.join(selected_features_name_list))

                # TODO: fix pickle file name
                filename = 'model_kmeans_clustering_best.pkl'
                with open(filename, 'wb') as model_file:
                    pickle.dump(model, model_file)
                print('saved {}'.format(filename))

            mae_results.append(res_mae)
            print('mae for {} clusters with {}: {}'.format(
                n_cluster, len(selected_features_name_list), res_mae))
            results_file.write('mae for {} clusters: {}\n'.format(
                n_cluster, res_mae))

    if not mae_results:
        # Fewer than 11 feature columns: there is no candidate to plot.
        return

    plt.plot(n_clusters_list, mae_results, 'b-')
    plt.grid(which='both')
    plt.xticks(list(range(10, max(n_clusters_list), 50)))
    plt.yticks(list(range(0, int(max(mae_results)), 5)))
    # plt.axis([0, max(n_clusters_list), 0, max(mae_results)])
    plt.ylabel('MAE')
    plt.xlabel('number of clusters')
    # Save before show(): once the window opened by show() is closed the
    # current figure is gone and savefig() would write an empty image.
    plt.savefig('features/clustering_features_selection_results.png')
    plt.show()
예제 #13
0
 def select(self):
     """Pick a model by cross-validated log-likelihood.

     The strategy depends on how many sequences ``self.sequences`` holds:

     * more than two -- 3-fold cross-validation; the state count with the
       best average held-out log-likelihood is refitted via
       ``self.base_model``;
     * exactly two -- each sequence is used once for training and once for
       scoring; the best state count is refitted on ``self.X``;
     * exactly one -- every candidate from ``self.base_model`` is scored on
       its own training data.

     In the first two cases the search stops at the first state count that
     fails to train or score; in the single-sequence case a failing state
     count is skipped.  Failures are printed when ``self.verbose`` is set,
     and verbosity never changes which model is returned.

     :return: the best model found, or ``self.base_model(min_n_components)``
         when no candidate improved on the initial score.
     """
     warnings.filterwarnings("ignore", category=DeprecationWarning)
     warnings.filterwarnings("ignore", category=RuntimeWarning)

     def fit_hmm(n_states, X, lengths):
         # Single place for the HMM hyper-parameters used by every branch.
         return GaussianHMM(n_components=n_states,
                            covariance_type="diag",
                            n_iter=1000,
                            random_state=self.random_state,
                            verbose=False).fit(X, lengths)

     def report_failure(n_states):
         if self.verbose:
             print("failure on {} with {} states".format(
                 self.this_word, n_states))

     state_counts = range(self.min_n_components,
                          self.max_n_components + 1)
     best_model = self.base_model(self.min_n_components)
     best_value = -10000000
     word_sequences = self.sequences
     n_sequences = len(word_sequences)
     n_states = self.min_n_components

     if n_sequences > 2:
         # Pin the fold count instead of relying on the library default,
         # which differs between scikit-learn versions and can exceed the
         # number of sequences.
         split_method = KFold(n_splits=min(3, n_sequences))
         try:
             for n_states in state_counts:
                 fold_scores = []
                 for cv_train_idx, cv_test_idx in split_method.split(
                         word_sequences):
                     X_train, lengths_train = combine_sequences(
                         cv_train_idx, word_sequences)
                     X_test, lengths_test = combine_sequences(
                         cv_test_idx, word_sequences)
                     model = fit_hmm(n_states, X_train, lengths_train)
                     fold_scores.append(
                         model.score(X_test, lengths_test))
                 average_log = sum(fold_scores) / len(fold_scores)
                 if best_value < average_log:
                     best_value = average_log
                     best_model = self.base_model(n_states)
         except Exception:
             report_failure(n_states)
     elif n_sequences == 2:
         X_first, lengths_first = combine_sequences([0], word_sequences)
         X_second, lengths_second = combine_sequences([1], word_sequences)
         try:
             for n_states in state_counts:
                 model_first = fit_hmm(n_states, X_first, lengths_first)
                 model_second = fit_hmm(n_states, X_second,
                                        lengths_second)
                 # Two-way hold-out: each model is scored on the
                 # sequence it was not trained on.
                 average_log = (
                     model_first.score(X_second, lengths_second) +
                     model_second.score(X_first, lengths_first)) / 2
                 if average_log > best_value:
                     best_value = average_log
                     best_model = fit_hmm(n_states, self.X,
                                          self.lengths)
         except Exception:
             report_failure(n_states)
     elif n_sequences == 1:
         for n_states in state_counts:
             try:
                 model = self.base_model(n_states)
                 average_log = model.score(self.X, self.lengths)
             except Exception:
                 report_failure(n_states)
                 continue
             if best_value < average_log:
                 best_model = model
                 best_value = average_log
     return best_model
예제 #14
0
    def select(self):
        """Select the state count with the best cross-validated log-likelihood.

        The sequences are split with K-fold (three folds, or two when only
        two sequences exist).  For every state count in
        ``[self.min_n_components, self.max_n_components]`` a ``GaussianHMM``
        is trained on each training split and scored on the matching test
        split; the scores of the folds that succeeded are averaged.  State
        counts for which no fold could be scored are not considered.

        :return: the model fitted on the last successful fold of the best
            state count, or ``None`` when there are fewer than two sequences
            or no candidate could be scored.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # Cross-validation needs at least one training and one test sequence.
        if len(self.lengths) < 2:
            print("Number of samples is less than minimal number of kfolds")
            return None

        # Keep three folds where possible; with only two samples use two.
        split_method = KFold(n_splits=min(len(self.lengths), 3))
        all_lengths = np.array(self.lengths)

        best_avg_ll = -float('Inf')
        best_model = None

        for n_states in range(self.min_n_components,
                              self.max_n_components + 1):
            total_ll = 0.0
            scored_folds = 0
            fitted_model = None

            for cv_train_idx, cv_test_idx in split_method.split(
                    self.sequences):
                # Concatenate the frames of the selected sequences.
                X_train = np.array([
                    frame for idx in cv_train_idx
                    for frame in self.sequences[idx]
                ])
                X_test = np.array([
                    frame for idx in cv_test_idx
                    for frame in self.sequences[idx]
                ])
                len_train = all_lengths[cv_train_idx]
                len_test = all_lengths[cv_test_idx]

                try:
                    candidate = GaussianHMM(n_components=n_states,
                                            covariance_type="diag",
                                            n_iter=1000,
                                            random_state=self.random_state,
                                            verbose=False).fit(
                                                X_train, len_train)
                    fold_ll = candidate.score(X_test, len_test)
                except Exception:
                    # A failed fold contributes nothing to the average.
                    continue

                fitted_model = candidate
                total_ll += fold_ll
                scored_folds += 1

            # A state count that never produced a score must not compete:
            # treating it as an average of 0 would beat every genuine
            # (negative) log-likelihood.
            if scored_folds == 0:
                continue

            avg_ll = total_ll / scored_folds

            # Track only the best candidate so far instead of storing every
            # average.
            if avg_ll > best_avg_ll:
                best_avg_ll = avg_ll
                best_model = fitted_model

        return best_model
예제 #15
0
# Accumulators: matrix2 collects the flattened loudness values, while
# lengths / lengths2 record the length of each individual sequence.
# NOTE(review): `matrix` and `beats` are expected to be defined above this
# excerpt -- confirm.
matrix2 = []
lengths = []
lengths2 = []

a = 0
# Flatten every beat sequence into `matrix`, remembering each length so the
# sequence boundaries can be passed to the HMM.
for beat in beats:
    lengths.append(len(beat))
    matrix = matrix + beat

# Same flattening for the loudness sequences.
for beat_loudness in beat_loudnesses:
    lengths2.append(len(beat_loudness))
    matrix2 = matrix2 + beat_loudness

print("fitting to HMM and decoding ...", end="")

# Single-state spherical Gaussian HMM fitted on the flattened beats, passed
# as a one-feature column together with the per-sequence lengths.
model = GaussianHMM(n_components=1, covariance_type="spherical",
                    n_iter=1000).fit(np.atleast_2d(matrix).T, lengths)

# Read the first column of the CSV and store the difference between each
# value and the previous one (the first value is compared against 0.0).
# The context manager closes the file even if a row fails to parse, and
# newline='' is how the csv module expects its files to be opened.
with open(
        '/home/ysj/Downloads/어쿠스틱 콜라보-그대와 나, 설레임 (Feat. 소울맨)_percussive_beat.csv',
        'r',
        newline='') as f:
    csvReader = csv.reader(f)

    test = []

    beat = 0.0
    for row in csvReader:
        if not row:
            # csv.reader yields an empty list for blank lines; skip them
            # instead of failing on row[0].
            continue

        test.append(float(row[0]) - float(beat))
        beat = row[0]
예제 #16
0
파일: hmm.py 프로젝트: matrixleon18/SHUFE
ma20 = ma20[1:]
vma5 = vma5[1:]
vma10 = vma10[1:]
vma20 = vma20[1:]

# Observation matrix: one column per input series.
# NOTE(review): `open` here must be a data series defined earlier in the
# file, which shadows the builtin open() -- confirm.
X = np.column_stack([diff, open, high, close, low, volume, ma5, ma10, ma20, vma5, vma10, vma20])

# Prints the heading "observations" followed by the matrix.
print("观测值:")
print(X)

# `diff` reshaped into a single column.
diff_v = diff.reshape(-1, 1)


# Number of hidden states.
n = 4

model = GaussianHMM(n_components=n, n_iter=1000, covariance_type='full', tol=0.0001)

model = model.fit(X)

# The headings below read, in order: sample size, given number of hidden
# states, initial hidden-state probabilities (pi), state-transition matrix A,
# estimated means, estimated variances.
print("样本量:")
print(X.shape)
print("给定的隐藏特征数目:")
print(n)
print("初始的隐藏状态概率π:")
print(model.startprob_)
print("状态转移矩阵A参数:")
print(model.transmat_)
print("估计均值:")
print(model.means_)
print("估计方差:")
print(model.covars_)
예제 #17
0
                            normed=1,
                            facecolor='green',
                            alpha=0.75)

plt.show()

# Observation sequences matrix (one column per raw series)
A = np.column_stack([logDel, logRet_5, logVol_5])

# Rescaled observation sequences matrix
rescaled_A = np.column_stack(
    [rescaled_boxcox_logDel, rescaled_logRet_5, rescaled_logVol_5])

# HMM modeling based on raw observation sequences
# NOTE(review): fit() receives a one-element list of sequences while
# predict() receives the bare matrix; this looks like the older
# list-of-sequences hmmlearn API -- confirm the installed version before
# changing it.

model = GaussianHMM(n_components=3, covariance_type="full",
                    n_iter=2000).fit([A])
hidden_states = model.predict(A)
hidden_states

# Plot the hidden states: one marker series per state, taken at the
# positions where that state was decoded
plt.figure(figsize=(25, 18))
for i in range(model.n_components):
    pos = (hidden_states == i)
    plt.plot_date(Date[pos],
                  close[pos],
                  'o',
                  label='hidden state %d' % i,
                  lw=2)
    # NOTE(review): "left" may not be an accepted legend location in the
    # installed matplotlib -- verify.
    plt.legend(loc="left")

# Trading test according to the hidden states
예제 #18
0
def _zero_non_finite(values):
    """Return ``values`` as a float array with NaN/+inf/-inf replaced by 0."""
    values = np.asarray(values, dtype=float)
    return np.where(np.isfinite(values), values, 0.0)


def _rows_between(frame, start, end):
    """Return the rows of ``frame`` whose ``update_date`` lies in [start, end]."""
    dates = frame['update_date']
    return frame.loc[(dates >= start) & (dates <= end), :]


def _log_return_features(frame):
    """Return cleaned (1-step return, 5-step return, high/low range) arrays.

    All three are computed on log prices and aligned to start at row 5 of
    ``frame``, so they share the same length.
    """
    log_close = np.log(np.array(frame['close']))
    r_5 = log_close[5:] - log_close[:-5]
    r_1 = (log_close[1:] - log_close[:-1])[4:]
    r_range = (np.log(np.array(frame['high'])) -
               np.log(np.array(frame['low'])))[5:]
    return (_zero_non_finite(r_1), _zero_non_finite(r_5),
            _zero_non_finite(r_range))


def _state_sharpe(returns):
    """Return the Sharpe-style score the model assigns to a set of returns."""
    # 252 and 0.03 are presumably trading days per year and an annual
    # risk-free rate -- TODO confirm.
    return (np.mean(returns) * 252 - 0.03) / (np.std(returns) *
                                              np.sqrt(252))


def hmm_weight(df, data_raw, day, n_components, plot=False):
    """Fit an HMM on the training window and label the test window.

    The train/test date bounds come from ``train_test(day, df)``.  A
    diagonal-covariance ``GaussianHMM`` is fitted on three features of the
    training rows (1-step log return, 5-step log return, log high/low
    range); every hidden state is then scored from the *next-step* returns
    observed while the model was in that state, and the test rows are
    decoded with the same model.

    Works on both Python 2 and Python 3: the feature clean-up no longer
    relies on ``map``/``filter`` returning lists.

    :param df: frame with ``update_date``, ``open``, ``high``, ``low`` and
        ``close`` columns; used for the features.
    :param data_raw: frame with the same columns; only used for plotting.
    :param day: forwarded to ``train_test``.
    :param n_components: number of hidden states.
    :param plot: when true, draw the train states, the test states and the
        expected-return bar chart (imports matplotlib and seaborn lazily).
    :return: ``(prediction, highest, lowest)`` where ``prediction`` is a
        DataFrame with ``update_date``, ``state`` and ``expected_sharpe``
        columns and ``highest`` / ``lowest`` are ``(state, score)`` tuples;
        ``(None, None, None)`` when either window is empty or has fewer
        feature rows than ``n_components``.
    :raises IndexError: when every state score is NaN.
    """
    tr_start, tr_end, te_start, te_end = train_test(day, df)
    col_list = ['update_date', 'open', 'high', 'low', 'close']
    df = df.loc[:, col_list].dropna(axis=0)
    data_raw = data_raw.loc[:, col_list].dropna(axis=0)

    train_df = _rows_between(df, tr_start, tr_end)
    test_df = _rows_between(df, te_start, te_end)
    train_close = _rows_between(data_raw, tr_start, tr_end)
    test_close = _rows_between(data_raw, te_start, te_end)

    if len(train_df) == 0 or len(test_df) == 0:
        return None, None, None

    r_1, r_5, r_range = _log_return_features(train_df)
    r_1_test, r_5_test, r_range_test = _log_return_features(test_df)

    # Shift the 1-step returns one position forward (padding with 0) so a
    # state is judged on the return that follows it.
    r_1_no_lag = np.append(r_1[1:], 0.0)

    # Dates aligned with the feature rows (the first five rows are consumed
    # by the 5-step return).
    date_list = train_df['update_date'][5:]
    date_list_test = test_df['update_date'][5:]

    X = np.column_stack([r_1, r_5, r_range])
    X_test = np.column_stack([r_1_test, r_5_test, r_range_test])
    if X.shape[0] < n_components or X_test.shape[0] < n_components:
        return None, None, None

    hmm = GaussianHMM(n_components=n_components,
                      covariance_type='diag',
                      n_iter=2000).fit(X)
    latent_states_sequence_train = hmm.predict(X)

    mean_return_dict = {}
    for i in range(hmm.n_components):
        state = (latent_states_sequence_train == i)
        mean_return_dict[i] = _state_sharpe(r_1_no_lag[state])

    if plot:
        import matplotlib.pyplot as plt
        import seaborn as sns
        sns.set_style('white')
        plt.figure(figsize=(15, 8))
        for i in range(hmm.n_components):
            state = (latent_states_sequence_train == i)
            # NOTE(review): `state` has one entry per feature row while
            # train_close is not trimmed by five rows here -- verify the
            # two line up for the data actually passed in.
            plt.plot(date_list[state],
                     train_close['close'][state],
                     'o',
                     label='latent state %d: %s' % (i, mean_return_dict[i]),
                     lw=5)
            plt.legend()
            plt.grid(1)
        plt.show()

    latent_states_sequence_test = hmm.predict(X_test)

    # Rank the states by score, ignoring states whose score is NaN.
    pair_sorted = sorted(
        ((state_id, score) for state_id, score in mean_return_dict.items()
         if not np.isnan(score)),
        key=lambda item: item[1])
    highest = pair_sorted[-1]
    lowest = pair_sorted[0]

    prediction = pd.DataFrame()
    prediction['update_date'] = date_list_test
    prediction['state'] = latent_states_sequence_test
    prediction['expected_sharpe'] = prediction['state'].apply(
        lambda x: mean_return_dict[x])

    if plot:
        # Decoded states over the test window.
        sns.set_style('white')
        plt.figure(figsize=(8, 4))
        for i in range(hmm.n_components):
            state = (latent_states_sequence_test == i)
            plt.plot(date_list_test[state],
                     test_close['close'][state],
                     'o',
                     label='latent state %d: %s' % (i, mean_return_dict[i]),
                     lw=5)
            plt.grid(1)
            plt.legend()
        plt.show()

        # Shifted close price overlaid with a +/-30 bar per date showing
        # the sign of the expected score.
        sns.set_style('white')
        plt.figure(figsize=(15, 10))
        new_frame = copy.deepcopy(prediction)
        new_frame.index = [new_frame['update_date']]
        new_frame['expected_return'] = new_frame['expected_sharpe'].apply(
            lambda x: 30 if x > 0 else -30)
        test_close.index = [test_close['update_date']]
        test_close['close'] = test_close['close'] - 420
        test_close = test_close[np.min(new_frame['update_date']):np.
                                max(new_frame['update_date'])]
        plt.plot(test_close['close'], 'o-', color='red')
        plt.bar(new_frame.index,
                new_frame['expected_return'],
                align='edge',
                alpha=0.5,
                color='yellow')
        plt.show()

    return prediction, highest, lowest
예제 #19
0
    def select(self):
        """Return the fold model with the highest held-out log-likelihood.

        The sequences are split with K-fold (two folds for at most two
        sequences, otherwise three).  For every state count in
        ``[self.min_n_components, self.max_n_components]`` and every fold a
        ``GaussianHMM`` is trained on the training split and scored on the
        test split; the single best-scoring fold model is kept.

        NOTE(review): the winner is chosen by the best *individual* fold
        score, not by the average over folds -- confirm this is intended.

        :return: the best fold model, or an unfitted default
            ``GaussianHMM()`` when the data cannot be split or no fold
            could be trained and scored.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # Best score and model seen across all state counts and folds.
        best_score = -math.inf
        best_model = GaussianHMM()

        # FIXME This could be achieved more eloquently: rough adjustments to
        # allow code to account for scenarios encountered within Recognizer
        # data.
        splits = 2 if len(self.sequences) <= 2 else 3
        split_method = KFold(n_splits=splits)

        # The folds do not depend on the state count, so compute them once.
        # If the data cannot be split at all, no candidate can be evaluated.
        try:
            folds = list(split_method.split(self.sequences))
        except Exception:
            return best_model

        for num_hidden_states in range(self.min_n_components,
                                       self.max_n_components + 1):
            for cv_train_idx, cv_test_idx in folds:
                try:
                    # Turn the fold indices into concatenated observations
                    # plus per-sequence lengths.
                    X_train, X_train_lengths = combine_sequences(
                        cv_train_idx, self.sequences)
                    X_test, X_test_lengths = combine_sequences(
                        cv_test_idx, self.sequences)

                    model = GaussianHMM(n_components=num_hidden_states,
                                        n_iter=1000)
                    model.fit(X_train, X_train_lengths)

                    logL = model.score(X_test, X_test_lengths)
                except Exception:
                    # This fold cannot be trained or scored with this many
                    # states; move on.  Catching Exception (not everything)
                    # keeps Ctrl-C and interpreter exit working.
                    continue

                if logL >= best_score:
                    best_model, best_score = model, logL

        return best_model
예제 #20
0
import numpy as np
from hmmlearn.hmm import GaussianHMM

# Load the recorded array from disk (hard-coded, machine-specific path).
action_kfs = np.load('/home/user/Desktop/action_kf.npy')

# Drop the first entry along the last axis and flatten to rows of 7 features.
# Built once because both fit() and score() consume the same matrix.
observations = action_kfs[:, :, 1:].reshape(-1, 7)
# Second argument of fit()/score(): three sequences of six rows each.
sequence_lengths = [6] * 3

# Fit one model per candidate state count and report its score together with
# the norm of each state's covariance.
for s in [2, 3, 4, 5]:
    hmm = GaussianHMM(s)
    hmm.fit(observations, sequence_lengths)
    # Single-argument print(...) produces the same output on Python 2 and 3;
    # the former `print[...]` form fails on Python 3.
    print(hmm.score(observations, sequence_lengths))
    print([np.linalg.norm(c) for c in hmm.covars_])
예제 #21
0
# Build a feature matrix from the index data, fit a 3-state Gaussian HMM and
# label every row with a hidden state.
# `data`, `tradeVal`, `pd`, `np` and `GaussianHMM` are defined outside this
# fragment.
# Every raw series is trimmed to start at position 5 so that all features line
# up with the 5-day log return computed below.
tradeDate = pd.to_datetime(data['tradeDate'][5:])  # trading dates
volume = data['turnoverVol'][5:]  # trading volume
closeIndex = data['closeIndex']  # closing prices (trimmed further down)
deltaIndex = np.log(np.array(data['highestIndex'])) - np.log(
    np.array(data['lowestIndex']))  # same-day log(high) - log(low) spread
deltaIndex = deltaIndex[5:]
logReturn1 = np.array(np.diff(np.log(closeIndex)))  # 1-day log return
# np.diff output is one element shorter than its input, so [4:] here matches
# [5:] on the raw series.
logReturn1 = logReturn1[4:]
logReturn5 = np.log(np.array(closeIndex[5:])) - np.log(
    np.array(closeIndex[:-5]))  # 5-day log return
# 1-step log difference of tradeVal['tradeVal'], trimmed like logReturn1.
logReturnFst = np.array(np.diff(np.log(tradeVal['tradeVal'])))[4:]
closeIndex = closeIndex[5:]
X = np.column_stack([logReturn1, logReturn5, deltaIndex, volume,
                     logReturnFst])  # stack the 1-D arrays into one 2-D array
# Make an HMM instance and execute fit
# NOTE(review): fit() receives a one-element list wrapping X while predict()
# below receives X itself -- presumably an older list-of-sequences fit API;
# confirm against the installed hmmlearn version.
model = GaussianHMM(n_components=3, covariance_type="diag",
                    n_iter=1000).fit([X])
# Predict the optimal sequence of internal hidden state
hidden_states = model.predict(X)
print hidden_states
# Collect the per-row features and state labels, indexed by trading date.
res = pd.DataFrame({
    'tradeDate': tradeDate,
    'logReturn1': logReturn1,
    'logReturn5': logReturn5,
    'volume': volume,
    'hidden_states': hidden_states
}).set_index('tradeDate')
for i in range(model.n_components):
    idx = (hidden_states == i)
    # Shift the state mask forward by one row (prepend 0, drop the last
    # element): the buy happens the day after the state is observed.
    idx = np.append(0, idx[:-1])
    #fast factor backtest
    df = res.logReturn1
예제 #22
0
    # Fragment of a larger routine: `dimensions`, `path1`, `usedJoints`,
    # `states`, `scoreList2`, `stateList` and `winnerList` are defined outside
    # this view.
    # For each of the first 16 matched files, fit an HMM on that file alone,
    # then score the model against every matched file; the file whose model
    # gets the highest accumulated score is recorded as the winner.
    x1 = np.zeros((1, dimensions))

    x1_lengths = []

    # Sorted so that list positions are stable between runs.
    test_data = sorted(glob.glob(path1))  #!!!!!!!!!! select path !!!!!!!!!

    # NOTE(review): 16 is hard-coded here and in the divisor below, while the
    # inner loop iterates over all of test_data -- confirm the glob matches
    # exactly 16 files.
    for i in range(16):

        # Load file i, keep only the `usedJoints` columns, subtract 100.
        data = np.loadtxt(test_data[i], delimiter=' ')[:, usedJoints] - 100

        # Reset per file: the model is fit on a single sequence whose length
        # is the number of rows in this file.
        x1_lengths = []
        x1_lengths.append(data.shape[0])
        # A different topology gives different results and needs a different
        # configuration of states.
        model1 = GaussianHMM(
            n_components=states,
            covariance_type='diag',
        ).fit(data, x1_lengths)

        score = 0
        # Accumulate the log-likelihood of every file under this model, each
        # divided by 16.
        for file in test_data:
            data2 = np.loadtxt(file, delimiter=' ')[:, usedJoints] - 100
            score_mode1 = model1.score(data2)
            score += score_mode1 / 16
        scoreList2.append(score)
    # Highest accumulated score and the file whose model produced it.
    # NOTE(review): `index` is a position in scoreList2; it maps onto
    # test_data only if scoreList2 was empty before the loop -- confirm.
    winner_prob = max(scoreList2)
    index = scoreList2.index(max(scoreList2))
    best_iter = test_data[index]

    # Record the state count used and the winning file.
    stateList.append(states)
    winnerList.append(best_iter)
예제 #23
0
    if mus[0] > mus[1]:
        mus = np.flipud(mus)
        sigmas = np.flipud(sigmas)
        P = np.fliplr(np.flipud(P))
        hidden_states = 1 - hidden_states

    return hidden_states, mus, sigmas, P, logProb, samples

'''

# %%

# Series to model: the cell at row 10, column 6 of `data`.
Q = data.iloc[10, 6]

# Column-vector view of Q (one feature per observation), built once and shared
# by fit() and predict().
Q_column = np.reshape(Q, [len(Q), 1])

# hidden_states, mus, sigmas, P, logProb, samples = fitHMM(Q, 100)
# Fit a 4-state Gaussian HMM and assign a state to every observation.
model = GaussianHMM(n_components=4, n_iter=500)
model.fit(Q_column)
hidden_states = model.predict(Q_column)

# Extract the fitted parameters: state means, per-state standard deviations
# (square root of each covariance diagonal) and the transition matrix.
mus1 = np.array(model.means_)
state_variances = [np.diag(model.covars_[k]) for k in range(4)]
sigmas = np.array(np.sqrt(np.array(state_variances)))
P = np.array(model.transmat_)

# %%
예제 #24
0
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
from hmmlearn.hmm import GaussianHMM
import numpy as np

# Ten two-feature observations used for both training and prediction.
X = np.array([
    [-1.03573482, -1.03573482],
    [6.62721065, 11.62721065],
    [3.19196949, 8.19196949],
    [0.38798214, 0.38798214],
    [2.56845104, 7.56845104],
    [5.03699793, 10.03699793],
    [5.87873937, 10.87873937],
    [4.27000819, -1.72999181],
    [4.02692237, -1.97307763],
    [5.7222677, 10.7222677],
])

# Train a reference 3-state model with diagonal covariances.
model = GaussianHMM(n_components=3, covariance_type="diag")
model.fit(X)

# Build an untrained twin and copy the fitted parameters across.
new_model = GaussianHMM(n_components=3, covariance_type="diag")
for attr in ("startprob_", "transmat_", "means_"):
    setattr(new_model, attr, getattr(model, attr))

# Snapshots of the private and public covariance attributes and of the
# constructor parameters.
m = model._covars_
n = model.covars_
p = model.get_params()

# The copy is fed from the private `_covars_` attribute, not from the public
# `covars_` property.
new_model.covars_ = m

# Decode the training samples with the copied model and show the state path.
X_N = new_model.predict(X)

print(X_N)
예제 #25
0
def MyGaussianHMM(
        csv_path="/home/ray/Documents/suibe/2017/建模/Modeling_Preparation/dataset/SZIndex.csv"):
    """Fit a Gaussian HMM to index data, print its parameters and plot states.

    The CSV file is read without a header row. Columns 0-4 are used as the
    observation features, column 5 as the trade date and column 6 as the
    closing index value for plotting.

    The walkthrough covers the three classic HMM problems:
    learning (fit), evaluation (score) and decoding (Viterbi), and finally
    builds a second model whose parameters are supplied by hand.

    Args:
        csv_path: Location of the header-less CSV file. Defaults to the path
            that was previously hard-coded.

    Returns:
        None. Results are printed and shown in a matplotlib figure.
    """
    from hmmlearn.hmm import GaussianHMM

    # header=None means "no header row". The former header=-1 spelling is
    # rejected by current pandas releases.
    df = pd.read_csv(csv_path, header=None)
    X = np.array(df.iloc[:, 0:5])

    # Part 1: model unknown -- learn the parameters from data (problem 3).
    # covariance_type options:
    #   "spherical": one variance per state shared by all features
    #                (fewest parameters; useful when data is scarce)
    #   "diag":      a diagonal covariance matrix per state (usual compromise)
    #   "full":      an unrestricted covariance matrix per state
    #                (needs enough data to estimate)
    model = GaussianHMM(n_components=6, covariance_type="diag",
                        n_iter=1000)
    model.fit(X)

    # Hidden state of every row; computed once and reused for the plot.
    hidden_states = model.predict(X)

    # Each message is a single pre-formatted argument so the output is the
    # same whether print is a statement (Python 2) or a function (Python 3).
    print("隐含状态为:  %s" % (hidden_states,))
    print("特征数目 %s" % model.n_features)
    print("隐状态数目 %s" % model.n_components)
    print("起始概率 : %s" % (model.startprob_,))
    print("隐状态转移矩阵 %s" % (model.transmat_,))
    # Emissions are Gaussian per state, so means_ has n_components rows and
    # n_features columns.
    print("混淆矩阵:均值部分 %s" % (model.means_,))
    print("混淆矩阵:方差部分 %s" % (model.covars_,))

    # Plot the closing index, one marker series per hidden state.
    tradeDate = df.iloc[:, 5].values
    closeIndex = df.iloc[:, 6].values
    plt.figure(figsize=(15, 8))
    for i in range(model.n_components):
        idx = (hidden_states == i)
        plt.plot_date(pd.to_datetime(tradeDate[idx]),
                      closeIndex[idx],
                      '.',
                      label='%dth hidden state' % i,
                      lw=1)
    # Legend and grid only need to be drawn once, after all series exist.
    plt.legend()
    plt.grid(True)
    plt.show()

    # Part 2: model known -- evaluation (problem 1) and decoding (problem 2),
    # reusing the model fitted above.
    # Problem 1: score() expects a 2-D array, so the first observation is
    # passed as a one-row slice rather than as a 1-D vector.
    print("某天出现该观测的概率为: %s" % np.exp(model.score(X[:1])))
    # Problem 2: most likely state path for the first ten rows.
    log_prob, state = model.decode(X[:10], algorithm="viterbi")
    print("只根据前十天,推断出最有可能的隐含状态序列为: %s" % (state,))

    # Supplying the model parameters by hand: 2 features, 4 hidden states.
    startprob = np.array([0.6, 0.3, 0.1, 0.0])
    # The transition matrix, note that there are no transitions possible
    # between component 1 and 3
    transmat = np.array([[0.7, 0.2, 0.0, 0.1], [0.3, 0.5, 0.2, 0.0],
                         [0.0, 0.3, 0.5, 0.2], [0.2, 0.0, 0.2, 0.6]])
    # The means of each component
    means = np.array([[0.0, 0.0], [0.0, 11.0], [9.0, 10.0], [11.0, -1.0]])
    # The covariance of each component
    covars = .5 * np.tile(np.identity(2), (4, 1, 1))
    model2 = GaussianHMM(n_components=4, covariance_type="full", n_iter=1000)
    model2.startprob_ = startprob
    model2.transmat_ = transmat
    model2.means_ = means
    model2.covars_ = covars
예제 #26
0
    def select(self):
        """Pick the number of HMM states by k-fold cross-validation.

        For every candidate state count between ``self.min_n_components`` and
        ``self.max_n_components`` a model is trained on each training fold and
        scored (log-likelihood) on the matching test fold. The state count
        with the highest average test score wins, and the returned model is
        built from it with ``self.base_model``.

        With fewer than two sequences no split is possible, so a 3-state
        model from ``self.base_model`` is returned directly. The search stops
        at the first state count for which any fold fails to train or score.

        Returns:
            Whatever ``self.base_model`` returns for the chosen state count.
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        word_sequences = self.sequences
        best_num_components = 3
        # Use 3 folds unless there are fewer sequences than that.
        kfold_splits = min(len(word_sequences), 3)

        if kfold_splits < 2:
            if self.verbose:
                print(
                    "Sequences for {} is less than 2.  Creating model with {} states."
                    .format(self.this_word, best_num_components))
            return self.base_model(best_num_components)

        if kfold_splits < 3 and self.verbose:
            print("For {} using a kfold split of {}.".format(
                self.this_word, kfold_splits))

        # No random_state: the folds are not shuffled, so a seed has no effect
        # on the split, and recent scikit-learn raises ValueError when
        # random_state is set while shuffle is False.
        split_method = KFold(n_splits=kfold_splits)
        fold_indices = list(split_method.split(word_sequences))

        best_score = float("-inf")
        for num_components in range(self.min_n_components,
                                    self.max_n_components + 1):
            scores = []
            failed = False

            for cv_train_idx, cv_test_idx in fold_indices:
                train_x, train_x_lengths = combine_sequences(
                    cv_train_idx, word_sequences)
                test_x, test_x_lengths = combine_sequences(
                    cv_test_idx, word_sequences)
                try:
                    hmm_model = GaussianHMM(n_components=num_components,
                                            covariance_type="diag",
                                            n_iter=1000,
                                            random_state=self.random_state,
                                            verbose=False).fit(
                                                train_x, train_x_lengths)
                    scores.append(hmm_model.score(test_x, test_x_lengths))
                except Exception:
                    # Best effort: a fold that cannot be trained or scored
                    # disqualifies this state count.
                    if self.verbose:
                        print("Model train error on {} with {} states".format(
                            self.this_word, num_components))
                    failed = True
                    break

            if failed:
                # Stop increasing complexity since the current model failed.
                break

            avg = np.average(scores)
            if best_score < avg:
                best_score = avg
                best_num_components = num_components

        # Train the final model on the full set of data.
        return self.base_model(best_num_components)
예제 #27
0
# Fetch daily index data, derive three features, fit a Gaussian HMM and plot
# the closing price coloured by hidden state.
# `start`, `DataAPI`, `np`, `pd`, `plt` and `GaussianHMM` are defined outside
# this fragment.
end = '2016-10-21'  # backtest end date
ticker_name = '000001'
data_sz = DataAPI.MktIdxdGet(ticker=ticker_name, beginDate=start, endDate=end,
                             field=u"", pandas="1")
data = data_sz[['tradeDate', 'preCloseIndex', 'openIndex', 'lowestIndex',
                'highestIndex', 'closeIndex', 'turnoverVol', 'turnoverValue']]
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(data[0:5])
volume = data['turnoverVol']
close = data['closeIndex']
close2 = data['preCloseIndex']
# Same-day log(high) - log(low) spread.
logDel = np.log(np.array(data['highestIndex'])) - np.log(
    np.array(data['lowestIndex']))
# 1-day log return of the previous close; kept for later return calculations.
logRet_1 = np.array(np.diff(np.log(close2)))
# 5-day log return of the index.
logRet_5 = np.log(np.array(close[5:])) - np.log(np.array(close[:-5]))
# 5-day log change of the traded volume.
logVol_5 = np.log(np.array(volume[5:])) - np.log(np.array(volume[:-5]))
# Trim the remaining series so they line up with the 5-day features
# (np.diff already shortened logRet_1 by one element, hence [4:]).
logDel = logDel[5:]
logRet_1 = logRet_1[4:]
close = close[5:]
Date = pd.to_datetime(data['tradeDate'][5:])
# Three features per row, i.e. 3-dimensional observations.
A = np.column_stack([logDel, logRet_5, logVol_5])
print(A[0:2])  # note the row format

# build model
n = 3  # number of hidden states (the earlier note said 6, but 3 are used)
# NOTE(review): fit() receives a one-element list wrapping A while predict()
# receives A itself -- presumably an older list-of-sequences fit API; confirm
# against the installed hmmlearn version.
model = GaussianHMM(n_components=n, covariance_type="full",
                    n_iter=2000).fit([A])
hidden_states = model.predict(A)
hidden_states[0:10]

plt.figure(figsize=(14, 6))
for i in range(model.n_components):
    pos = (hidden_states == i)
    plt.plot_date(Date[pos], close[pos], 'o', label='hidden state %d' % i,
                  lw=3)
    # "left" is not a valid legend location (older matplotlib fell back to
    # "best", newer releases raise); request "best" explicitly.
    plt.legend(loc="best")
예제 #28
0
# Pull the closing prices, traded volume and date axis out of `apple`
# (`apple`, `scale`, `plt` and `GaussianHMM` are defined outside this
# fragment).
close_v = np.array(apple["Close"].values)
volume = np.array(apple["Volume"].values)[1:]
dates = np.array(apple["Close"].index.levels[1])

# Price change between consecutive rows. The result is one element shorter
# than the input, so the first element of the other series is dropped to keep
# everything aligned.
diff = np.diff(close_v)
close_v = close_v[1:]
dates = dates[1:]

# Feature matrix: scaled price change and scaled volume, one row per step.
X = np.column_stack([scale(diff), scale(volume)])

# Fit a Gaussian HMM with 4 hidden states and full covariance matrices.
model = GaussianHMM(n_components=4, covariance_type="full", n_iter=20)
model.fit(X)

# Most likely hidden state for every row of X.
hidden_states = model.predict(X)

# Report the learned parameters.
print("Transition matrix: ", model.transmat_)
print("Means and vars of each hidden state")
for i, (state_mean, state_var) in enumerate(zip(model.means_, model.covars_)):
    print("{0}th hidden state".format(i))
    print("mean = ", state_mean)
    print("var = ", state_var)
print()

# One subplot per hidden state, sharing both axes.
fig, axs = plt.subplots(4, sharex=True, sharey=True)
예제 #29
0
# Fit a 5-state Gaussian HMM to (price change, volume) rows and print the
# learned parameters. `close_v`, `dates`, `volume`, `np` and `GaussianHMM`
# are defined outside this fragment.
#
# Take the difference between consecutive closing values. The result is one
# element shorter than close_v, so dates and close_v drop their first element
# to stay aligned with it.
# NOTE(review): `volume` is not trimmed here -- presumably it was already
# shortened before this fragment; confirm.
diff = close_v[1:] - close_v[:-1]
dates = dates[1:]
close_v = close_v[1:]

# Pack diff and volume column-wise into the training matrix and show the
# first three rows as a sanity check.
X = np.column_stack([diff, volume])
print(X[0:3])
###############################################################################
# Run Gaussian HMM
# end='' keeps the cursor on this line so that "done" is printed after it.
print("fitting to HMM and decoding ...", end='')

# Make an HMM instance (5 states, diagonal covariances) and execute fit.
model = GaussianHMM(n_components=5, covariance_type="diag", n_iter=1000).fit(X)

# Predict the optimal sequence of internal hidden states.
hidden_states = model.predict(X)

print("done\n")

###############################################################################
# Print trained parameters and plot.
print("Transition matrix")
print(model.transmat_)
print()

print("means and vars of each hidden state")
# One heading per hidden state.
for i in range(model.n_components):
    print("%dth hidden state" % i)
예제 #30
0
    def select(self):
        """Select the best model for ``self.this_word`` by BIC score.

        A model is trained for every state count between
        ``self.min_n_components`` and ``self.max_n_components`` and scored with

            BIC = -2 * logL + p * log(N)

        where ``logL`` is the model log-likelihood on the training data, ``N``
        the number of samples and ``p`` the number of free parameters. The
        model with the lowest BIC is returned.

        State counts that fail to train or score are skipped instead of being
        treated as a score of 0 or aborting the whole search. The search stops
        once the state count exceeds the number of samples.

        :return: GaussianHMM object, or None if no candidate could be trained
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float("inf")
        best_num_states = None
        best_hmm_model = None
        total_samples = sum(self.lengths)

        for num_states in range(self.min_n_components, self.max_n_components + 1):
            if num_states > total_samples:
                # More states than samples; every larger count fails the same
                # check, so stop here and keep what has been found so far.
                break

            try:
                hmm_model = GaussianHMM(n_components=num_states, covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                if self.verbose:
                    print("model created for {} with {} states".format(self.this_word, num_states))
                # Log-likelihood of the training data under this model.
                logL = hmm_model.score(self.X, self.lengths)
            except Exception:
                # Best effort: a state count that cannot be trained or scored
                # is not a candidate; move on to the next one.
                if self.verbose:
                    print("failure on {} with {} states".format(self.this_word, num_states))
                continue

            # Free parameters for n states and m features:
            #   transition probabilities  n * (n - 1)  (each row sums to 1)
            #   initial probabilities     n - 1        (they sum to 1)
            #   means                     n * m
            #   diagonal variances        n * m
            # which totals n^2 + 2nm - 1.
            n_samples, n_features = self.X.shape
            n_params = num_states ** 2 + (2 * num_states * n_features) - 1

            BIC_score = (-2 * logL) + (n_params * math.log(n_samples))

            # The lower the BIC the better.
            if BIC_score < best_score:
                best_score = BIC_score
                best_num_states = num_states
                # The candidate was already trained on all of the data, so it
                # is kept as-is rather than being refitted afterwards.
                best_hmm_model = hmm_model

        if self.verbose and best_hmm_model is not None:
            print("Best model created for {} with {} states".format(self.this_word, best_num_states))

        return best_hmm_model