from assets_data_collection import read_in_assets_data
from setup_stock_data import convert_to_log_growth, calc_mean_cov_matrix_and_size

# Uses hierarchical clustering to find assets that are similar and different to
# each other based on data from the covariance matrix of the assets
# Wall-clock reference taken before any data is loaded.
start_time = time.time()

# --- Market data: seven-year lookback ending at `today` ---
# Tickers chosen from different sectors of the market.
example_assets = [
    'AAPL', 'H', 'Y', 'SNE',
    'GS', 'K', 'NKE', 'LEVI',
]
# Pinned to the date the script was last run. Switching to dt.date.today()
# may require changing parameters/tuning.
today = dt.date(2020, 9, 24)
# Approximate lookback: 7 * 365 days, so leap days are not accounted for.
lookback = dt.timedelta(days=7 * 365)
seven_years_ago = today - lookback
clustering_csv_path = (
    directory + '/Data/2013to2019_assets_data_for_clustering.csv'
)
assets_data = read_in_assets_data(
    example_assets,
    seven_years_ago,
    today,
    True,
    clustering_csv_path,
)
assets_growth_data = convert_to_log_growth(assets_data)
# The helper returns three values; keep the packed result and also unpack it.
mean_cov_matrix_and_size = calc_mean_cov_matrix_and_size(assets_growth_data)
stock_growth_means, cov_matrix, num_days = mean_cov_matrix_and_size

# --- Preprocessing ---
# Fit the scaler on the covariance matrix, then re-wrap the scaled values in a
# DataFrame that carries the original column labels.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(cov_matrix)
scaled_cov_matrix = pd.DataFrame(scaled_values, columns=cov_matrix.columns)

# --- Dendrogram figure setup ---
# Create the figure and set its title and axis labels.
plt.figure(figsize=(10, 7))
plt.xlabel('Assets')
plt.ylabel('Distance')
plt.title("Assets Dendrogram")
# Performs Bayesian inference with Markov chain Monte Carlo, using the No-U-Turn Sampler
# (NUTS) algorithm, to estimate the posterior distribution of Goldman Sachs stock data
# based on today's data with a lookback of seven years.

# See the bayesian_model.png file for the stochastic model layout/assumptions. The stochastic model was
# designed to account for some aspects of the returns, but a truly accurate model would be far more
# complex. The purpose of this is just to demonstrate how to perform Bayesian inference with pymc3.
# Wall-clock reference taken before any data is loaded.
start_time = time.time()
# Debug switch; it is not read anywhere in this part of the script.
debug = False

# Read in market data with 7 year lookback
example_assets = ['GS']
today = dt.date(2020, 9, 24)  # When I last ran the script
# today = dt.date.today() # May require changing parameters/tuning
# An approximation - 7 * 365 days doesn't account for leap years
seven_years_ago = today - dt.timedelta(days=7 * 365)
asset_data = read_in_assets_data(
    example_assets, seven_years_ago, today,
    True, directory + '/Data/gs_time_series_7_years.csv')
# Keep only the single asset's column of the converted growth data.
asset_growth_data = convert_to_log_growth(asset_data)[example_assets[0]]

# Visualize returns
asset_mu = np.mean(asset_growth_data)
print("Average GS growth over the past 7 years: " + str(asset_mu))
plt.plot(asset_growth_data)
plt.title('GS Growth Over the Past 7 years')
plt.xlabel('Time')
# The plotted series is the growth data itself, so the y-axis is labelled
# 'Growth' (it was previously mislabelled 'Variance').
plt.ylabel('Growth')
plot_filename = directory + '/Graphs/gs_growth_data.png'
plt.savefig(plot_filename, bbox_inches='tight')
# Close the figure so later plots do not draw on top of this one.
plt.close()

# Distribution fitting to find the prior distribution