def __init__(self, cov, method='single', metric='euclidean'):
    """
    Combines the assets whose covariance matrix is given in `cov` using HRP and
    returns an object with the following attributes:
        - 'cov': covariance matrix of the returns
        - 'corr': correlation matrix of the returns
        - 'sort_ix': list of sorted column names according to cluster
        - 'link': linkage matrix of size (N-1)x4 with structure Y=[{y_m,1  y_m,2  y_m,3  y_m,4}_m=1,...,N-1].
                  At the i-th iteration, clusters with indices link[i, 0] and link[i, 1] are combined to form
                  cluster n+i. A cluster with an index less than n corresponds to one of the original
                  observations. The distance between clusters link[i, 0] and link[i, 1] is given by link[i, 2].
                  The fourth value link[i, 3] represents the number of original observations in the newly
                  formed cluster.
        - 'weights': final weights for each asset

    :param cov: pandas DataFrame. Covariance matrix of the asset returns
    :param method: any method available in scipy.cluster.hierarchy.linkage
    :param metric: any metric available in scipy.cluster.hierarchy.linkage
    """
    assert isinstance(cov, pd.DataFrame), "input 'cov' must be a pandas DataFrame"

    self.cov = cov
    self.corr, self.vols = cov2corr(cov)
    self.method = method
    self.metric = metric
    self.link = self._tree_clustering(self.corr, self.method, self.metric)
    self.sort_ix = self._get_quasi_diag(self.link)
    self.sort_ix = self.corr.index[self.sort_ix].tolist()  # recover labels
    self.sorted_corr = self.corr.loc[self.sort_ix, self.sort_ix]  # reorder correlation matrix
    self.weights = self._get_recursive_bisection(self.cov, self.sort_ix)
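# Hedged usage sketch for the HRP class above, assuming the full class (including its private
# clustering helpers) is importable from this module. 'returns_df' and the asset names are
# illustrative, not part of the original code.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
returns_df = pd.DataFrame(rng.normal(0, 0.01, size=(500, 4)),
                          columns=['Asset A', 'Asset B', 'Asset C', 'Asset D'])
hrp = HRP(cov=returns_df.cov())   # defaults: method='single', metric='euclidean'
print(hrp.weights)                # final HRP weights, indexed by asset name
print(hrp.sorted_corr)            # correlation matrix reordered by the hierarchical clustering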
def random_correlation(size, n_factors, random_seed=None):
    """
    Generates a random correlation matrix with 'size' rows and columns and 'n_factors'
    factors in the underlying correlation structure.

    :param size: int. Size of the correlation matrix
    :param n_factors: int. Number of factors in the correlation structure
    :param random_seed: int. Random seed number
    :return: numpy.array. Correlation matrix
    """
    cov = random_covariance(size, n_factors, random_seed)
    corr, _ = cov2corr(cov)
    return corr
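# Minimal usage sketch for random_correlation, assuming random_covariance and cov2corr are
# available in the same module. The seed and dimensions are illustrative.
import numpy as np

corr = random_correlation(size=10, n_factors=3, random_seed=42)
print(corr.shape)                       # (10, 10)
print(np.allclose(np.diag(corr), 1.0))  # True: unit diagonal, as expected for a correlation matrix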
def marchenko_pastur(df, bandwidth=0.1):
    """
    Uses the Marchenko-Pastur theorem to remove noisy eigenvalues from a correlation matrix.
    This code is adapted from Lopez de Prado (2020).

    :param df: pandas.DataFrame. Time series of returns.
    :param bandwidth: smoothing parameter for the KernelDensity estimation
    :return: 'cov' is the denoised covariance matrix, 'nFacts' is the number of non-random
             factors in the original correlation matrix and 'var' is the estimate of sigma**2,
             which can be interpreted as the % of noise in the original correlation matrix.
    """
    emp_cov_matrix = df.dropna().cov()
    corr_matrix, vols = cov2corr(emp_cov_matrix)
    T, N = df.dropna().shape

    # get eigenvalues and eigenvectors, sorted in descending order
    eVal, eVec = np.linalg.eigh(corr_matrix)
    indices = eVal.argsort()[::-1]
    eVal, eVec = eVal[indices], eVec[:, indices]
    eVal = np.diagflat(eVal)

    # find sigma that minimizes the error to the Marchenko-Pastur distribution
    q = T / N
    eMax, var = _find_max_eigval(np.diag(eVal), q, bWidth=bandwidth)

    # number of factors (signals)
    nFacts = eVal.shape[0] - np.diag(eVal)[::-1].searchsorted(eMax)

    # replace the noisy eigenvalues by their average and rebuild the covariance matrix
    eVal_ = np.diag(eVal).copy()
    eVal_[nFacts:] = eVal_[nFacts:].sum() / float(eVal_.shape[0] - nFacts)
    eVal_ = np.diag(eVal_)
    cov = np.dot(eVec, eVal_).dot(eVec.T)
    corr, _ = cov2corr(cov)
    cov = corr2cov(corr, vols)
    cov = pd.DataFrame(data=cov, index=df.columns, columns=df.columns)

    return cov, nFacts, var
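# Hedged example of the Marchenko-Pastur denoising above on simulated returns.
# 'returns_df' is a hypothetical DataFrame; with T=1000 observations and N=20 assets, q = T/N = 50.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
returns_df = pd.DataFrame(rng.normal(0, 0.01, size=(1000, 20)),
                          columns=[f'Asset {i}' for i in range(20)])
denoised_cov, n_facts, noise_var = marchenko_pastur(returns_df, bandwidth=0.1)
print(n_facts)     # number of eigenvalues kept as signal
print(noise_var)   # estimated sigma**2, the share of variance attributed to noise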
def targeted_shirinkage(df, bandwidth=0.1, ts_alpha=0.5):
    """
    Uses the Marchenko-Pastur theorem to find noisy eigenvalues from a correlation matrix and
    performs shrinkage only on the noisy part of the correlation matrix.
    This code is adapted from Lopez de Prado (2020).

    :param df: pandas.DataFrame. Time series of returns.
    :param bandwidth: smoothing parameter for the KernelDensity estimation
    :param ts_alpha: float. Number between 0 and 1 indicating the amount of targeted shrinkage
                     on the random eigenvectors. ts_alpha=0 means total shrinkage and
                     ts_alpha=1 means no shrinkage.
    :return: 'cov' is the denoised covariance matrix, 'nFacts' is the number of non-random
             factors in the original correlation matrix and 'var' is the estimate of sigma**2,
             which can be interpreted as the % of noise in the original correlation matrix.
    """
    assert 0 <= ts_alpha <= 1, "'ts_alpha' must be between 0 and 1."

    cov_matrix = df.dropna().cov()
    corr_matrix, vols = cov2corr(cov_matrix)
    T, N = df.dropna().shape

    # get eigenvalues and eigenvectors, sorted in descending order
    eVal, eVec = np.linalg.eigh(corr_matrix)
    indices = eVal.argsort()[::-1]
    eVal, eVec = eVal[indices], eVec[:, indices]
    eVal = np.diagflat(eVal)

    # find sigma that minimizes the error to the Marchenko-Pastur distribution
    q = T / N
    eMax, var = _find_max_eigval(np.diag(eVal), q, bWidth=bandwidth)

    # number of factors (signals)
    nFacts = eVal.shape[0] - np.diag(eVal)[::-1].searchsorted(eMax)

    # targeted shrinkage: keep the signal eigenvectors and shrink only the random ones
    eValL, eVecL = eVal[:nFacts, :nFacts], eVec[:, :nFacts]
    eValR, eVecR = eVal[nFacts:, nFacts:], eVec[:, nFacts:]
    corrL = np.dot(eVecL, eValL).dot(eVecL.T)
    corrR = np.dot(eVecR, eValR).dot(eVecR.T)
    corr = corrL + ts_alpha * corrR + (1 - ts_alpha) * np.diag(np.diag(corrR))
    cov = corr2cov(corr, vols)
    cov = pd.DataFrame(data=cov, index=df.columns, columns=df.columns)

    return cov, nFacts, var
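# Sketch comparing targeted shrinkage intensities, assuming the function above is importable.
# 'returns_df' is a hypothetical DataFrame of simulated returns; ts_alpha=0 replaces the random
# part of the correlation matrix by its diagonal, ts_alpha=1 keeps it untouched.
import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
returns_df = pd.DataFrame(rng.normal(0, 0.01, size=(1000, 20)),
                          columns=[f'Asset {i}' for i in range(20)])
cov_full_ts, n_facts, _ = targeted_shirinkage(returns_df, ts_alpha=0.0)  # total shrinkage
cov_no_ts, _, _ = targeted_shirinkage(returns_df, ts_alpha=1.0)          # no shrinkage
print(n_facts)  # number of eigenvalues treated as signal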
def shrink_cov(df, alpha=0.1):
    """
    Applies shrinkage to the covariance matrix without changing the variance of each factor.
    Unlike sklearn's shrinkage, this preserves the main diagonal of the covariance matrix,
    which makes it a more suitable method for financial data.

    :param df: pandas.DataFrame. Data frame with returns data
    :param alpha: float. A number between 0 and 1 that represents the shrinkage intensity.
    :return: Shrunk covariance matrix (pandas.DataFrame when the input covariance is one).
    """
    # TODO Example
    assert 0 <= alpha <= 1, "'alpha' must be between 0 and 1"

    cov = df.dropna().cov()
    vols = np.sqrt(np.diag(cov))
    corr, _ = cov2corr(cov)

    # shrink the correlation matrix towards the identity, then rescale back to a covariance matrix
    shrunk_corr = (1 - alpha) * corr + alpha * np.eye(corr.shape[0])
    shrunk_cov = np.diag(vols) @ shrunk_corr @ np.diag(vols)

    if isinstance(cov, pd.DataFrame):
        shrunk_cov = pd.DataFrame(data=shrunk_cov.values, index=cov.index, columns=cov.columns)

    return shrunk_cov
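# Hedged example for shrink_cov (the TODO above asks for one): shrinkage pulls the off-diagonal
# correlations toward zero while the variances on the main diagonal are preserved.
# 'returns_df' is a hypothetical DataFrame of simulated returns.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
returns_df = pd.DataFrame(rng.normal(0, 0.01, size=(250, 5)),
                          columns=[f'Asset {i}' for i in range(5)])
shrunk = shrink_cov(returns_df, alpha=0.5)
print(np.allclose(np.diag(shrunk), np.diag(returns_df.cov())))  # True: main diagonal unchanged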
# Grab total return indexes
df = tracker_feeder()
df = df[df.index >= '2010-01-01']

# Grab funding series
sgs = SGS()
df_cdi = sgs.fetch({12: 'CDI'})
df_cdi = df_cdi / 100

# Compute ERIs
df_eri = compute_eri(total_return_index=df, funding_return=df_cdi['CDI'])
df_returns = df_eri.pct_change(1).dropna()

# Empirical correlation
emp_cov = empirical_covariance(df_returns)
emp_corr, _ = cov2corr(emp_cov)
# print(emp_corr, '\n')

# Shrinkage
shrunk_cov = shrink_cov(df_returns, alpha=0.5)
shrunk_corr, _ = cov2corr(shrunk_cov)
# print(shrunk_corr, '\n')

# Marchenko-Pastur
mp_cov, _, _ = marchenko_pastur(df_returns)
mp_corr, _ = cov2corr(mp_cov)
# print(mp_corr, '\n')

# Targeted Shrinkage
ts_cov, _, _ = targeted_shirinkage(df_returns, ts_alpha=0.5)  # TODO should this reproduce the Marchenko-Pastur result?
ts_corr, _ = cov2corr(ts_cov)
# optimization
bl = BlackLitterman(sigma=mp_cov,
                    estimation_error=1 / (21 * 3),
                    views_p=P,
                    views_v=v,
                    w_equilibrium=weights_iv.loc[date].to_frame(),
                    avg_risk_aversion=1.2,
                    mu_historical=df_mom.loc[date].to_frame('Historical'),
                    mu_shrink=mu_shrink,  # needs to be tuned
                    overall_confidence=overall_confidence)  # needs to be tuned

vol_bl = pd.Series(data=np.sqrt(np.diag(bl.sigma_bl)), index=bl.sigma_bl.index)
corr_bl, _ = cov2corr(bl.sigma_bl)  # cov2corr returns (corr, vols); keep only the correlation matrix

mkw = MaxSharpe(mu=bl.mu_bl,
                sigma=vol_bl,
                corr=corr_bl,
                rf=(1 + df_libor.loc[date, 'US 3m LIBOR']) ** 0.25 - 1,  # annual LIBOR to quarterly rate
                risk_aversion=1.2)

weights_mpbl.loc[date] = mkw.risky_weights
next_rebalance_date = date + pd.DateOffset(months=1)

next_rebalance_date = start_date
for date in tqdm(calendar, 'HRP'):
    if date >= next_rebalance_date:
        try:
            hrp = HRP(cov=df_cov.loc[date])
def fit(self, fit_iter=100, n_states=None, max_state_number=8, select_iter=10):

    if n_states is None:
        self.n_states = self.select_order(max_state_number=max_state_number,
                                          select_iter=select_iter,
                                          show_chart=False)
    else:
        self.n_states = n_states

    # Estimate the model several times, due to instability, and grab the one with the highest score.
    model_dict = dict()
    for _ in tqdm(range(fit_iter), 'Estimating HMM'):
        model = hmm.GaussianHMM(n_components=self.n_states, covariance_type='full', n_iter=1000)
        model.fit(self.returns)
        model_dict[model.score(self.returns)] = model

    chosen_model = model_dict[max(model_dict.keys())]
    sort_order = np.flip(np.argsort(np.diag(chosen_model.transmat_)))  # most persistent state first

    # Build the sorted model
    sorted_model = hmm.GaussianHMM(n_components=self.n_states, covariance_type='full')
    sorted_model.startprob_ = chosen_model.startprob_[sort_order]
    sorted_model.transmat_ = pd.DataFrame(chosen_model.transmat_).loc[sort_order, sort_order].values
    sorted_model.means_ = chosen_model.means_[sort_order, :]
    sorted_model.covars_ = chosen_model.covars_[sort_order, :, :]

    try:
        column_labels = self.returns.columns
        time_index = self.returns.index
    except AttributeError:
        column_labels = [f'Asset {s + 1}' for s in range(self.n_var)]
        time_index = range(self.returns.shape[0])

    self.score = sorted_model.score(self.returns)

    self.trans_mat = pd.DataFrame(data=sorted_model.transmat_,
                                  index=[f'From State {s + 1}' for s in range(self.n_states)],
                                  columns=[f'To State {s + 1}' for s in range(self.n_states)])

    self.avg_duration = pd.Series(data=1 / (1 - np.diag(sorted_model.transmat_)),
                                  index=[f'State {s + 1}' for s in range(self.n_states)],
                                  name='Average Duration')

    self.stationary_dist = pd.Series(data=sorted_model.get_stationary_distribution(),
                                     index=[f'State {s + 1}' for s in range(self.n_states)],
                                     name='Stationary Distribution of States')

    self.means = pd.DataFrame(data=sorted_model.means_,
                              index=[f'State {s + 1}' for s in range(self.n_states)],
                              columns=column_labels)

    vol_data = [list(np.sqrt(np.diag(sorted_model.covars_[ss]))) for ss in range(self.n_states)]
    self.vols = pd.DataFrame(data=vol_data,
                             columns=column_labels,
                             index=[f'State {s + 1}' for s in range(self.n_states)])

    idx = pd.MultiIndex.from_product([[f'State {s + 1}' for s in range(self.n_states)], column_labels])
    self.covars = pd.DataFrame(index=idx,
                               columns=column_labels,
                               data=sorted_model.covars_.reshape(-1, self.n_var))

    corr_data = [cov2corr(sorted_model.covars_[ss])[0] for ss in range(self.n_states)]
    self.corrs = pd.DataFrame(index=idx,
                              columns=column_labels,
                              data=np.concatenate(corr_data))

    self.predicted_state = pd.Series(data=sorted_model.predict(self.returns) + 1,
                                     index=time_index,
                                     name='Predicted State')

    freq_data = ('State ' + self.predicted_state.astype(str)).value_counts() / self.predicted_state.count()
    self.state_freq = pd.Series(data=freq_data,
                                index=[f'State {s + 1}' for s in range(self.n_states)],
                                name='State Frequency')

    self.state_probs = pd.DataFrame(data=sorted_model.predict_proba(self.returns),
                                    index=time_index,
                                    columns=[f'State {s + 1}' for s in range(self.n_states)])
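# Hedged usage sketch for the fit method above. 'GaussianHMMRegimes' is a hypothetical name for
# the enclosing class, and the constructor signature (a DataFrame of returns) is an assumption;
# only the attributes set inside fit() come from the code above.
import numpy as np
import pandas as pd

rng = np.random.default_rng(7)
returns_df = pd.DataFrame(rng.normal(0, 0.01, size=(750, 3)),
                          columns=['Equities', 'Rates', 'FX'])
regime_model = GaussianHMMRegimes(returns_df)  # hypothetical constructor
regime_model.fit(fit_iter=20, n_states=2)      # fixing n_states skips the select_order step
print(regime_model.trans_mat)                  # transition matrix, states sorted by persistence
print(regime_model.avg_duration)               # expected duration of each state, in periods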