Example #1
    def __init__(self, cov, method='single', metric='euclidean'):
        """
        Combines the assets described by the covariance matrix `cov` using HRP and
        returns an object with the following attributes:
            - 'cov': covariance matrix of the returns
            - 'corr': correlation matrix of the returns
            - 'sort_ix': list of sorted column names according to cluster
            - 'link': linkage matrix of size (N-1)x4 with structure Y = [{y_m,1, y_m,2, y_m,3, y_m,4}]_{m=1,...,N-1}.
                      At the i-th iteration, clusters with indices link[i, 0] and link[i, 1] are combined to form
                      cluster N + i. A cluster with an index less than N corresponds to one of the original observations.
                      The distance between clusters link[i, 0] and link[i, 1] is given by link[i, 2]. The fourth value
                      link[i, 3] represents the number of original observations in the newly formed cluster.
            - 'weights': final weights for each asset
        :param cov: pandas DataFrame with the covariance matrix of the asset returns
        :param method: any method available in scipy.cluster.hierarchy.linkage
        :param metric: any metric available in scipy.cluster.hierarchy.linkage
        """

        assert isinstance(
            cov, pd.DataFrame), "input 'cov' must be a pandas DataFrame"

        self.cov = cov
        self.corr, self.vols = cov2corr(cov)
        self.method = method
        self.metric = metric

        self.link = self._tree_clustering(self.corr, self.method, self.metric)
        self.sort_ix = self._get_quasi_diag(self.link)
        self.sort_ix = self.corr.index[self.sort_ix].tolist()  # recover labels
        self.sorted_corr = self.corr.loc[
            self.sort_ix, self.sort_ix]  # reorder correlation matrix
        self.weights = self._get_recursive_bisection(self.cov, self.sort_ix)
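
A minimal usage sketch for the class above (not part of the original source): it assumes the helper functions used by HRP are available and that `df_returns` is a hypothetical DataFrame of asset returns.

# df_returns: hypothetical DataFrame of asset returns, one column per asset
cov = df_returns.cov()                                   # sample covariance as a pandas DataFrame
hrp = HRP(cov=cov, method='single', metric='euclidean')

print(hrp.sort_ix)   # asset labels reordered by the hierarchical clustering
print(hrp.weights)   # final HRP weights, one per asset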
Example #2
def random_correlation(size, n_factors, random_seed=None):
    """
    Generates a random correlation matrix with 'size' rows and columns and
    'n_factors' factors in the underlying correlation structure.
    :param size: int. Size of the correlation matrix
    :param n_factors: int. number of factors in the correlation structure
    :param random_seed: int. random seed number
    :return: numpy.array. correlation matrix
    """
    cov = random_covariance(size, n_factors, random_seed)
    corr, _ = cov2corr(cov)
    return corr
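
As a quick sanity check of the function above (a sketch; assumes numpy is imported and `random_covariance` is available):

corr = random_correlation(size=10, n_factors=3, random_seed=42)
assert corr.shape == (10, 10)
assert np.allclose(np.diag(corr), 1.0)  # a correlation matrix has a unit diagonal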
Example #3
def marchenko_pastur(df, bandwidth=0.1):
    """
    Uses the Marchenko-Pastur theorem to remove noisy eigenvalues from a correlation matrix.
    This code is adapted from Lopez de Prado (2020).
    :param df: pandas.DataFrame. Time series of returns.
    :param bandwidth: smoothing parameter for the KernelDensity estimation
    :return: 'cov' is the denoised covariance matrix, 'nFacts' is the number of non-random
             factors in the original correlation matrix and 'var' is the estimate of sigma**2,
             which can be interpreted as the % of noise in the original correlation matrix.
    """

    emp_cov_matrix = df.dropna().cov()
    corr_matrix, vols = cov2corr(emp_cov_matrix)
    T, N = df.dropna().shape

    # get eigenvalues and eigenvectors
    eVal, eVec = np.linalg.eigh(corr_matrix)
    indices = eVal.argsort()[::-1]
    eVal, eVec = eVal[indices], eVec[:, indices]
    eVal = np.diagflat(eVal)

    # find sigma that minimizes the error to the Marchenko-Pastur distribution
    q = T / N
    eMax, var = _find_max_eigval(np.diag(eVal), q, bWidth=bandwidth)

    # number of factors (signals)
    nFacts = eVal.shape[0] - np.diag(eVal)[::-1].searchsorted(eMax)

    eVal_ = np.diag(eVal).copy()
    eVal_[nFacts:] = eVal_[nFacts:].sum() / float(eVal_.shape[0] - nFacts)
    eVal_ = np.diag(eVal_)
    cov = np.dot(eVec, eVal_).dot(eVec.T)

    corr, _ = cov2corr(cov)
    cov = corr2cov(corr, vols)

    cov = pd.DataFrame(data=cov, index=df.columns, columns=df.columns)

    return cov, nFacts, var
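
For reference, `eMax` above plays the role of the Marchenko-Pastur upper edge: in the standard result, the eigenvalues of a purely random correlation matrix with q = T/N lie below sigma**2 * (1 + sqrt(1/q))**2. Eigenvalues above `eMax` are therefore kept as signal, while the remaining ones are replaced by their average, which preserves the trace of the correlation matrix.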
Example #4
def targeted_shirinkage(df, bandwidth=0.1, ts_alpha=0.5):
    """
    Uses the Marchenko-Pastur theorem to find noisy eigenvalues from a correlation matrix and
    performs shrinkage only on the noisy part of the correlation matrix. This code is adapted
    from Lopez de Prado (2020).
    :param df: pandas.DataFrame. Time series of returns.
    :param bandwidth: smoothing parameter for the KernelDensity estimation
    :param ts_alpha: float. Number between 0 and 1 indicating the amount of targeted shrinkage
                     on the random eigenvectors. ts_alpha=0 means total shrinkage and ts_alpha=1
                     means no shrinkage.
    :return: 'cov' is the denoised covariance matrix, 'nFacts' is the number of non-random
             factors in the original correlation matrix and 'var' is the estimate of sigma**2,
             which can be interpreted as the % of noise in the original correlation matrix.
    """

    assert 0 <= ts_alpha <= 1, "'ts_alpha' must be between 0 and 1."

    cov_matrix = df.dropna().cov()
    corr_matrix, vols = cov2corr(cov_matrix)
    T, N = df.dropna().shape

    # get eigenvalues and eigenvectors
    eVal, eVec = np.linalg.eigh(corr_matrix)
    indices = eVal.argsort()[::-1]
    eVal, eVec = eVal[indices], eVec[:, indices]
    eVal = np.diagflat(eVal)

    # find sigma that minimizes the error to the Marchenko-Pastur distribution
    q = T / N
    eMax, var = _find_max_eigval(np.diag(eVal), q, bWidth=bandwidth)

    # number of factors (signals)
    nFacts = eVal.shape[0] - np.diag(eVal)[::-1].searchsorted(eMax)

    # targeted shrinkage
    eValL, eVecL = eVal[:nFacts, :nFacts], eVec[:, :nFacts]
    eValR, eVecR = eVal[nFacts:, nFacts:], eVec[:, nFacts:]
    corrL = np.dot(eVecL, eValL).dot(eVecL.T)
    corrR = np.dot(eVecR, eValR).dot(eVecR.T)
    corr = corrL + ts_alpha * corrR + (1 - ts_alpha) * np.diag(np.diag(corrR))

    cov = corr2cov(corr, vols)

    cov = pd.DataFrame(data=cov, index=df.columns, columns=df.columns)

    return cov, nFacts, var
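
The two extremes of `ts_alpha` can be illustrated with a short sketch (assuming `df_returns` is a hypothetical DataFrame of asset returns):

# ts_alpha=1: the noise block of the correlation matrix is kept untouched (no shrinkage)
cov_keep, n_facts, var = targeted_shirinkage(df_returns, ts_alpha=1.0)

# ts_alpha=0: the noise block is reduced to its diagonal (total shrinkage of the random part)
cov_diag, _, _ = targeted_shirinkage(df_returns, ts_alpha=0.0)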
Example #5
def shrink_cov(df, alpha=0.1):
    """
    Applies shrinkage to the covariance matrix without changing the variance of each factor. This
    method differs from sklearn's shrinkage in that it preserves the main diagonal of the covariance
    matrix, making it better suited to financial data.
    :param df: pandas.DataFrame. Data frame with returns data
    :param alpha: float. A number between 0 and 1 that represents the shrinkage intensity.
    :return: pandas.DataFrame. Shrunk covariance matrix.
    """
    # TODO Example

    assert 0 <= alpha <= 1, "'alpha' must be between 0 and 1"

    cov = df.dropna().cov()
    vols = np.sqrt(np.diag(cov))
    corr, _ = cov2corr(cov)
    shrunk_corr = (1 - alpha) * corr + alpha * np.eye(corr.shape[0])
    shrunk_cov = np.diag(vols) @ shrunk_corr @ np.diag(vols)

    if isinstance(cov, pd.DataFrame):
        shrunk_cov = pd.DataFrame(data=shrunk_cov.values, index=cov.index, columns=cov.columns)

    return shrunk_cov
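
A usage sketch for the TODO above (assuming numpy is imported and `df_returns` is a hypothetical DataFrame of asset returns):

shrunk = shrink_cov(df_returns, alpha=0.2)

# Only the off-diagonal terms are pulled toward zero; the variances on the diagonal are preserved
sample_cov = df_returns.dropna().cov()
assert np.allclose(np.diag(shrunk), np.diag(sample_cov))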
Example #6
# Grab total return indexes
df = tracker_feeder()
df = df[df.index >= '2010-01-01']

# Grab funding series
sgs = SGS()
df_cdi = sgs.fetch({12: 'CDI'})
df_cdi = df_cdi / 100

# Compute ERIs
df_eri = compute_eri(total_return_index=df, funding_return=df_cdi['CDI'])
df_returns = df_eri.pct_change(1).dropna()

# Correlation
emp_cov = empirical_covariance(df_returns)
emp_corr, _ = cov2corr(emp_cov)
# print(emp_corr, '\n')

# Shrinkage
shrunk_cov = shrink_cov(df_returns, alpha=0.5)
shrunk_corr, _ = cov2corr(shrunk_cov)
# print(shrunk_corr, '\n')

# Marchenko-Pastur
mp_cov, _, _ = marchenko_pastur(df_returns)
mp_corr, _ = cov2corr(mp_cov)
# print(mp_corr, '\n')

# Targeted Shrinkage
ts_cov, _, _ = targeted_shirinkage(df_returns, ts_alpha=0.5)  # TODO should this match the MP result?
ts_corr, _ = cov2corr(ts_cov)
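
One simple way to compare the estimators above is to check how strongly each one pulls the off-diagonal correlations toward zero (a sketch, assuming numpy is imported):

for name, c in [('Empirical', emp_corr), ('Shrunk', shrunk_corr),
                ('Marchenko-Pastur', mp_corr), ('Targeted Shrinkage', ts_corr)]:
    c = np.asarray(c)
    off_diag = c[~np.eye(c.shape[0], dtype=bool)]
    print(f'{name}: mean absolute off-diagonal correlation = {np.abs(off_diag).mean():.3f}')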
Example #7
        # optimization
        bl = BlackLitterman(
            sigma=mp_cov,
            estimation_error=1 / (21 * 3),
            views_p=P,
            views_v=v,
            w_equilibrium=weights_iv.loc[date].to_frame(),
            avg_risk_aversion=1.2,
            mu_historical=df_mom.loc[date].to_frame('Historical'),
            mu_shrink=mu_shrink,  # needs to be tuned
            overall_confidence=overall_confidence)  # needs to be tuned

        vol_bl = pd.Series(data=np.sqrt(np.diag(bl.sigma_bl)),
                           index=bl.sigma_bl.index)
        corr_bl, _ = cov2corr(bl.sigma_bl)

        mkw = MaxSharpe(mu=bl.mu_bl,
                        sigma=vol_bl,
                        corr=corr_bl,
                        rf=(1 + df_libor.loc[date, 'US 3m LIBOR'])**0.25 - 1,
                        risk_aversion=1.2)

        weights_mpbl.loc[date] = mkw.risky_weights
        next_rebalance_date = date + pd.DateOffset(months=1)

next_rebalance_date = start_date
for date in tqdm(calendar, 'HRP'):
    if date >= next_rebalance_date:
        try:
            hrp = HRP(cov=df_cov.loc[date])
Example #8
    def fit(self, fit_iter=100, n_states=None, max_state_number=8, select_iter=10):

        if n_states is None:
            self.n_states = self.select_order(max_state_number=max_state_number,
                                              select_iter=select_iter,
                                              show_chart=False)
        else:
            self.n_states = n_states

        # Estimate the model several times, due to instability, and grab the one with the highest score.
        model_dict = dict()
        for _ in tqdm(range(fit_iter), 'Estimating HMM'):
            model = hmm.GaussianHMM(n_components=self.n_states,
                                    covariance_type='full',
                                    n_iter=1000)
            model.fit(self.returns)
            model_dict[model.score(self.returns)] = model

        chosen_model = model_dict[max(model_dict.keys())]
        sort_order = np.flip(np.argsort(np.diag(chosen_model.transmat_)))

        # Build the sorted model
        sorted_model = hmm.GaussianHMM(n_components=self.n_states,
                                       covariance_type='full')

        sorted_model.startprob_ = chosen_model.startprob_[sort_order]
        sorted_model.transmat_ = pd.DataFrame(chosen_model.transmat_).loc[sort_order, sort_order].values
        sorted_model.means_ = chosen_model.means_[sort_order, :]
        sorted_model.covars_ = chosen_model.covars_[sort_order, :, :]

        try:
            column_labels = self.returns.columns
            time_index = self.returns.index
        except AttributeError:
            column_labels = [f'Asset {s + 1}' for s in range(self.n_var)]
            time_index = range(self.returns.shape[0])

        self.score = sorted_model.score(self.returns)

        self.trans_mat = pd.DataFrame(data=sorted_model.transmat_,
                                      index=[f'From State {s + 1}' for s in range(self.n_states)],
                                      columns=[f'To State {s + 1}' for s in range(self.n_states)])

        self.avg_duration = pd.Series(data=1 / (1 - np.diag(sorted_model.transmat_)),
                                      index=[f'State {s + 1}' for s in range(self.n_states)],
                                      name='Average Duration')
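
        # Note: the average duration above follows from the geometric sojourn time of a Markov chain:
        # if a state persists each step with probability p, the expected number of consecutive steps
        # spent in it is sum_{k>=1} k * p**(k-1) * (1 - p) = 1 / (1 - p).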

        self.stationary_dist = pd.Series(data=sorted_model.get_stationary_distribution(),
                                         index=[f'State {s + 1}' for s in range(self.n_states)],
                                         name='Stationary Distribution of States')

        self.means = pd.DataFrame(data=sorted_model.means_,
                                  index=[f'State {s + 1}' for s in range(self.n_states)],
                                  columns=column_labels)

        vol_data = [list(np.sqrt(np.diag(sorted_model.covars_[ss]))) for ss in range(self.n_states)]
        self.vols = pd.DataFrame(data=vol_data, columns=column_labels,
                                 index=[f'State {s + 1}' for s in range(self.n_states)])

        idx = pd.MultiIndex.from_product([[f'State {s + 1}' for s in range(self.n_states)],
                                          column_labels])
        self.covars = pd.DataFrame(index=idx, columns=column_labels,
                                   data=sorted_model.covars_.reshape(-1, self.n_var))

        corr_data = [cov2corr(sorted_model.covars_[ss])[0] for ss in range(self.n_states)]
        self.corrs = pd.DataFrame(index=idx, columns=column_labels,
                                  data=np.concatenate(corr_data))

        self.predicted_state = pd.Series(data=sorted_model.predict(self.returns) + 1,
                                         index=time_index,
                                         name='Predicted State')

        freq_data = ('State ' + self.predicted_state.astype(str)).value_counts() / self.predicted_state.count()
        self.state_freq = pd.Series(data=freq_data,
                                    index=[f'State {s + 1}' for s in range(self.n_states)],
                                    name='State Frequency')

        self.state_probs = pd.DataFrame(data=sorted_model.predict_proba(self.returns),
                                        index=time_index,
                                        columns=[f'State {s + 1}' for s in range(self.n_states)])