def _regress_(self, kind, obs, values, min_return=False, **kw):
        """Fit the statsmodels estimator named ``kind`` and format the result.

        ``obs`` (regressors) and ``values`` (response) are coerced to
        ndarrays when they are not ndarrays already.  A constant column is
        added to ``obs`` via ``sm.add_constant``, the class ``sm.<kind>`` is
        instantiated with ``(values, design, **kw)``, fitted, and the fit
        result is passed to ``self._format_results_`` along with
        ``min_return``.
        """
        # Coerce both inputs in the same order the data is later consumed.
        obs, values = (
            item if isinstance(item, ndarray) else array(item)
            for item in (obs, values)
        )

        design = sm.add_constant(obs)
        estimator_cls = getattr(sm, kind)
        fitted = estimator_cls(values, design, **kw).fit()
        return self._format_results_(fitted, min_return)
Exemplo n.º 2
0
def GLMResults(df, outcome, predictors, adj=None, logistic=True):
    """Fit one GLM per predictor and tabulate the per-predictor results.

    For every entry of ``predictors`` a separate ``sm.GLM`` is fitted with
    ``outcome`` as the response and the predictor plus all ``adj`` columns
    (and a constant) as regressors, using only rows with no missing values
    in those columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``outcome``, ``predictors`` and ``adj`` columns.
    outcome : str
        Name of the response column.
    predictors : sequence of str
        Columns tested one at a time.
    adj : sequence of str, optional
        Adjustment columns included in every model.  Defaults to none.
    logistic : bool
        If True use a Binomial family and report exponentiated coefficients
        (column ``OR``); otherwise use a Gaussian family and report the raw
        coefficient (column ``Coef``).

    Returns
    -------
    pandas.DataFrame
        Indexed by predictor with columns ``OR``/``Coef``, ``LL``, ``UL``
        (confidence bounds), ``pvalue``, ``Diff`` (mean of the predictor
        where outcome == 1 minus mean where outcome == 0), ``N`` (rows
        used), plus ``params`` and ``pvalues`` (dicts) and ``res`` (the fit
        result, or None when fitting raised PerfectSeparationError).
    """
    # A list default would be shared between calls; normalise to a fresh list.
    adj = [] if adj is None else list(adj)

    def _identity(x):
        return x

    if logistic:
        family = sm.families.Binomial()
        coefFunc = np.exp
        cols = ['OR', 'LL', 'UL', 'pvalue', 'Diff', 'N']
    else:
        family = sm.families.Gaussian()
        coefFunc = _identity
        cols = ['Coef', 'LL', 'UL', 'pvalue', 'Diff', 'N']

    assoc = np.zeros((len(predictors), 6))
    params = []
    pvalues = []
    resObj = []
    for i, predc in enumerate(predictors):
        # De-duplicate while keeping a deterministic column order
        # (set() ordering varies between runs for strings).
        exogVars = list(dict.fromkeys([predc] + adj))
        tmp = df[[outcome] + exogVars].dropna()

        # These two columns do not depend on whether the fit succeeds.
        assoc[i, 4] = (tmp[predc].loc[tmp[outcome] == 1].mean()
                       - tmp[predc].loc[tmp[outcome] == 0].mean())
        assoc[i, 5] = tmp.shape[0]

        model = sm.GLM(endog=tmp[outcome].astype(float),
                       exog=sm.add_constant(tmp[exogVars].astype(float)),
                       family=family)
        try:
            res = model.fit()
        except sm.tools.sm_exceptions.PerfectSeparationError:
            assoc[i, 0] = np.nan
            # NOTE(review): p-value is reported as 0 (not NaN) on perfect
            # separation; presumably deliberate - confirm with callers.
            assoc[i, 3] = 0
            assoc[i, 1:3] = [np.nan, np.nan]
            params.append({name: np.nan for name in exogVars})
            pvalues.append({name: np.nan for name in exogVars})
            resObj.append(None)
            print('PerfectSeparationError: %s with %s' % (predc, outcome))
        else:
            assoc[i, 0] = coefFunc(res.params[predc])
            assoc[i, 3] = res.pvalues[predc]
            assoc[i, 1:3] = coefFunc(res.conf_int().loc[predc])
            params.append(res.params.to_dict())
            pvalues.append(res.pvalues.to_dict())
            resObj.append(res)

    outDf = pd.DataFrame(assoc[:, :6], index=predictors, columns=cols)
    outDf['params'] = params
    outDf['pvalues'] = pvalues
    outDf['res'] = resObj
    return outDf
Exemplo n.º 3
0
# Imports are gathered here so that every name (notably ``pd``) is bound
# before its first use further down.
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm

mytime = MyPackage.MyClass_Time.MyClass_Time()  # time helper class
myDA = MyPackage.MyClass_DataAnalysis.MyClass_DataAnalysis()  # data-analysis helper class
#------------------------------------------------------------
Path = "C:\\Users\\i2011\\OneDrive\\Book_Code&Data\\量化投资以python为工具\\数据及源代码"
Path2 = "C:\\Users\\i2011\\OneDrive\\Book_Code&Data\\量化投资以python为工具\\习题解答"

#1. Plot the series and regress y on x with an intercept.
x = list(range(1952, 2016, 4))
y = (29.3, 28.8, 28.5, 28.4, 29.4, 27.6, 27.7, 27.7, 27.8, 27.4, 27.8, 27.1,
     27.3, 27.1, 27.0, 27.5)
plt.plot(x, y)
plt.show()

model = sm.OLS(y, sm.add_constant(x)).fit()
print(model.summary())
data = pd.DataFrame({"x": x, "y": y})
myDA.ols("y~x", data, True)

#2. Scatter plot of FTSE against DAX from the EuStockMarkets data set.
EU = pd.read_csv(Path2 + '/Part2/005/EuStockMarkets.csv')

plt.plot(EU.DAX, EU.FTSE, '.')

plt.xlabel('DAX')

plt.ylabel('FTSE')
Exemplo n.º 4
0
# Raw string: "\w" is not a valid escape sequence, and newer Python versions
# warn about it. The path value itself is unchanged.
df = pd.read_csv(r"data\world-happiness-report-2021.csv")

y_feature = ["Ladder score"] * 6
features = [
    "Logged GDP per capita", "Social support", "Healthy life expectancy",
    "Freedom to make life choices", "Generosity", "Perceptions of corruption"
]

dependent = df["Ladder score"]
independent = df[features]

# scikit-learn fit (intercept is estimated by default).
model = LinearRegression()
model.fit(independent, dependent)

intercept = model.intercept_
coefficients = model.coef_

print("R2: ", model.score(independent, dependent))
print("Intercept: ", intercept)
print("coefficients: ", coefficients)

# statsmodels OLS does not add an intercept on its own, so the design matrix
# with the constant column must be the one handed to the model.
x = ssm.add_constant(independent)
model = ssm.OLS(dependent, x).fit()
predictions = model.summary()
print(predictions)
Exemplo n.º 5
0
def diagnostic_plots(X, y, model_fit=None):
    """
    Function to reproduce the 4 base plots of an OLS model in R.
    https://robert-alvarez.github.io/2018-06-04-diagnostic_plots/

    Four figures are created: Residuals vs Fitted, Normal Q-Q,
    Scale-Location and Residuals vs Leverage (with Cook's distance
    contours at 0.5 and 1).  In each figure the three most extreme
    observations are annotated with their index label (the position when
    the fitted values carry no pandas index).  Nothing is returned and
    ``plt.show()`` is not called.
    ---
    Inputs:

    X: A numpy array or pandas dataframe of the features to use in building the linear regression model

    y: A numpy array or pandas series/dataframe of the target variable of the linear regression model
       (when 2-D, the last column is used)

    model_fit [optional]: a statsmodel.api.OLS model after regressing y on X. If not provided, will be
                          generated from X, y
    """
    line_style = {'color': 'red', 'lw': 1, 'alpha': 0.8}

    def _graph(ax, formula, x_range, label=None):
        """Helper function for plotting cook's distance lines."""
        ax.plot(x_range, formula(x_range),
                label=label, lw=1, ls='--', color='red')

    def _top_3(scores):
        """Positions of the three largest entries of ``scores``."""
        return np.flip(np.argsort(scores), 0)[:3]

    def _annotate(ax, positions, xs, ys):
        """Label the points at ``positions`` with their index label."""
        for pos in positions:
            ax.annotate(str(labels[pos]), xy=(xs[pos], ys[pos]))

    # Compare against None explicitly rather than relying on the truthiness
    # of a results object.
    if model_fit is None:
        model_fit = sm.OLS(y, sm.add_constant(X)).fit()

    # Influence measures are computed once and reused.
    influence = model_fit.get_influence()

    # Keep the index labels for annotation, but do all numeric look-ups
    # positionally on plain arrays so that argsort positions are never
    # misread as index labels.
    labels = pd.Series(model_fit.fittedvalues).index
    model_fitted_y = np.asarray(model_fit.fittedvalues)
    model_residuals = np.asarray(model_fit.resid)
    model_norm_residuals = np.asarray(influence.resid_studentized_internal)
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
    model_leverage = np.asarray(influence.hat_matrix_diag)
    model_cooks = np.asarray(influence.cooks_distance[0])

    observed = np.asarray(y)
    if observed.ndim > 1:
        observed = observed[:, -1]

    # ---- 1. Residuals vs Fitted ------------------------------------------
    plot_lm_1 = plt.figure()
    ax1 = plot_lm_1.add_subplot(111)
    # x/y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    sns.residplot(x=model_fitted_y,
                  y=observed,
                  lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws=dict(line_style),
                  ax=ax1)
    ax1.set_title('Residuals vs Fitted')
    ax1.set_xlabel('Fitted values')
    ax1.set_ylabel('Residuals')
    _annotate(ax1, _top_3(np.abs(model_residuals)),
              model_fitted_y, model_residuals)

    # ---- 2. Normal Q-Q ---------------------------------------------------
    QQ = ProbPlot(model_norm_residuals)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    ax2 = plot_lm_2.axes[0]
    ax2.set_title('Normal Q-Q')
    ax2.set_xlabel('Theoretical Quantiles')
    ax2.set_ylabel('Standardized Residuals')
    # The theoretical quantiles are ordered like the *sorted* residuals, so
    # an observation's x-coordinate is the quantile at its rank.  (Pairing
    # the r-th largest |residual| with the r-th largest quantile would
    # misplace large negative residuals.)
    ranks = np.argsort(np.argsort(model_norm_residuals))
    quantile_of_obs = np.asarray(QQ.theoretical_quantiles)[ranks]
    _annotate(ax2, _top_3(np.abs(model_norm_residuals)),
              quantile_of_obs, model_norm_residuals)

    # ---- 3. Scale-Location -----------------------------------------------
    plot_lm_3 = plt.figure()
    ax3 = plot_lm_3.add_subplot(111)
    ax3.scatter(model_fitted_y, model_norm_residuals_abs_sqrt, alpha=0.5)
    sns.regplot(x=model_fitted_y,
                y=model_norm_residuals_abs_sqrt,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=dict(line_style),
                ax=ax3)
    ax3.set_title('Scale-Location')
    ax3.set_xlabel('Fitted values')
    ax3.set_ylabel(r'$\sqrt{|Standardized Residuals|}$')
    _annotate(ax3, _top_3(model_norm_residuals_abs_sqrt),
              model_fitted_y, model_norm_residuals_abs_sqrt)

    # ---- 4. Residuals vs Leverage ----------------------------------------
    plot_lm_4 = plt.figure()
    ax4 = plot_lm_4.add_subplot(111)
    ax4.scatter(model_leverage, model_norm_residuals, alpha=0.5)
    sns.regplot(x=model_leverage,
                y=model_norm_residuals,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws=dict(line_style),
                ax=ax4)
    max_leverage = max(model_leverage)
    ax4.set_xlim(0, max_leverage + 0.01)
    ax4.set_ylim(-3, 5)
    ax4.set_title('Residuals vs Leverage')
    ax4.set_xlabel('Leverage')
    ax4.set_ylabel('Standardized Residuals')
    # Points are ranked by Cook's distance, not by leverage alone.
    _annotate(ax4, _top_3(model_cooks), model_leverage, model_norm_residuals)

    p = len(model_fit.params)  # number of model parameters
    leverage_grid = np.linspace(0.001, max_leverage, 50)
    _graph(ax4, lambda x: np.sqrt((0.5 * p * (1 - x)) / x),
           leverage_grid, 'Cook\'s distance')  # 0.5 line
    _graph(ax4, lambda x: np.sqrt((1 * p * (1 - x)) / x),
           leverage_grid)  # 1 line
    plot_lm_4.legend(loc='upper right')