示例#1
0
def rmse(data, yvar, xvars):
    """Placeholder for an OLS residual-volatility helper; currently always raises.

    The statements after the ``raise`` sketch the intended computation: fit
    ``data[yvar]`` on ``data[xvars]`` plus an ``intercept`` column and return a
    Series holding the square root of the residual mean squared error
    (``ivol``) and the observation count (``n``). They are unreachable while
    the ``raise`` is in place.

    Raises:
        NotImplementedError: unconditionally.
    """
    raise NotImplementedError('No dsf support yet.')
    # Unreachable until the raise above is removed.
    response = data[yvar]
    design = data[xvars]
    design['intercept'] = 1.
    fitted = sm.OLS(response, design).fit()
    stats = {'ivol': sqrt(fitted.mse_resid), 'n': fitted.nobs}
    return pd.Series(stats)
示例#2
0
def get_beta(values1, values2):
    """Return the first coefficient of an OLS fit of ``values1`` on ``values2``.

    ``values2`` is handed to ``sm.OLS`` exactly as given; no constant column
    is added here.
    """
    # http://statsmodels.sourceforge.net/stable/regression.html
    fitted = sm.OLS(values1, values2).fit()
    return fitted.params[0]
    # NOTE(review): the statements below follow the return at function
    # indentation, so they never execute. They look like a usage demo --
    # confirm whether they were meant to live at module level.
    value1 = [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
    value2 = [1.75, 2.45, 3.81, 4.80, 7.00, 8.60]
    print(get_beta(value1, value2))
    def fit(self, data_frame: DataFrame):
        """Fit ``data_frame.experiment`` by OLS on three temperature-power terms.

        Stores on the instance:
          * ``data_frame``: the input frame.
          * ``source_matrix``: one column per exponent k in (-1, 1, 2), each
            ``temperature**k - reference_temperature**k``.
          * ``aux_fit`` / ``fit_coefficients``: the OLS results and its params.
          * ``fit``: ``source_matrix`` times the coefficients.
          * ``heat_capacity_matrix``: columns ``k * temperature**(k - 1)``.
          * ``fit_heat_capacity``: that matrix times the same coefficients.
        """
        self.data_frame = data_frame

        exponents = (-1, 1, 2)
        temperature = data_frame.temperature
        reference = data_frame.reference_temperature

        difference_rows = [temperature**k - reference**k for k in exponents]
        self.source_matrix = np.vstack(difference_rows).T

        ols_model = statsmodels.OLS(data_frame.experiment, self.source_matrix)
        self.aux_fit = ols_model.fit()
        self.fit_coefficients = self.aux_fit.params

        # NOTE(review): this rebinds the name ``fit`` on the instance to an
        # array, so ``instance.fit(...)`` is no longer callable afterwards --
        # confirm that callers rely on reading ``instance.fit`` as data.
        self.fit = np.dot(self.source_matrix, self.fit_coefficients)

        derivative_rows = [k * temperature**(k - 1) for k in exponents]
        self.heat_capacity_matrix = np.vstack(derivative_rows).T
        self.fit_heat_capacity = np.dot(self.heat_capacity_matrix,
                                        self.fit_coefficients)
示例#4
0
mytime = MyPackage.MyClass_Time.MyClass_Time()  # time utilities class
myDA = MyPackage.MyClass_DataAnalysis.MyClass_DataAnalysis()  # data analysis class
#------------------------------------------------------------
Path = "C:\\Users\\i2011\\OneDrive\\Book_Code&Data\\量化投资以python为工具\\数据及源代码"
Path2 = "C:\\Users\\i2011\\OneDrive\\Book_Code&Data\\量化投资以python为工具\\习题解答"

# Imports are gathered ahead of the exercises so every name is bound before its
# first use (pandas was previously imported only after the first DataFrame was built).
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm

#1. Plot the series, then regress y on x (with a constant) two ways.
x = list(range(1952, 2016, 4))
y = (29.3, 28.8, 28.5, 28.4, 29.4, 27.6, 27.7, 27.7, 27.8, 27.4, 27.8, 27.1,
     27.3, 27.1, 27.0, 27.5)
plt.plot(x, y)
plt.show()

model = sm.OLS(y, sm.add_constant(x)).fit()
print(model.summary())
data = pd.DataFrame({"x": x, "y": y})
data
myDA.ols("y~x", data, True)

#2. Scatter DAX against FTSE from the EuStockMarkets CSV under Path2.
EU = pd.read_csv(Path2 + '/Part2/005/EuStockMarkets.csv')

plt.plot(EU.DAX, EU.FTSE, '.')

plt.xlabel('DAX')

plt.ylabel('FTSE')
示例#5
0
def qqPlot(array):
    """Show a Q-Q plot of the residuals of an OLS model built from ``array``.

    NOTE(review): ``array`` is the only argument passed to ``OLS``, i.e. no
    regressors are supplied -- confirm which design matrix is intended, since
    statsmodels may reject a model without ``exog``.
    """
    # Imported from their defining modules so the function does not depend on
    # how the surrounding file aliases ``statsmodels``.
    from statsmodels.graphics.gofplots import qqplot
    from statsmodels.regression.linear_model import OLS

    # ``fit`` has to be called: the bare attribute is a bound method and has
    # no ``resid``.
    model_fit = OLS(array).fit()
    qqplot(model_fit.resid)
    plt.show()
示例#6
0
def durbinWatson(array):
    """Return the Durbin-Watson statistic of the residuals of an OLS model on ``array``.

    NOTE(review): ``array`` is the only argument passed to ``OLS``, i.e. no
    regressors are supplied -- confirm which design matrix is intended, since
    statsmodels may reject a model without ``exog``.
    """
    # Imported from their defining modules so the function does not depend on
    # how the surrounding file aliases ``statsmodels``.
    from statsmodels.regression.linear_model import OLS
    from statsmodels.stats.stattools import durbin_watson

    # ``fit`` has to be called: the bare attribute is a bound method and has
    # no ``resid``.
    model_fit = OLS(array).fit()
    # The statistic used to be computed and dropped; hand it to the caller.
    return durbin_watson(model_fit.resid, axis=0)
示例#7
0
# Raw string: same path value as before, without relying on "\w" being an
# unrecognised escape sequence.
df = pd.read_csv(r"data\world-happiness-report-2021.csv")

y_feature = ["Ladder score"] * 6
features = [
    "Logged GDP per capita", "Social support", "Healthy life expectancy",
    "Freedom to make life choices", "Generosity", "Perceptions of corruption"
]

dependent = df["Ladder score"]
independent = df[features]

# scikit-learn fit (LinearRegression's default settings are used).
# A pasted repr line "LinearRegression(..., normalize=False)" used to follow the
# fit; it built and discarded an estimator, and current scikit-learn releases no
# longer accept ``normalize``.
model = LinearRegression()
model.fit(independent, dependent)

intercept = model.intercept_
coefficients = model.coef_

print("R2: ", model.score(independent, dependent))
print("Intercept: ", intercept)
print("coefficients: ", coefficients)

# statsmodels fit of the same regression. The design matrix with the added
# constant has to be the one passed to OLS; passing ``independent`` dropped
# the intercept.
x = ssm.add_constant(independent)
model = ssm.OLS(dependent, x).fit()
predictions = model.summary()
print(predictions)
示例#8
0
make_cats = ['BUICK', 'CADILLAC', 'CHEVROLET', 'CHRYSLER', 'DODGE', 'FORD', 'GMC', 'HONDA', 'HUMMER', 'HYUNDAI', 'INFINITI', 'ISUZU', 'JEEP', 'KIA', 'LEXUS', 'LINCOLN', 'MAZDA', 'MERCURY', 'MINI', 'MITSUBISHI', 'NISSAN', 'OLDSMOBILE', 'PLYMOUTH', 'PONTIAC', 'SATURN', 'SCION', 'SUBARU', 'SUZUKI', 'TOYOTA', 'TOYOTA SCION', 'VOLKSWAGEN', 'VOLVO']
state_cats = ['AR', 'AZ', 'CA', 'CO', 'FL', 'GA', 'IA', 'ID', 'IL', 'IN', 'KY', 'LA', 'MA', 'MD', 'MI', 'MN', 'MO', 'MS', 'NC', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'SC', 'TN', 'TX', 'UT', 'VA', 'WA', 'WV']
auction_cats = ['ADESA', 'MANHEIM', 'OTHER']
trans_cats = ['AUTO']
color_cats = ['BEIGE', 'BLACK', 'BLUE', 'BROWN', 'GOLD', 'GREEN', 'GREY', 'MAROON', 'NOT AVAIL', 'ORANGE', 'OTHER', 'PURPLE', 'RED', 'SILVER', 'WHITE', 'YELLOW']
wheel_cats = ['Alloy', 'Covers', 'Special']
nat_cats = ['AMERICAN', 'OTHER', 'OTHER ASIAN', 'TOP LINE ASIAN']
size_cats =['COMPACT', 'CROSSOVER', 'LARGE', 'LARGE SUV', 'LARGE TRUCK', 'MEDIUM', 'MEDIUM SUV', 'SMALL SUV', 'SMALL TRUCK', 'SPECIALTY', 'SPORTS', 'VAN']
year_cats = ['2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010_x']

# Defined ahead of ``all_vars`` because that list refers to it.
continuous_vars = ['VehicleAge','VehOdo','VehBCost','discount']

all_vars = [continuous_vars, make_cats, state_cats, auction_cats, trans_cats, color_cats, wheel_cats, nat_cats, size_cats, year_cats]


def _ols_formula(response, column_names):
    """Build a patsy formula ``response ~ Q("a") + Q("b") + ...``.

    ``Q()`` quotes each name so that names containing spaces or starting with
    a digit (e.g. 'LARGE SUV', '2001') can still be looked up in the data.
    """
    terms = ' + '.join('Q("%s")' % name for name in column_names)
    return '%s ~ %s' % (response, terms)


## OLS ##
# The formulas previously named the Python lists themselves ('IsBadBuy ~ all_vars'),
# which patsy evaluates as a variable rather than expanding into one term per name.
# NOTE(review): assumes every listed name is a column of ``training_clean`` -- confirm.
f = _ols_formula('IsBadBuy', [name for group in all_vars for name in group])
y,X = patsy.dmatrices(f, training_clean, return_type='dataframe')
# ``sm.OLS`` (already used further down) takes the prepared design matrices;
# print is written as a call so the script also parses on Python 3.
print(sm.OLS(y,X).fit().summary())

# NOTE(review): 'discount' is only derived on ``tempset`` below -- confirm that
# ``training`` already carries that column before this regression runs.
f = _ols_formula('IsBadBuy', continuous_vars)
y,X = patsy.dmatrices(f, training, return_type='dataframe')
print(sm.OLS(y,X).fit().summary())

# Work on an explicit copy so adding a column does not write into a slice of ``training``.
tempset = training[training.MMRAcquisitionAuctionAveragePrice.notnull()].copy()
#define new variables
tempset['discount'] = tempset['VehBCost']-tempset['MMRAcquisitionAuctionAveragePrice']


regression_OLS=sm.OLS(tempset.IsBadBuy, tempset[continuous_vars],hasconst=False).fit()

print(regression_OLS.summary())
示例#9
0
#summary(anova1)

# better option - "aov"
#anova2 <- aov(Fare ~ Embarked, data = rTrain)
#anova2
#summary(anova2)
# ====== end of R Code ==========

#==============================================================================
# Multiple Linear Regression
#==============================================================================
# Assumptions... and what do they mean?
Rpdata = pd.read_csv(os.path.join(rawDataFld, "alr4_Rpdata.csv"))

import statsmodels.api as sm
import seaborn as sns

# Regress y on x1..x6 exactly as given (no constant column is added here).
lmmodel = sm.OLS(Rpdata["y"], Rpdata[["x1", "x2", "x3", "x4", "x5",
                                      "x6"]]).fit()

# does this look like a good model (numerically speaking)?
lmmodel.summary()

# how about we "dig deeper"? - residual plots
lmmodel_fitted_y = lmmodel.fittedvalues
lmmodel_residuals = lmmodel.resid
# x and y are passed by keyword: recent seaborn releases accept them only as
# keyword arguments, and older releases accept keywords as well.
sns.residplot(x=lmmodel_fitted_y, y='y', data=Rpdata, lowess=True)
# Uh-oh! The residuals are not random! Or are they? What makes them non-random?
# Participate in the online discussion on randomness!
# HW follow up with comments on what is randomness, and how is this relevant to our analysis?
示例#10
0
    mean_fake.append(mean_tau)

    plt.scatter(this_unit['tau'], mean_tau)

# Reshape both collections into single-column 2-D arrays.
mean_fake = np.array(mean_fake).reshape(-1, 1)
mean_real = np.array(mean_real).reshape(-1, 1)

# Linear regression of "fake" on "real", and its predictions on the same inputs.
regr = linear_model.LinearRegression()
regr.fit(mean_real, mean_fake)
fake_tau_pred = regr.predict(mean_real)
r2 = r2_score(mean_fake, fake_tau_pred)

import statsmodels.api as sm
# NOTE(review): this OLS is given ``mean_real`` without a constant column, while
# the LinearRegression above uses its default settings -- confirm the two fits
# are meant to differ.
mod = sm.OLS(mean_fake, mean_real)
fii = mod.fit()
coefficient_table = fii.summary2().tables[1]
p_values = coefficient_table['P>|t|']

# Regression line.
plt.plot(mean_real, fake_tau_pred, color='#00b2ee', label='regression')

# Identity line over 0..999 for reference.
identity_axis = range(1000)
plt.plot(identity_axis, identity_axis, '-', color='#f00c93', label='identity')

plt.title('Monkey amygdala \n 1000 iterations')
plt.xlabel('"real" tau (ms)')
plt.ylabel('"fake" tau (ms)')
# NOTE(review): the p-value shown is hard-coded text; ``p_values`` computed
# above is not used here -- confirm.
plt.text(0, 800, '$R^2 = %.2f $ \n $p < 10^{-30}$' % (r2))
plt.legend()
示例#11
0
def diagnostic_plots(X, y, model_fit=None):
    """
    Function to reproduce the 4 base plots of an OLS model in R.
    https://robert-alvarez.github.io/2018-06-04-diagnostic_plots/
    ---
    Inputs:

    X: A pandas dataframe of the features to use in building the linear regression model

    y: A pandas series/dataframe of the target variable of the linear regression model

    model_fit [optional]: a statsmodel.api.OLS model after regressing y on X. If not provided, will be
                          generated from X, y

    X and y are combined with ``pd.concat`` and the residuals are ranked with
    ``.sort_values``, so pandas objects are expected. Four figures are drawn
    (Residuals vs Fitted, Normal Q-Q, Scale-Location, Residuals vs Leverage);
    nothing is returned.
    """
    def _graph(formula, x_range, label=None):
        """
        Helper function for plotting cook's distance lines
        """
        x = x_range
        y = formula(x)
        plt.plot(x, y, label=label, lw=1, ls='--', color='red')

    if model_fit is None:
        model_fit = sm.OLS(y, sm.add_constant(X)).fit()

    # create dataframe from X, y for easier plot handling
    dataframe = pd.concat([X, y], axis=1)

    # influence measures are computed once and shared by the quantities below
    influence = model_fit.get_influence()

    # model values
    model_fitted_y = model_fit.fittedvalues
    # positional view of the fitted values: the argsort-based annotations below
    # produce positions, which must not be interpreted as index labels
    fitted_by_position = np.asarray(model_fitted_y)
    # model residuals
    model_residuals = model_fit.resid
    # normalized residuals
    model_norm_residuals = influence.resid_studentized_internal
    # square root of the absolute normalized residuals
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
    # absolute residuals
    model_abs_resid = np.abs(model_residuals)
    # leverage, from statsmodels internals
    model_leverage = influence.hat_matrix_diag
    # cook's distance, from statsmodels internals
    model_cooks = influence.cooks_distance[0]

    # ---- plot 1: residuals vs fitted ----
    plot_lm_1 = plt.figure()
    # x and y are passed by keyword: recent seaborn releases accept them only
    # as keyword arguments. The plot is drawn on the current figure, so its
    # axes is available afterwards as plot_lm_1.axes[0].
    sns.residplot(x=model_fitted_y,
                  y=dataframe.columns[-1],
                  data=dataframe,
                  lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws={
                      'color': 'red',
                      'lw': 1,
                      'alpha': 0.8
                  })

    plot_lm_1.axes[0].set_title('Residuals vs Fitted')
    plot_lm_1.axes[0].set_xlabel('Fitted values')
    plot_lm_1.axes[0].set_ylabel('Residuals')

    # annotations: the three largest absolute residuals, looked up by label
    abs_resid = model_abs_resid.sort_values(ascending=False)
    abs_resid_top_3 = abs_resid[:3]
    for i in abs_resid_top_3.index:
        plot_lm_1.axes[0].annotate(i,
                                   xy=(model_fitted_y[i], model_residuals[i]))

    # ---- plot 2: normal Q-Q ----
    QQ = ProbPlot(model_norm_residuals)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    plot_lm_2.axes[0].set_title('Normal Q-Q')
    plot_lm_2.axes[0].set_xlabel('Theoretical Quantiles')
    plot_lm_2.axes[0].set_ylabel('Standardized Residuals')
    # annotations
    abs_norm_resid = np.flip(np.argsort(np.abs(model_norm_residuals)), 0)
    abs_norm_resid_top_3 = abs_norm_resid[:3]
    for r, i in enumerate(abs_norm_resid_top_3):
        plot_lm_2.axes[0].annotate(i,
                                   xy=(np.flip(QQ.theoretical_quantiles,
                                               0)[r], model_norm_residuals[i]))

    # ---- plot 3: scale-location ----
    plot_lm_3 = plt.figure()
    plt.scatter(model_fitted_y, model_norm_residuals_abs_sqrt, alpha=0.5)
    sns.regplot(x=model_fitted_y,
                y=model_norm_residuals_abs_sqrt,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws={
                    'color': 'red',
                    'lw': 1,
                    'alpha': 0.8
                })
    plot_lm_3.axes[0].set_title('Scale-Location')
    plot_lm_3.axes[0].set_xlabel('Fitted values')
    # raw string: same label text, without relying on "\s" being an
    # unrecognised escape sequence
    plot_lm_3.axes[0].set_ylabel(r'$\sqrt{|Standardized Residuals|}$')

    # annotations
    abs_sq_norm_resid = np.flip(np.argsort(model_norm_residuals_abs_sqrt), 0)
    abs_sq_norm_resid_top_3 = abs_sq_norm_resid[:3]
    for i in abs_sq_norm_resid_top_3:
        plot_lm_3.axes[0].annotate(i,
                                   xy=(fitted_by_position[i],
                                       model_norm_residuals_abs_sqrt[i]))

    # ---- plot 4: residuals vs leverage ----
    plot_lm_4 = plt.figure()
    plt.scatter(model_leverage, model_norm_residuals, alpha=0.5)
    sns.regplot(x=model_leverage,
                y=model_norm_residuals,
                scatter=False,
                ci=False,
                lowess=True,
                line_kws={
                    'color': 'red',
                    'lw': 1,
                    'alpha': 0.8
                })
    plot_lm_4.axes[0].set_xlim(0, max(model_leverage) + 0.01)
    plot_lm_4.axes[0].set_ylim(-3, 5)
    plot_lm_4.axes[0].set_title('Residuals vs Leverage')
    plot_lm_4.axes[0].set_xlabel('Leverage')
    plot_lm_4.axes[0].set_ylabel('Standardized Residuals')

    # annotations: the three observations with the largest Cook's distance
    leverage_top_3 = np.flip(np.argsort(model_cooks), 0)[:3]
    for i in leverage_top_3:
        plot_lm_4.axes[0].annotate(i,
                                   xy=(model_leverage[i],
                                       model_norm_residuals[i]))

    p = len(model_fit.params)  # number of model parameters
    _graph(lambda x: np.sqrt((0.5 * p * (1 - x)) / x),
           np.linspace(0.001, max(model_leverage), 50),
           'Cook\'s distance')  # 0.5 line
    _graph(lambda x: np.sqrt((1 * p * (1 - x)) / x),
           np.linspace(0.001, max(model_leverage), 50))  # 1 line
    plot_lm_4.legend(loc='upper right')
(grp2_df_rev.corr()).iloc[:,0]


plt.plot(grp2_df_rev['meter_reading'])
plt.plot(grp2_df_rev['wind_direction'])
plt.legend(['meter_reading','wind_direction'])
plt.show()



import statsmodels.api as sm
import statsmodels


# OK THERE IS NO TREND HERE  (ON BOTH SERIES )
meter_mean = np.array(grp2_df_rev['meter_reading']['mean'])
wind_mean = np.array(grp2_df_rev['wind_direction']['mean'])

# Model 1: regression with a constant term. ``hasconst=False`` is no longer
# passed here because it contradicts the constant column that add_constant
# supplies; statsmodels is left to detect the constant itself.
mod1_grp2=sm.OLS(meter_mean,statsmodels.tools.tools.add_constant(wind_mean))
# Fit once and reuse the results for both the summary and the residuals.
results=mod1_grp2.fit()
results.summary()
plt.hist(results.resid)
plt.show()

adfuller(results.resid)

# OBVIOUSLY RESIDUALS WITHOUT TREND



# Model 2: regression through the origin (no constant column), so
# ``hasconst=False`` matches the design matrix.
mod2_grp2=sm.OLS(meter_mean,wind_mean,hasconst=False)
results2=mod2_grp2.fit()
results2.summary()
plt.hist(results2.resid)
示例#13
0
# Use the "statsmodels" package of econometric commands with "sm" shortcut.
# The shortcut must point at statsmodels.api: that namespace provides OLS,
# whereas the bare top-level package does not.
import statsmodels.api as sm

# Define a model regressing y on X using OLS from the sm module
ols_model = sm.OLS(y, X)

# Estimate the model by calling the .fit() method
ols_results = ols_model.fit()

# Report the results to the terminal
print(ols_results.summary())