示例#1
0
def test_linearity(x, y, n_knots=5, verbose=True):
    """Test linearity between two variables.

    Run a linear regression of y on x, and take the residuals.
    Fit the residuals with a natural spline with `n_knots` knots.
    Conduct a joint F-test for all columns in the natural spline basis matrix.

    Parameters
    ----------
    x, y : array-like
        Explanatory and response observations.
    n_knots : int, optional
        Number of knots; the spline basis is built with ``df = n_knots - 1``.
    verbose : bool, optional
        When true, print a one-line report (N, df, F, p). When false,
        nothing is printed.

    Returns
    -------
    The p-value of the joint F-test, rounded to 6 decimals.

    Example:
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> x = np.linspace(0., 1., 101)
    >>> y = 5 * x + 3 + rng.random(size=101) / 5
    >>> test_linearity(x, y, n_knots=5, verbose=False)
    0.194032
    """
    # Step 1: residuals of the straight-line fit (intercept added explicitly).
    residuals = OLS(y, add_constant(x)).fit().resid
    # Step 2: centred natural cubic spline basis without its own intercept.
    basis_matrix = patsy.dmatrix(
        f"cr(x, df={n_knots - 1}, constraints='center') - 1", {'x': x},
        return_type='dataframe')
    # Step 3: regress the residuals on the basis and read off the joint F-test.
    results = OLS(residuals, basis_matrix).fit()
    p_value = np.round(results.f_pvalue, 6)
    if verbose:
        nobs = results.nobs
        f_value = results.fvalue
        # NOTE(review): the reported df is nobs - n_knots - 1, which may differ
        # from results.df_resid -- confirm which one is intended.
        print('Test for Linearity: '
              f'N = {nobs:.0f}; df={nobs - n_knots - 1:.0f}; '
              f'F = {f_value:.3f}; p = {p_value:.6f}.')
    return p_value
def backwardElimination(x, SL):
    """Backward elimination on p-values with an adjusted-R^2 rollback.

    Repeatedly fits ``OLS(y, x)``, and while the largest p-value exceeds
    ``SL`` removes the column carrying that p-value. If removing it does not
    raise the adjusted R^2, a rolled-back matrix is returned instead.

    ``y`` is not a parameter: it is read from the enclosing (module) scope.

    :param x: 2-D numpy array of regressors (indexed as ``x[:, j]``).
    :param SL: significance level a p-value must exceed to be eliminated.
    :return: the reduced regressor matrix (or the rollback matrix).
    """
    numVars = len(x[0])
    # Scratch buffer holding removed columns. Its shape is fixed at 50x6 and
    # its dtype is int, so x must have 50 rows, removed column indices must be
    # < 6, and stored values are truncated to integers.
    temp = np.zeros((50, 6)).astype(int)
    for i in range(0, numVars):
        regressor_OLS = OLS(y, x).fit()
        print(regressor_OLS.summary())
        maxVar = max(regressor_OLS.pvalues).astype(float)
        adjR_before = regressor_OLS.rsquared_adj.astype(float)
        if maxVar > SL:
            # Locate the column whose p-value equals the maximum.
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    # Remember the column before dropping it so it can be
                    # restored on rollback.
                    temp[:, j] = x[:, j]
                    x = np.delete(x, j, 1)
                    tmp_regressor = OLS(y, x).fit()
                    adjR_after = tmp_regressor.rsquared_adj.astype(float)
                    if (adjR_before >= adjR_after):
                        # Removal did not improve adjusted R^2: append saved
                        # columns 0 and j, then delete index j again.
                        # NOTE(review): this does not obviously rebuild the
                        # pre-removal matrix -- confirm the intended rollback.
                        x_rollback = np.hstack((x, temp[:, [0, j]]))
                        x_rollback = np.delete(x_rollback, j, 1)
                        print(regressor_OLS.summary())
                        return x_rollback
                    else:
                        # The scan continues with the p-values of the fit made
                        # before the deletion.
                        continue
        else:
            # Every remaining p-value is at or below SL: done.
            break
    return x
示例#3
0
def prosperity_score_regression(cards,
                                metadata,
                                score_columns=score_column_names):
    """
    Perform a linear regression to determine the degree to which the
    Prosperity add-on cards contribute to a good score, and print the
    regression summary.

    Only the cards found under ``cards['currency']`` are used as regressors;
    victory cards are not included by this implementation.

    :param cards: DataFrame with three-level columns; the last level is
        matched against the card names.
    :param metadata: DataFrame holding the score columns.
    :param score_columns: names of the columns averaged into a per-row score.
    :return: None; the summary is printed.
    """
    prosperity = set(cards['currency'].columns.get_level_values(1))
    scores = np.mean(metadata.loc[:, tuple(score_columns)], axis=1)

    # Ignore missing cells
    refine_idx = np.isfinite(scores)
    scores = scores[refine_idx]

    # One column per card, plus a column of ones that acts as the intercept
    # (labelled 'Average game score' in the summary).
    card_frames = [
        pd.DataFrame(cards.loc[refine_idx, pd.IndexSlice[:, :, c]].values,
                     columns=[c]) for c in prosperity
    ]
    intercept_frame = pd.DataFrame(np.ones((scores.size, 1)),
                                   columns=['Average game score'])
    set_counts = pd.concat(card_frames + [intercept_frame],
                           axis=1).fillna(0)

    results = OLS(scores, set_counts).fit()
    # Function-call form works on both Python 2 and Python 3.
    print(results.summary())
示例#4
0
def nuevo_regress():
    """Regress the global score on the maths score and inspect the residuals.

    Fits ``OLS(DATASET.puntaje_global, DATASET.puntaje_matematicas)``, prints
    the model summary, prints the result of ``anderson`` on the residuals and
    passes the residuals to ``grafica_qq``.
    """
    ajuste = OLS(DATASET.puntaje_global, DATASET.puntaje_matematicas).fit()
    reporte, residuos = ajuste.summary(), ajuste.resid
    print(reporte)
    print(anderson(residuos))
    grafica_qq(residuos)
示例#5
0
def wrapper(dataset, req, wk):
    """Fit an OLS model for a user-selected target and record significant features.

    When ``req`` equals ``False`` the user is asked to pick a target column by
    number. An OLS model is fitted on the dummy-encoded dataset, its summary
    is written to ``summaryOLS.txt`` and printed, the p-values are written to
    ``impfeatures.csv``, and every feature with p < 0.05 is printed and
    written to ``impfeatures.txt``. All files live under
    ``<pwd>/Workspaces/<wk>/``. Otherwise only "Trained Model" is printed.

    Any exception is caught, printed, and followed by an error message.

    :param dataset: pandas DataFrame of features, including the target.
    :param req: flag; the analysis runs only when it compares equal to False.
    :param wk: workspace name used to build the output directory path.
    """
    try:
        # `== False` is kept on purpose: None must still take the else branch.
        if (req == False):  # noqa: E712
            print("Features Available")
            for count, column in enumerate(dataset.columns, start=1):
                print(str(count) + " " + str(column))
            while True:
                index = int(input("Mention Target Feature [Number]: "))
                if index < 1 or index > len(dataset.columns):
                    print("Index should be among the list only")
                else:
                    break
            # NOTE(review): X is built from the whole dataset, so the target
            # column appears to be among the regressors -- confirm intended.
            X = get_dummies(dataset, drop_first=True)
            y = dataset[str(dataset.columns[index - 1])]
            model = OLS(endog=y, exog=X).fit()

            workspace = str(pwd) + "/Workspaces/" + str(wk)
            # Context managers close the files even if a write fails.
            with open(workspace + "/summaryOLS.txt", "w+") as f:
                f.write(str(model.summary()))
            print(model.summary())

            from numpy import array
            # Round-trip through CSV: impfeatures.csv is itself an output, and
            # reading it back yields (feature name, p-value) rows.
            model.pvalues.to_csv(workspace + '/impfeatures.csv', header=True)
            dt = array(read_csv(workspace + '/impfeatures.csv'))
            print("Important Feature for Predicting Target Feature\n")
            with open(workspace + "/impfeatures.txt", "w+") as f:
                # NOTE(review): the header has no trailing newline, so the
                # first feature lands on the same line -- confirm intended.
                f.write("Important Features")
                for i in dt:
                    if i[1] < 0.05:
                        print(str(i[0]))
                        f.write(str(str(i[0]) + " " + str(i[1])) + "\n")
        else:
            print("Trained Model")
    except Exception as e:
        print(e)
        print("Error Occured in Wrapper")
def backwardElimination(x, sl):
    """Drop regressors whose p-value is the largest and exceeds ``sl``.

    Runs one fit of ``OLS(y, x)`` per original column (``y`` comes from the
    enclosing scope), printing each summary. Whenever the largest p-value is
    above ``sl``, every column whose p-value equals that maximum is deleted.

    :param x: 2-D numpy array of regressors.
    :param sl: significance level.
    :return: the reduced regressor matrix.
    """
    n_columns = len(x[0])
    for pass_no in range(n_columns):
        fitted = OLS(y, x).fit()
        worst_p = max(fitted.pvalues).astype(float)
        print(fitted.summary())
        if not worst_p > sl:
            # Nothing to remove on this pass; the loop still runs to the end.
            continue
        for col in range(n_columns - pass_no):
            if fitted.pvalues[col].astype(float) == worst_p:
                x = np.delete(x, col, 1)
    return x
示例#7
0
def set_score_regression(cards, metadata, score_columns=score_column_names):
    """
    Perform a linear regression to determine the degree to which each
    game set's action cards contribute to a good score, and print the
    regression summary.

    :param cards: DataFrame with three-level columns; the middle level is
        matched against the set names taken from ``cards['action']``.
    :param metadata: DataFrame holding the score columns.
    :param score_columns: names of the columns averaged into a per-row score.
    :return: None; the summary is printed.
    """
    sets = set(cards['action'].columns.get_level_values(0))
    scores = np.mean(metadata.loc[:, tuple(score_columns)], axis=1)

    # Ignore missing cells
    refine_idx = np.isfinite(scores)
    scores = scores[refine_idx]

    # One regressor per set: the row-wise sum over that set's columns.
    set_counts = pd.concat([
        pd.DataFrame(np.sum(cards.loc[refine_idx, pd.IndexSlice[:, s, :]],
                            axis=1),
                     columns=[s]) for s in sets
    ],
                           axis=1).fillna(0)

    results = OLS(scores, set_counts).fit()
    # Function-call form works on both Python 2 and Python 3.
    print(results.summary())
示例#8
0
def linear(X, y):
    """
    Fit a multiple linear regression by ordinary least squares.

    The regression summary is printed before the fitted results are returned.
    ``X`` is handed to ``OLS`` as given; no intercept column is added here.

    :param X: explanatory variables
    :param y: response variable
    :return: fitted linear regression results
    >>> db = pd.read_csv("Variables_for_analysis.csv")# doctest:+ELLIPSIS
    >>> pov = db["poverty"] / 100# doctest:+ELLIPSIS
    >>> un = db["unemployment"] / 100# doctest:+ELLIPSIS
    >>> edu = db["High school or higher"] / 100# doctest:+ELLIPSIS
    >>> xv1 = pd.DataFrame([edu, pov, un]).T# doctest:+ELLIPSIS
    >>> yv = pd.DataFrame(np.log10(db["rate"]))# doctest:+ELLIPSIS
    >>> linear(xv1, yv) # doctest:+ELLIPSIS
                                     OLS Regression Result...
    <statsmodels.regression.linear_model.RegressionResultsWrapper object at ...>

    """
    # Build the model first, then fit it in a separate step.
    model = OLS(y, X)
    fitted = model.fit()
    print(fitted.summary())
    return fitted
示例#9
0
def compute_LR(feat_df, label_col='netRunTime'):
    """Regress ``label_col`` on every other column of ``feat_df``.

    The model is fitted with the ``HC3`` covariance type.

    :param feat_df: DataFrame containing the label column and the features.
    :param label_col: name of the column used as the dependent variable.
    :return: ``(fitted_results, summary)`` tuple.
    """
    target = feat_df[label_col]
    # Boolean mask selecting every column except the label.
    feature_mask = feat_df.columns != label_col
    features = feat_df.loc[:, feature_mask]
    fitted = OLS(target, features).fit(cov_type='HC3')
    return fitted, fitted.summary()
示例#10
0
# Notebook export: correlation and regression checks for travel_time.
# Bare expressions below only display a value inside a notebook cell; when run
# as a script their results are discarded.
print("p-value (corr. with num. deliveries):")
# Element [1] of pearsonr's result is the p-value.
stats.pearsonr(num_deliveries, travel_time)[1]

# In[15]:

print("p-value (correlation with gas price):")
stats.pearsonr(gas_price, travel_time)[1]

# In[59]:

# Simple regression with miles_traveled as x and travel_time as y.
slope, intercept, r, p, stderr = stats.linregress(miles_traveled, travel_time)

# In[60]:

stderr

# In[62]:

plot_regression_line(miles_traveled, travel_time, 'r')

# In[88]:

# OLS takes (endog, exog): here miles_traveled is the dependent variable and
# travel_time the only regressor, with no constant added.
# NOTE(review): this is the opposite direction from the linregress call above
# -- confirm which variable is meant to be the response.
b = OLS(miles_traveled, travel_time).fit()

# In[89]:

b.summary()

# In[ ]:
示例#11
0
# Load the predictor column names, one per line, from a text file.
predictors = list()
with open('temp_relevant_features_2.txt', 'r') as f:
    predictors = [line.strip() for line in f]

# print(predictors)
# Keep only the response column and the listed predictors.
df = df[['Temp (°C)'] + predictors]

# Design matrix with an explicit constant column; response is the temperature.
X = add_constant(df[predictors])
Y = df['Temp (°C)']
# print(x.ix[:5, :5])

# Significance threshold used by the elimination loop that follows.
alpha = 0.05

model = OLS(Y, X).fit()

# Convert the coefficient table (second table of the summary) into a
# DataFrame by round-tripping through HTML, then transpose it so that each
# regressor becomes a column.
results = model.summary()
results_as_html = results.tables[1].as_html()
results_df = read_html(results_as_html, header=0, index_col=0)[0].T
results_fit = False
while not results_fit:
    results_fit = True
    max_p_val = alpha
    max_key = None
    for x in tqdm(results_df.columns[1:]):
        # if results_df[x]['P>|t|'] > max_p_val:
        #     max_p_val = results_df[x]['P>|t|']
        #     max_key = x
        #     results_fit = False
        if results_df[x]['P>|t|'] > alpha:
            # max_p_val = results_df[x]['P>|t|']
            # max_key = x
示例#12
0
文件: code.py 项目: asarantsev/IDY
print('stdev of real earnings growth = ', np.std(growth))

IDY = TR[W:] - growth  #implied dividend yield
# cumulative implied dividend yield, after detrending it becomes heat measure
cumIDY = np.append(np.array([0]), np.cumsum(IDY))

# graphs of ACF and QQ for real earnings growth terms
plot_acf(growth)
plt.show()
qqplot(growth, line='s')
plt.show()

# main regression: IDY on a constant, a linear trend and the lagged
# cumulative IDY (labelled 'Bubble')
DF = pd.DataFrame({'const': 1, 'trend': range(T - 1), 'Bubble': cumIDY[:-1]})
Regression = OLS(IDY, DF).fit()
print(Regression.summary())
coefficients = Regression.params
# params is a pandas Series labelled by the DF column names, so positional
# access goes through .iloc (plain [0] on a string-labelled Series is
# deprecated). Order follows the DF columns: const, trend, Bubble.
intercept = coefficients.iloc[0]
trendCoeff = coefficients.iloc[1]
heatCoeff = coefficients.iloc[2]
avgIDY = trendCoeff / abs(heatCoeff)
print('avgIDY = ', avgIDY)
avgHeat = (intercept - avgIDY) / abs(heatCoeff)
print('long-term average heat measure = ', avgHeat)

# Detrend the cumulative IDY with the estimated average IDY.
Heat = cumIDY - avgIDY * range(T)  #Heat measure
plt.figure(figsize=(7, 6))
plt.plot(range(NEW, LAST), Heat)
print('current heat measure = ', Heat[-1])
plt.title('Heat measure')
plt.show()
示例#13
0
    # Load the data
    print(f"读取数据。。{vt_symbols}")
    df = load_portfolio_data(vt_symbols, start, end)
    print(df)

    ## Plot the raw price charts of the two instruments
    run_plotly(vt_symbols)

    # Run the regression with ordinary least squares: the first argument is y, the second is x (y = ax + b)
    # Use np.isnan and np.isinf to handle missing values
    df[np.isnan(df)] = 0
    df[np.isinf(df)] = 0
    print(df)
    result = OLS(df[vt_symbols[0]], df[vt_symbols[-1]]).fit()
    print(result.summary())

    coef = 0.9994
    # Plot the residual (spread)
    df["spread"] = df[vt_symbols[0]] - coef * df[vt_symbols[-1]]

    fig = go.Figure()
    line = go.Scatter(x=df.index, y=df["spread"], mode='lines', name="Spread")
    fig.add_trace(line)

    fig.show()

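    # Run the cointegration test
    # A p-value below 0.05 clearly demonstrates a cointegration relationship, but that is very rare in practice.
    # The spread still shows many mean shifts overall, but as long as it oscillates back often enough, trading can be profitable even without cointegration.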
    score, pvalue, _ = coint(df[vt_symbols[0]], df[vt_symbols[-1]])
                         columns=['single_genre'],
                         prefix=['single_genre'])
# Show the prepared frame and persist it for later use.
print(movies1)
movies1.to_csv('regression.csv', index=False)

#print(movies1.columns)
# ['budget', 'genres', 'id', 'original_language', 'popularity',
#        'production_countries', 'release_date', 'revenue', 'runtime', 'status',
#        'title', 'vote_average', 'holiday', 'single_genre_Action',
#        'single_genre_Adventure', 'single_genre_Animation',
#        'single_genre_Comedy', 'single_genre_Crime', 'single_genre_Documentary',
#        'single_genre_Drama', 'single_genre_Family', 'single_genre_Fantasy',
#        'single_genre_Foreign', 'single_genre_History', 'single_genre_Horror',
#        'single_genre_Music', 'single_genre_Mystery', 'single_genre_Romance',
#        'single_genre_Science Fiction', 'single_genre_TV Movie',
#        'single_genre_Thriller', 'single_genre_War', 'single_genre_Western']

# Run the regression: revenue (dependent) on budget (single regressor).
# No constant column is added, so the fit has no intercept.
dv = movies1['revenue']
iv = movies1[['budget']]

# Both sides are cast to float before fitting.
movies1_regression = OLS(dv.astype(float), iv.astype(float)).fit()
# Print the regression result
print(movies1_regression.summary())

print(movies1['budget'].describe())

#print(movies1['budget'].astype(int).describe())
#pd.set_option('display.max_columns', None)
#print(movies1.describe())
#print(homework_data['comp_sent_score'].describe())
# Coerce the two processor-speed columns to numeric values.
data["proccessor"] = to_numeric(data["proccessor"])
data["proccessor_turbo"] = to_numeric(data["proccessor_turbo"])
#print(data.info())

# Features and target for the price model.
x = data[["size", "proccessor", "proccessor_turbo", "ram", "hdd"]]
y = data["price"]

# First fit: scikit-learn linear regression.
regr = linear_model.LinearRegression()
regr.fit(x, y)

print("Intercept: ", regr.intercept_)
print("Coeff: ", regr.coef_)
# score() is evaluated on the same data the model was fitted on.
print("Score: ", regr.score(x, y))

# Specification of the new laptop to price.
new_size = 15.6
new_proccessor = 1.6
new_proccessor_turbo = 3.9
new_ram = 12
new_hdd = 1250

predicted = regr.predict(
    [[new_size, new_proccessor, new_proccessor_turbo, new_ram, new_hdd]])
print("Predicted: ", predicted)

# Second fit: statsmodels OLS on the same features plus a constant column.
x = add_constant(x)
model = OLS(y, x).fit()
# The leading 1 lines up with the constant column.
# NOTE(review): this prediction is assigned but not printed in the lines
# shown here -- confirm whether it should be reported.
predicted = model.predict(
    [[1, new_size, new_proccessor, new_proccessor_turbo, new_ram, new_hdd]])
print(model.summary())
 def get_coeffs_from_ols(self, a, b):
     """Fit an OLS model with ``a`` as the dependent variable and ``b`` as regressors.

     In the lines visible here the summary is built and discarded and nothing
     is returned. NOTE(review): the method may be truncated -- confirm
     against the full source.
     """
     # NOTE(review): statsmodels documents `hasconst` as None or bool; the
     # string 'yes' is not a documented value -- confirm the intended setting.
     model = OLS(a, b, missing='none', hasconst='yes').fit()
     model.summary()
示例#17
0
import numpy as np
from statsmodels.api import add_constant, OLS, WLS
import matplotlib.pyplot as plt


# (x, y) is the set of observations.  w contains precomputed weights; we'll
# also compute these weights in this script.
x, y, w = np.loadtxt('draper_smith_table9p1.txt', unpack=True)

# Design matrix with the constant column placed first.
X = add_constant(x, prepend=True)

# --- OLS ---------------------------------------------------------------
# Ordinary least squares fit.
ols_result = OLS(y, X).fit()

# Function-call form of print works on both Python 2 and Python 3.
print(ols_result.summary())

# Make a plot of the OLS residuals vs y and vs x.
# The following recreates Fig. 9.1.
plt.figure(1)
plt.clf()
# Top panel: residuals against fitted values.
plt.subplot(2, 1, 1)
plt.plot(ols_result.fittedvalues, ols_result.resid, 'bo')
plt.title("OLS Residuals versus fitted values")
plt.xlabel('y')
plt.ylabel('e')
plt.grid()
# Bottom panel: residuals against x.
plt.subplot(2, 1, 2)
plt.plot(x, ols_result.resid, 'bo')
plt.title("OLS Residuals versus x")
plt.xlabel("x")
示例#18
0
def sumario(X, Y):
    '''Print the full statsmodels OLS summary of Y regressed on X plus a constant.'''
    design = add_constant(X)
    fitted = OLS(Y, design).fit()
    report = fitted.summary()
    print(report)
示例#19
0
# Fit the backward-selection variant and report its R squared.
backward_model = linearModel(nba_data)
backward_model.feature_selection('backward')
backward_model.predict()
print('Backward model R Squared value: ' + str(backward_model.r_squared))

# Pick the best model based on the R squared value
models = {
    'full model': model.r_squared,
    'forward_model': forward_model.r_squared,
    'backward_model': backward_model.r_squared
}
# Key of the entry with the largest R squared.
best = max(models, key=models.get)
print('Based on the R Squared metric, the ' + str(best) +
      ' is the best choice.')

# Print out the statistical summary for the best model
# NOTE(review): the forward model is refitted here regardless of the value of
# `best` computed above -- confirm that this is intended.
new_model = OLS(forward_model.Y_train, forward_model.X_train).fit()
print(new_model.summary())

# Add data column for predicted salaries from model:
# The first three parameters are paired, by position, with the '2P', 'AST'
# and 'BLK' columns.
# NOTE(review): presumably X_train holds exactly these columns in this order
# -- verify.
nba_data['Predicted_Salary'] = nba_data['2P'] * new_model.params[0] + nba_data[
    'AST'] * new_model.params[1] + nba_data['BLK'] * new_model.params[2]
# Bare expression: only displays a value inside a notebook.
nba_data['Predicted_Salary']
nba_data['Salary_Residual'] = nba_data['salary_float'] - nba_data[
    'Predicted_Salary']

# The two sort_values calls below return new frames that are not stored.
#NBA data sorted in order of most undervalued players:
nba_data.sort_values('Salary_Residual', ascending=True)

#NBA data sorted in order of most overvalued players:
nba_data.sort_values('Salary_Residual', ascending=False)
示例#20
0
# Derive the month number from each date.
mean["Month"] = date.map(lambda x: x.month)

# Reshape to one row per year and one column per month.
mean_long = mean.pivot(
    index="Year",
    columns="Month"
)
# Relabel the columns 1..12 (requires exactly twelve columns after the pivot).
mean_long.columns = range(1, 13)



import patsy
from statsmodels.api import OLS

# Model the mean with B-spline terms (5 degrees of freedom each) in year and
# month.
y, X = patsy.dmatrices("Mean ~ bs(Year, 5) + bs(Month, 5)", data=mean)
model = OLS(y, X).fit()
# Bare expression: only displays a value inside a notebook.
model.summary()




# In-sample fitted values.
mean["Pred"] = model.predict()

# Rename all four columns; the prediction column becomes 'Fitted mean'.
mean.columns = ['Mean', 'Year', 'Month', 'Fitted mean']

# Month-by-year tables of the observed mean and of the original row index.
m_long = mean.pivot(index="Month", columns="Year", values="Mean")
d_long = mean.reset_index().pivot(index="Month", columns="Year", values="index")


# Use twelve colours sampled from the coolwarm colormap as the default
# line-colour cycle.
color = plt.cm.coolwarm(np.linspace(0.1, 0.9, 12))
mpl.rcParams['axes.prop_cycle'] = cycler.cycler('color', color)
# 2. double sample: Perform double sampling
n = 67
n_i = 2*n # this is n'
# Phase 1 draws n' rows from the population; phase 2 subsamples n of those.
phase_1_sample = stock.sample(n=n_i)
phase_2_sample = phase_1_sample.sample(n=n)
print('First Phase Sample:',phase_1_sample.head())
print('Second Phase Sample:',phase_2_sample.head())

# 3. Regression analysis y ~ x -> changeOverTime ~ vwap: Perform a diagnostic analysis to determine if x and y have a linear
# relationship and fitted line goes through the origin based on the sample data. Do regression analysis y ∼ x.

from statsmodels.api import OLS
x = phase_2_sample[['vwap']]
y = phase_2_sample[['changeOverTime']]
# No constant is added, so the fitted line is forced through the origin.
reg = OLS(y, x).fit()
# Bare expressions in this script only display a value inside a notebook.
reg.summary()

# Ratio estimator: sum of y over sum of x in the phase-2 sample.
yi_sum = phase_2_sample['changeOverTime'].sum()
yi_sum
xi_sum = phase_2_sample['vwap'].sum()
xi_sum
r = yi_sum / xi_sum
r
print('ratio estimator (r) =', r)

# 5. estimate your parameter of interest by ratio estimator: Estimate your parameter of interest by
# ratio estimator. Estimate its variance and standard deviation.
N = 368
# NOTE(review): xi_sum comes from the phase-2 sample (n rows) but is scaled by
# N / n' here -- confirm whether the phase-1 total of x was intended.
t_hat_x = N/n_i*xi_sum
t_hat_x
t_hat_r = r*t_hat_x
示例#22
0
# Thirty observations of the star-age variable.
LogStarAge = np.array([
    1.58103844, 1.06471074, 2.39789527, 0.72754861, 0.55675456, 1.91692261,
    1.64865863, 1.38629436, 0.77472717, 1.36097655, 0., 1.80828877, 1.7837273,
    0.64185389, 0.69813472, 2.39789527, -0.35667494, 1.79175947, 1.90210753,
    1.39624469, 1.84054963, 2.19722458, 1.89761986, 1.84054963, 0.74193734,
    0.55961579, 1.79175947, 0.91629073, 2.17475172, 1.36097655
])

N = 30

# Explicit column of ones used as the intercept regressor.
intercept = np.ones(30)
# ,columns=['interc','radius','orbit','metal','mass','age']
df = pd.DataFrame({
    "intercept": intercept,
    "radius": LogPlanetRadius,
    "orbit": LogPlanetOrbit,
    "metal": StarMetallicity,
    "mass": LogStarMass,
    "age": LogStarAge
})
# NOTE(review): `y` is not used by the fits below, which take LogPlanetMass
# directly.
y = pd.DataFrame([LogPlanetMass])

# scikit-learn fit. The design matrix already contains a column of ones, and
# LinearRegression fits its own intercept by default.
regr = linear_model.LinearRegression()
regr.fit(df, LogPlanetMass)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# statsmodels fit on the same design matrix; the 'intercept' column serves as
# the constant.
regols = OLS(LogPlanetMass, df).fit()
# Bare expression: only displays a value inside a notebook.
regols.summary()
# [0.15379, 1.402, -0.140, -1.5995, -0.956, -0.4617]
# Annotated heatmap of the correlation matrix.
sns.heatmap(pearson_coefficeint,annot=True) 
#REGPLOT for highly correlated variables
sns.regplot(x= "bid_price",y="ask_price", data = dfquote_s)

#%%
'''REGRESSION ANALYSIS'''
# Predict ask_price from bid_price and bid_size.
x = dfquote[['bid_price','bid_size']]
y = dfquote['ask_price']
# Hold out 30% of the rows for testing (no random_state, so the split differs
# between runs).
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)
LR=LinearRegression()
LR.fit(x_train, y_train)
y_pred=LR.predict(x_test)
# Bare expression: only displays a value inside a notebook.
r2_score(y_test,y_pred)
#0.9994800886579274
# statsmodels fit on the full data, with no constant column added.
mod1=OLS(y,x).fit()
print(mod1.summary())
                                 OLS Regression Results                                
=======================================================================================
Dep. Variable:              ask_price   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          2.848e+09
Date:                Fri, 25 Sep 2020   Prob (F-statistic):                        0.00
Time:                        15:26:32   Log-Likelihood:                     -8.0572e+06
No. Observations:             2158864   AIC:                                  1.611e+07
Df Residuals:                 2158862   BIC:                                  1.611e+07
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
bid_price      1.0124   1.35e-05   7.52e+04      0.000       1.012       1.012
示例#24
0

# Formula model: log response on log intensity plus the interaction of
# log intensity SD with the categorical Z.
mod = smf.ols(formula='np.log(ADCResponse) ~ np.log(Intensity) + np.log(IntensitySD)*C(Z)', 
    data=lo)
res = mod.fit()
# Bare expressions in this script only display a value inside a notebook.
res.summary()

# 666 the number of the beast! Hell and fire were spawn to be released!

# Fitted value minus log(y), plotted against log(y).
# NOTE(review): presumably y holds the ADCResponse values -- verify.
plt.scatter(np.log(y), res.fittedvalues - np.log(y))
plt.show()


# Lag plot: y[1:] on the horizontal axis against y[:-1] on the vertical axis.
plt.scatter(y[1:], y[:-1])
plt.show()

# Plain OLS of y on all columns of X (no constant is added here).
lm = OLS(y, X).fit()
lm.summary()
X.columns

# Add the log-intensity regressor and inspect the distribution of Z.
X['logI'] = np.log(X.Intensity)
Counter(X.Z)

# Log-response model using only Mass and log intensity.
lm = OLS(np.log(y), X[['Mass','logI']]).fit()
lm.summary()

# Same residual-style plot for the reduced model.
plt.scatter(np.log(y), lm.fittedvalues-np.log(y))
plt.show()