def test_linearity(x, y, n_knots=5, verbose=True):
    """Test linearity between two variables.

    Run a linear regression of y on x, and take the residuals.
    Fit the residuals with a natural spline with `n_knots` knots.
    Conduct a joint F-test for all columns in the natural spline basis
    matrix.

    Args:
        x: Predictor values.
        y: Response values, one per entry of `x`.
        n_knots: Number of knots; the spline basis is requested with
            ``df = n_knots - 1`` and a centering constraint.
        verbose: If true, print a one-line report of the test. If false,
            nothing is printed and only the p-value is returned.

    Returns:
        The p-value of the joint F-test, rounded to six decimals.

    Example:
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> x = np.linspace(0., 1., 101)
    >>> y = 5 * x + 3 + rng.random(size=101) / 5
    >>> test_linearity(x, y, n_knots=5, verbose=False)
    0.194032
    """
    # Stage 1: remove the linear trend; only the residuals are kept.
    residuals = OLS(y, add_constant(x)).fit().resid

    # Stage 2: regress the residuals on a centered spline basis without an
    # intercept ("- 1" in the formula).
    basis_matrix = patsy.dmatrix(
        f"cr(x, df={n_knots - 1}, constraints='center') - 1",
        {'x': x},
        return_type='dataframe')
    results = OLS(residuals, basis_matrix).fit()

    nobs = results.nobs
    f_value = results.fvalue
    p_value = np.round(results.f_pvalue, 6)

    if verbose:
        # The reported df is nobs - (n_knots - 1) - 2; presumably the extra
        # 2 accounts for the intercept and slope of the stage-1 fit.
        print('Test for Linearity: '
              f'N = {nobs:.0f}; df={nobs - n_knots - 1:.0f}; '
              f'F = {f_value:.3f}; p = {p_value:.6f}.')
    return p_value
def backwardElimination(x, SL):
    """Backward elimination driven by p-values and adjusted R-squared.

    Repeatedly fits ``OLS(y, x)`` and removes the column with the largest
    p-value while that p-value exceeds `SL`. If removing a column does not
    increase the adjusted R-squared, the removal is rolled back and the
    restored matrix is returned immediately.

    The response ``y`` is not a parameter: it is read from the enclosing
    (module-level) scope.

    :param x: 2-D array of regressors (indexed as ``x[:, j]`` and passed to
        ``np.delete``).
    :param SL: significance level; elimination stops once the largest
        p-value is not above it.
    :return: the reduced regressor matrix.
    """
    numVars = len(x[0])
    # Scratch buffer holding the most recently removed columns for rollback.
    # NOTE(review): the shape (50, 6) is hard-coded, so this only fits inputs
    # with 50 rows and at most 6 columns, and the int dtype truncates any
    # fractional values stored in it - confirm against the intended data set.
    temp = np.zeros((50, 6)).astype(int)
    for i in range(0, numVars):
        regressor_OLS = OLS(y, x).fit()
        print(regressor_OLS.summary())
        maxVar = max(regressor_OLS.pvalues).astype(float)
        adjR_before = regressor_OLS.rsquared_adj.astype(float)
        if maxVar > SL:
            # Only numVars - i columns remain after i removals.
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    # Save the column before dropping it, then refit.
                    temp[:, j] = x[:, j]
                    x = np.delete(x, j, 1)
                    tmp_regressor = OLS(y, x).fit()
                    adjR_after = tmp_regressor.rsquared_adj.astype(float)
                    if (adjR_before >= adjR_after):
                        # Removal did not help: append saved columns 0 and j,
                        # then delete position j again, and return that
                        # matrix.
                        x_rollback = np.hstack((x, temp[:, [0, j]]))
                        x_rollback = np.delete(x_rollback, j, 1)
                        print(regressor_OLS.summary())
                        return x_rollback
                    else:
                        continue
        else:
            # No remaining p-value exceeds SL.
            break
    return x
def prosperity_score_regression(cards, metadata,
                                score_columns=score_column_names):
    """
    Perform a linear regression to determine the degree to which the
    Prosperity add-on treasure cards contribute to a good score.

    Only the cards under the ``'currency'`` column group of `cards` are used
    as regressors; victory cards are not included.

    :param cards: DataFrame with three-level columns; the card name is the
        last level (it is selected with ``pd.IndexSlice[:, :, c]``).
    :param metadata: DataFrame holding the score columns.
    :param score_columns: names of the `metadata` columns averaged per row
        to form the response.
    :return: None. The regression summary is printed.
    """
    prosperity = set(cards['currency'].columns.get_level_values(1))

    scores = np.mean(metadata.loc[:, tuple(score_columns)], axis=1)

    # Ignore missing cells
    refine_idx = np.isfinite(scores)
    scores = scores[refine_idx]

    # One regressor column per card, plus a column of ones that acts as the
    # intercept and is labelled 'Average game score' in the summary.
    set_counts = pd.concat([
        pd.DataFrame(cards.loc[refine_idx, pd.IndexSlice[:, :, c]].values,
                     columns=[c])
        for c in prosperity
    ] + [
        pd.DataFrame(np.ones((scores.size, 1)),
                     columns=['Average game score'])
    ], axis=1).fillna(0)

    results = OLS(scores, set_counts).fit()
    # print() with a single argument behaves the same on Python 2 and 3;
    # the former print statement is a syntax error on Python 3.
    print(results.summary())
def nuevo_regress():
    """Regress DATASET.puntaje_global on DATASET.puntaje_matematicas.

    Prints the OLS summary, prints the result of ``anderson`` applied to the
    residuals, and passes the residuals to ``grafica_qq``.
    """
    fitted = OLS(DATASET.puntaje_global, DATASET.puntaje_matematicas).fit()
    report = fitted.summary()
    residuals = fitted.resid

    print(report)
    print(anderson(residuals))
    grafica_qq(residuals)
def wrapper(dataset, req, wk):
    """Interactively fit an OLS model and report the significant features.

    When `req` equals False the user is asked to pick a target column by
    number. The target is regressed on the dummy-encoded remaining columns
    and three files are written under ``<pwd>/Workspaces/<wk>/``:

    * ``summaryOLS.txt``  - the model summary,
    * ``impfeatures.csv`` - the p-value of every regressor,
    * ``impfeatures.txt`` - the regressors with p-value below 0.05.

    For any other `req` only "Trained Model" is printed.

    Args:
        dataset: DataFrame holding the features and the target.
        req: flag; the interactive fit runs only when ``req == False``.
        wk: workspace name, used as a directory under ``<pwd>/Workspaces``.

    Returns:
        None. Any exception is printed and swallowed (best-effort routine).
    """
    try:
        # Loose equality is kept on purpose so 0 selects this branch as well.
        if req == False:  # noqa: E712
            print("Features Available")
            for count, column in enumerate(dataset.columns, start=1):
                print(str(count) + " " + str(column))

            # Re-prompt until a valid 1-based column number is given; a
            # non-numeric answer is treated like an out-of-range one instead
            # of aborting the whole routine.
            while True:
                try:
                    index = int(input("Mention Target Feature [Number]: "))
                except ValueError:
                    print("Index should be among the list only")
                    continue
                if 1 <= index <= len(dataset.columns):
                    break
                print("Index should be among the list only")

            target = dataset.columns[index - 1]
            y = dataset[target]
            # The target must not be one of its own regressors, otherwise the
            # fit is trivially perfect and the p-values are meaningless.
            X = get_dummies(dataset.drop(columns=[target]), drop_first=True)
            model = OLS(endog=y, exog=X).fit()

            workspace = str(pwd) + "/Workspaces/" + str(wk)

            summary_text = str(model.summary())
            with open(workspace + "/summaryOLS.txt", "w") as f:
                f.write(summary_text)
            print(summary_text)

            model.pvalues.to_csv(workspace + '/impfeatures.csv', header=True)

            print("Important Feature for Predicting Target Feature\n")
            with open(workspace + "/impfeatures.txt", "w") as f:
                # Newline keeps the header on its own line.
                f.write("Important Features\n")
                for feature, p_value in model.pvalues.items():
                    if p_value < 0.05:
                        print(str(feature))
                        f.write(str(feature) + " " + str(p_value) + "\n")
        else:
            print("Trained Model")
    except Exception as e:
        print(e)
        print("Error Occured in Wrapper")
def backwardElimination(x, sl):
    """Backward elimination by p-value.

    Repeatedly fits ``OLS(y, x)``, prints the summary, and removes the single
    column with the largest p-value while that p-value exceeds `sl`.

    The response ``y`` is not a parameter: it is read from the enclosing
    (module-level) scope.

    :param x: 2-D array of regressors.
    :param sl: significance level; elimination stops as soon as no p-value
        is above it.
    :return: the reduced regressor array.
    """
    numVars = len(x[0])
    for _ in range(numVars):
        regressor_OLS = OLS(y, x).fit()
        print(regressor_OLS.summary())

        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))
        # Written as "not >" so a NaN p-value stops the elimination instead
        # of deleting a column.
        if not pvalues[worst] > sl:
            # Nothing left to remove; further refits would be identical.
            break

        # Remove exactly one column per fit. Scanning on after a deletion
        # would compare stale p-values against shifted column indices and
        # could drop the wrong column when p-values tie.
        x = np.delete(x, worst, 1)
    return x
def set_score_regression(cards, metadata, score_columns=score_column_names):
    """
    Perform a linear regression to determine the degree to which each game
    set's action cards contribute to a good score.

    :param cards: DataFrame with three-level columns; the set name is the
        middle level (it is selected with ``pd.IndexSlice[:, s, :]``).
    :param metadata: DataFrame holding the score columns.
    :param score_columns: names of the `metadata` columns averaged per row
        to form the response.
    :return: None. The regression summary is printed.
    """
    sets = set(cards['action'].columns.get_level_values(0))

    scores = np.mean(metadata.loc[:, tuple(score_columns)], axis=1)

    # Ignore missing cells
    refine_idx = np.isfinite(scores)
    scores = scores[refine_idx]

    # One regressor per set: the row-wise sum over that set's columns.
    set_counts = pd.concat([
        pd.DataFrame(
            np.sum(cards.loc[refine_idx, pd.IndexSlice[:, s, :]], axis=1),
            columns=[s])
        for s in sets
    ], axis=1).fillna(0)

    results = OLS(scores, set_counts).fit()
    # print() with a single argument behaves the same on Python 2 and 3;
    # the former print statement is a syntax error on Python 3.
    print(results.summary())
def linear(X, y):
    """
    Fit a multiple linear regression, print its summary and return the fit.

    :param X: x variables (used as given; no constant column is added here)
    :param y: y variable
    :return: the fitted OLS results object

    >>> db = pd.read_csv("Variables_for_analysis.csv")# doctest:+ELLIPSIS
    >>> pov = db["poverty"] / 100# doctest:+ELLIPSIS
    >>> un = db["unemployment"] / 100# doctest:+ELLIPSIS
    >>> edu = db["High school or higher"] / 100# doctest:+ELLIPSIS
    >>> xv1 = pd.DataFrame([edu, pov, un]).T# doctest:+ELLIPSIS
    >>> yv = pd.DataFrame(np.log10(db["rate"]))# doctest:+ELLIPSIS
    >>> linear(xv1, yv) # doctest:+ELLIPSIS
    OLS Regression Result...
    <statsmodels.regression.linear_model.RegressionResultsWrapper object at ...>
    """
    fitted = OLS(y, X).fit()
    report = fitted.summary()
    print(report)
    return fitted
def compute_LR(feat_df, label_col='netRunTime'):
    """Fit an OLS model of `label_col` on every other column of `feat_df`.

    The model is fitted with ``cov_type='HC3'``.

    Returns:
        A ``(model, res)`` pair: the fitted results object and its summary.
    """
    target = feat_df[label_col]
    is_predictor = feat_df.columns != label_col
    predictors = feat_df.loc[:, is_predictor]

    fitted = OLS(target, predictors).fit(cov_type='HC3')
    return fitted, fitted.summary()
# Correlation of each candidate predictor with travel_time; element [1] of
# the pearsonr result is what the preceding print labels as the p-value.
# NOTE(review): these bare expressions are only displayed by a notebook; run
# as a plain script their values are computed and discarded.
print("p-value (corr. with num. deliveries):")
stats.pearsonr(num_deliveries, travel_time)[1]

# In[15]:

print("p-value (correlation with gas price):")
stats.pearsonr(gas_price, travel_time)[1]

# In[59]:

# Simple linear regression with miles_traveled as x and travel_time as y.
slope, intercept, r, p, stderr = stats.linregress(miles_traveled, travel_time)

# In[60]:

stderr

# In[62]:

plot_regression_line(miles_traveled, travel_time, 'r')

# In[88]:

# NOTE(review): assuming OLS is statsmodels' OLS(endog, exog), this fits
# miles_traveled as the response on travel_time, with no intercept column -
# the opposite roles from the linregress call above. Confirm this is intended.
b = OLS(miles_traveled, travel_time).fit()

# In[89]:

b.summary()

# In[ ]:
predictors = list() with open('temp_relevant_features_2.txt', 'r') as f: predictors = [line.strip() for line in f] # print(predictors) df = df[['Temp (°C)'] + predictors] X = add_constant(df[predictors]) Y = df['Temp (°C)'] # print(x.ix[:5, :5]) alpha = 0.05 model = OLS(Y, X).fit() results = model.summary() results_as_html = results.tables[1].as_html() results_df = read_html(results_as_html, header=0, index_col=0)[0].T results_fit = False while not results_fit: results_fit = True max_p_val = alpha max_key = None for x in tqdm(results_df.columns[1:]): # if results_df[x]['P>|t|'] > max_p_val: # max_p_val = results_df[x]['P>|t|'] # max_key = x # results_fit = False if results_df[x]['P>|t|'] > alpha: # max_p_val = results_df[x]['P>|t|'] # max_key = x
print('stdev of real earnings growth = ', np.std(growth))
IDY = TR[W:] - growth  # implied dividend yield
# cumulative implied dividend yield, after detrending it becomes heat measure
cumIDY = np.append(np.array([0]), np.cumsum(IDY))

# graphs of ACF and QQ for real earnings growth terms
plot_acf(growth)
plt.show()
qqplot(growth, line='s')
plt.show()

# main regression: IDY on a constant, a linear trend and the lagged
# cumulative implied dividend yield
DF = pd.DataFrame({'const': 1, 'trend': range(T - 1), 'Bubble': cumIDY[:-1]})
Regression = OLS(IDY, DF).fit()
print(Regression.summary())
coefficients = Regression.params
# Unpack by position from a plain array, in the column order of DF
# (const, trend, Bubble). Integer indexing such as coefficients[0] on a
# label-indexed Series is deprecated in pandas.
intercept, trendCoeff, heatCoeff = np.asarray(coefficients)
avgIDY = trendCoeff / abs(heatCoeff)
print('avgIDY = ', avgIDY)
avgHeat = (intercept - avgIDY) / abs(heatCoeff)
print('long-term average heat measure = ', avgHeat)
Heat = cumIDY - avgIDY * np.arange(T)  # Heat measure
plt.figure(figsize=(7, 6))
plt.plot(range(NEW, LAST), Heat)
print('current heat measure = ', Heat[-1])
plt.title('Heat measure')
plt.show()
# 读取数据 print(f"读取数据。。{vt_symbols}") df = load_portfolio_data(vt_symbols, start, end) print(df) ## 绘制两个标的原始价格图表 run_plotly(vt_symbols) # 执行回归分析 最小二乘法 前面是y 后面是 x y = ax + b # 使用np isnan 和isinf 来处理空值 df[np.isnan(df)] = 0 df[np.isinf(df)] = 0 print(df) result = OLS(df[vt_symbols[0]], df[vt_symbols[-1]]).fit() print(result.summary()) coef = 0.9994 # 对残差绘图 df["spread"] = df[vt_symbols[0]] - coef * df[vt_symbols[-1]] fig = go.Figure() line = go.Scatter(x=df.index, y=df["spread"], mode='lines', name="Spread") fig.add_trace(line) fig.show() # 执行协整检验 # p-value如果小于0.05,则可以明确证明协整关系,但在实践中非常少见。 # 价差整体上还是存在大量的均值偏移情况,但只要震荡回归的次数足够多,即使不满足协整也能通过交易盈利。 score, pvalue, _ = coint(df[vt_symbols[0]], df[vt_symbols[-1]])
columns=['single_genre'], prefix=['single_genre']) print(movies1) movies1.to_csv('regression.csv', index=False) #print(movies1.columns) # ['budget', 'genres', 'id', 'original_language', 'popularity', # 'production_countries', 'release_date', 'revenue', 'runtime', 'status', # 'title', 'vote_average', 'holiday', 'single_genre_Action', # 'single_genre_Adventure', 'single_genre_Animation', # 'single_genre_Comedy', 'single_genre_Crime', 'single_genre_Documentary', # 'single_genre_Drama', 'single_genre_Family', 'single_genre_Fantasy', # 'single_genre_Foreign', 'single_genre_History', 'single_genre_Horror', # 'single_genre_Music', 'single_genre_Mystery', 'single_genre_Romance', # 'single_genre_Science Fiction', 'single_genre_TV Movie', # 'single_genre_Thriller', 'single_genre_War', 'single_genre_Western'] #Run regression analysis dv = movies1['revenue'] iv = movies1[['budget']] movies1_regression = OLS(dv.astype(float), iv.astype(float)).fit() #Print your regression result print(movies1_regression.summary()) print(movies1['budget'].describe()) #print(movies1['budget'].astype(int).describe()) #pd.set_option('display.max_columns', None) #print(movies1.describe()) #print(homework_data['comp_sent_score'].describe())
# Convert both processor columns to numeric before modelling.
data["proccessor"] = to_numeric(data["proccessor"])
data["proccessor_turbo"] = to_numeric(data["proccessor_turbo"])
#print(data.info())

# Predictors and response.
x = data[["size", "proccessor", "proccessor_turbo", "ram", "hdd"]]
y = data["price"]

# Fit with linear_model.LinearRegression and report the intercept, the
# coefficients and the score on the same data used for fitting.
regr = linear_model.LinearRegression()
regr.fit(x, y)
print("Intercept: ", regr.intercept_)
print("Coeff: ", regr.coef_)
print("Score: ", regr.score(x, y))

# Feature values of the new item to price, in the column order of x.
new_size = 15.6
new_proccessor = 1.6
new_proccessor_turbo = 3.9
new_ram = 12
new_hdd = 1250
predicted = regr.predict(
    [[new_size, new_proccessor, new_proccessor_turbo, new_ram, new_hdd]])
print("Predicted: ", predicted)

# Same regression through OLS; the leading 1 in the prediction row matches
# the constant column that add_constant puts in front of x.
x = add_constant(x)
model = OLS(y, x).fit()
# NOTE(review): this second prediction overwrites `predicted` and is never
# printed in the visible code.
predicted = model.predict(
    [[1, new_size, new_proccessor, new_proccessor_turbo, new_ram, new_hdd]])
print(model.summary())
def get_coeffs_from_ols(self, a, b):
    """Fit an OLS model with `a` as the response and `b` as the regressors.

    The model is built with ``missing='none'`` and is told that `b` already
    contains a constant. `hasconst` is documented as None or bool, so True is
    passed rather than a truthy string.

    :param a: response (endog) values.
    :param b: regressor (exog) values.
    """
    model = OLS(a, b, missing='none', hasconst=True).fit()
    # NOTE(review): the summary is built but neither printed nor returned in
    # the code visible here.
    model.summary()
import numpy as np
from statsmodels.api import add_constant, OLS, WLS
import matplotlib.pyplot as plt

# (x, y) is the set of observations.  w contains precomputed weights; we'll
# also compute these weights in this script.
x, y, w = np.loadtxt('draper_smith_table9p1.txt', unpack=True)

X = add_constant(x, prepend=True)

# --- OLS ---------------------------------------------------------------

# Ordinary least squares fit.
ols_result = OLS(y, X).fit()
# print() with a single argument behaves the same on Python 2 and 3; the
# former print statement is a syntax error on Python 3.
print(ols_result.summary())

# Make a plot of the OLS residuals vs y and vs x.
# The following recreates Fig. 9.1.
plt.figure(1)
plt.clf()
plt.subplot(2, 1, 1)
plt.plot(ols_result.fittedvalues, ols_result.resid, 'bo')
plt.title("OLS Residuals versus fitted values")
plt.xlabel('y')
plt.ylabel('e')
plt.grid()
plt.subplot(2, 1, 2)
plt.plot(x, ols_result.resid, 'bo')
plt.title("OLS Residuals versus x")
plt.xlabel("x")
def sumario(X, Y):
    '''Print the fuller regression report from statsmodels.api.

    A constant column is added to X before Y is regressed on it.
    '''
    design = add_constant(X)
    fitted = OLS(Y, design).fit()
    print(fitted.summary())
# Build, select features for, and score the backward-selection model.
backward_model = linearModel(nba_data)
backward_model.feature_selection('backward')
backward_model.predict()
print('Backward model R Squared value: ' + str(backward_model.r_squared))

# Pick the best model based on the R squared value
models = {
    'full model': model.r_squared,
    'forward_model': forward_model.r_squared,
    'backward_model': backward_model.r_squared
}
best = max(models, key=models.get)
print('Based on the R Squared metric, the ' + str(best) +
      ' is the best choice.')

# Print out the statistical summary for the best model
# NOTE(review): forward_model is used here regardless of which key `best`
# holds - confirm that is intended.
new_model = OLS(forward_model.Y_train, forward_model.X_train).fit()
print(new_model.summary())

# Add data column for predicted salaries from model:
# NOTE(review): assumes the fitted parameters are, in order, the coefficients
# of '2P', 'AST' and 'BLK' with no intercept - verify against
# forward_model.X_train.
nba_data['Predicted_Salary'] = nba_data['2P'] * new_model.params[0] + nba_data[
    'AST'] * new_model.params[1] + nba_data['BLK'] * new_model.params[2]
nba_data['Predicted_Salary']
nba_data['Salary_Residual'] = nba_data['salary_float'] - nba_data[
    'Predicted_Salary']

# NOTE(review): the two sort_values results below are not assigned, so they
# are only visible when evaluated as notebook cells.
#NBA data sorted in order of most undervalued players:
nba_data.sort_values('Salary_Residual', ascending=True)

#NBA data sorted in order of most overvalued players:
nba_data.sort_values('Salary_Residual', ascending=False)
# Derive the month from each date and reshape to one row per year.
mean["Month"] = date.map(lambda x: x.month)
mean_long = mean.pivot(
    index="Year",
    columns="Month"
)
# Relabel the pivoted columns 1..12 (assumes exactly 12 resulting columns).
mean_long.columns = range(1, 13)

import patsy
from statsmodels.api import OLS

# Regress Mean on B-spline bases (second argument 5) of Year and of Month.
y, X = patsy.dmatrices("Mean ~ bs(Year, 5) + bs(Month, 5)", data=mean)
model = OLS(y, X).fit()
model.summary()

# Store the in-sample predictions, then rename all four columns.
mean["Pred"] = model.predict()
mean.columns = ['Mean', 'Year', 'Month', 'Fitted mean']

# Month-by-year tables of the observed mean and of the original row index.
m_long = mean.pivot(index="Month", columns="Year", values="Mean")
d_long = mean.reset_index().pivot(index="Month", columns="Year", values="index")

# Default line colors: 12 evenly spaced samples of the coolwarm colormap.
color = plt.cm.coolwarm(np.linspace(0.1, 0.9, 12))
mpl.rcParams['axes.prop_cycle'] = cycler.cycler('color', color)
# 2. double sample: Perform double sampling n = 67 n_i = 2*n # this is n' phase_1_sample = stock.sample(n=n_i) phase_2_sample = phase_1_sample.sample(n=n) print('First Phase Sample:',phase_1_sample.head()) print('Second Phase Sample:',phase_2_sample.head()) # 3. Regression analysis x~y -> vwap ~ changeOverTime: Perform a diagnostic analysis to determine if x and y have a linear # relationship and fitted line goes through the origin based on the sample data. Do regression analysis y ∼ x. from statsmodels.api import OLS x = phase_2_sample[['vwap']] y = phase_2_sample[['changeOverTime']] reg = OLS(y, x).fit() reg.summary() yi_sum = phase_2_sample['changeOverTime'].sum() yi_sum xi_sum = phase_2_sample['vwap'].sum() xi_sum r = yi_sum / xi_sum r print('ratio estimator (r) =', r) # 5. estimate your parameter of interest by ratio estimator: Estimate your parameter of interest by # ratio estimator. Estimate its variance and standard deviation. N = 368 t_hat_x = N/n_i*xi_sum t_hat_x t_hat_r = r*t_hat_x
LogStarAge = np.array([
    1.58103844, 1.06471074, 2.39789527, 0.72754861, 0.55675456, 1.91692261,
    1.64865863, 1.38629436, 0.77472717, 1.36097655, 0., 1.80828877, 1.7837273,
    0.64185389, 0.69813472, 2.39789527, -0.35667494, 1.79175947, 1.90210753,
    1.39624469, 1.84054963, 2.19722458, 1.89761986, 1.84054963, 0.74193734,
    0.55961579, 1.79175947, 0.91629073, 2.17475172, 1.36097655
])

N = 30
# Explicit column of ones used as the intercept regressor.
# NOTE(review): the length is written as 30 rather than N.
intercept = np.ones(30)
# ,columns=['interc','radius','orbit','metal','mass','age']
df = pd.DataFrame({
    "intercept": intercept,
    "radius": LogPlanetRadius,
    "orbit": LogPlanetOrbit,
    "metal": StarMetallicity,
    "mass": LogStarMass,
    "age": LogStarAge
})
# NOTE(review): y is not used by the code visible here.
y = pd.DataFrame([LogPlanetMass])

# NOTE(review): df already carries an explicit intercept column; if
# LinearRegression also fits its own intercept (its usual default), the two
# are redundant - confirm.
regr = linear_model.LinearRegression()
regr.fit(df, LogPlanetMass)
print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# Same regression through OLS, which uses df (intercept column included)
# exactly as given.
regols = OLS(LogPlanetMass, df).fit()
regols.summary()
# [0.15379, 1.402, -0.140, -1.5995, -0.956, -0.4617]
sns.heatmap(pearson_coefficeint,annot=True) #REGPLOT for highly correlated variables sns.regplot(x= "bid_price",y="ask_price", data = dfquote_s) #%% '''REGRESSION ANALYSIS''' x = dfquote[['bid_price','bid_size']] y = dfquote['ask_price'] x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3) LR=LinearRegression() LR.fit(x_train, y_train) y_pred=LR.predict(x_test) r2_score(y_test,y_pred) #0.9994800886579274 mod1=OLS(y,x).fit() print(mod1.summary()) OLS Regression Results ======================================================================================= Dep. Variable: ask_price R-squared (uncentered): 1.000 Model: OLS Adj. R-squared (uncentered): 1.000 Method: Least Squares F-statistic: 2.848e+09 Date: Fri, 25 Sep 2020 Prob (F-statistic): 0.00 Time: 15:26:32 Log-Likelihood: -8.0572e+06 No. Observations: 2158864 AIC: 1.611e+07 Df Residuals: 2158862 BIC: 1.611e+07 Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ bid_price 1.0124 1.35e-05 7.52e+04 0.000 1.012 1.012
# Formula model: log(ADCResponse) on log(Intensity) plus the interaction of
# log(IntensitySD) with the categorical Z, fitted on `lo`.
mod = smf.ols(formula='np.log(ADCResponse) ~ np.log(Intensity) + np.log(IntensitySD)*C(Z)', data=lo)
res = mod.fit()
res.summary()

# Fitted-minus-observed differences (log scale) against log(y).
plt.scatter(np.log(y), res.fittedvalues - np.log(y))
plt.show()

# Each y value plotted against its neighbour in sequence.
plt.scatter(y[1:], y[:-1])
plt.show()

# Plain OLS of y on every column of X, used as given.
lm = OLS(y, X).fit()
lm.summary()
X.columns

# Add log-intensity as a regressor and inspect the distribution of Z.
X['logI'] = np.log(X.Intensity)
Counter(X.Z)

# log(y) on Mass and logI only; no constant column is added here.
lm = OLS(np.log(y), X[['Mass','logI']]).fit()
lm.summary()
plt.scatter(np.log(y), lm.fittedvalues-np.log(y))
plt.show()