def _estimator_relationship(self): evaluation_dataframe = pd.DataFrame( 0.0, index=self.feature_value_dataframe.columns, columns=self.delta_dataframe.columns) p_value_dict = {} if self.estimator == 'correlation': for delta_col in self.delta_dataframe.columns: for feature_col in self.feature_value_dataframe.columns: correlation, pvalue = spearmanr( self.feature_value_dataframe[feature_col], self.delta_dataframe[delta_col], nan_policy='omit') if pd.isnull(correlation): correlation = 0 pvalue = 1 evaluation_dataframe.at[feature_col, delta_col] = correlation p_value_dict[(feature_col, delta_col)] = pvalue elif self.estimator == 'regression': X = self.feature_value_dataframe.values for delta_col in self.delta_dataframe.columns: y = self.delta_dataframe[delta_col].values ols = linear_model.LinearRegression() ols.fit(X, y) evaluation_dataframe[delta_col] = ols.coef_ for feature_col, p_val in zip( self.feature_value_dataframe.columns, stats.coef_pval(ols, X, y)): p_value_dict[(feature_col, delta_col)] = p_val return evaluation_dataframe, p_value_dict
def BAYESIAN(x: np.ndarray, y: np.ndarray) -> Tuple[float, float, float]:
    """Fit a Bayesian ridge regression of ``y`` on ``x``.

    Parameters
    ----------
    x : numpy.ndarray
        Predictor matrix passed to ``BayesianRidge.fit``.
    y : numpy.ndarray
        Target values passed to ``BayesianRidge.fit``.

    Returns
    -------
    tuple
        ``(m, q, p)``: the first fitted coefficient, the fitted intercept,
        and ``1 - mean(p-values)`` over all terms reported by
        ``stats.coef_pval`` (NaN p-values ignored).  ``p`` is ``nan`` when
        computing the p-values raises ``numpy.linalg.LinAlgError``.
    """
    clf = BayesianRidge()
    clf.fit(x, y)
    m, q = clf.coef_[0], clf.intercept_
    # This is not an actual probability, but it should be interpretable as one.
    try:
        p = 1 - np.nanmean(stats.coef_pval(clf, x, y))
    except np.linalg.LinAlgError:
        p = np.nan
    return m, q, p
def _modified_regressor_summary(clf, X, y, xlabels=None): """ Output summary statistics for a fitted regression model. Parameters ---------- clf : sklearn.linear_model A scikit-learn linear model classifier with a `predict()` method. X : numpy.ndarray Training data used to fit the classifier. y : numpy.ndarray Target training values, of shape = [n_samples]. xlabels : list, tuple The labels for the predictors. """ # Check and/or make xlabels ncols = X.shape[1] if xlabels is None: xlabels = np.array( ['x{0}'.format(i) for i in range(1, ncols + 1)], dtype='str') elif isinstance(xlabels, (tuple, list)): xlabels = np.array(xlabels, dtype='str') # Make sure dims of xlabels matches dims of X if xlabels.shape[0] != ncols: raise AssertionError( "Dimension of xlabels {0} does not match " "X {1}.".format(xlabels.shape, X.shape)) # Create data frame of coefficient estimates and associated stats coef_df = pd.DataFrame( index=['_intercept'] + list(xlabels), columns=['Estimate', 'Std. Error', 't value', 'p value'] ) coef_df['Estimate'] = np.concatenate( (np.round(np.array([clf.intercept_]), 6), np.round((clf.coef_), 6))) coef_df['Std. Error'] = np.round(stats.coef_se(clf, X, y), 6) coef_df['t value'] = np.round(stats.coef_tval(clf, X, y), 4) coef_df['p value'] = np.round(stats.coef_pval(clf, X, y), 6) # Create data frame to summarize residuals resids = stats.residuals(clf, X, y, r_type='raw') resids_df = pd.DataFrame({ 'Min': pd.Series(np.round(resids.min(), 4)), '1Q': pd.Series(np.round(np.percentile(resids, q=25), 4)), 'Median': pd.Series(np.round(np.median(resids), 4)), '3Q': pd.Series(np.round(np.percentile(resids, q=75), 4)), 'Max': pd.Series(np.round(resids.max(), 4)), }, columns=['Min', '1Q', 'Median', '3Q', 'Max']) return resids_df, coef_df, {'R2': stats.metrics.r2_score(y, clf.predict(X)), 'Adj R2': stats.adj_r2_score(clf, X, y), 'F-statistic': stats.f_stat(clf, X, y)}
def coefficient_picks(self, coeff, type_of_reg):
    """Print which predictors a fitted regression kept, with their p-values.

    Parameters
    ----------
    coeff : array-like
        Fitted coefficients; a predictor counts as "picked" when its
        coefficient is non-zero after conversion to float.
    type_of_reg : str
        Key into ``self.regr_dict`` identifying the fitted model; also used
        as the label in the printed report.

    The p-values are computed with ``stats.coef_pval`` on
    ``self.X_test`` / ``self.y_test``; the leading (intercept) entry is
    dropped so the remaining values line up with the coefficients.
    """
    print("-----------------" + type_of_reg + "-----------------")

    # Compute the selection mask once.  The original wrapped the comparison
    # itself in np.abs (misplaced parenthesis) and rebuilt the mask three
    # times; the reported numbers are the same.
    picked = coeff.astype(float) != 0
    num_of_nonzero = np.count_nonzero(picked)
    num_of_zero = coeff.size - num_of_nonzero
    print(type_of_reg + " picked " + str(num_of_nonzero) +
          " variables and eliminated the other " + str(num_of_zero) +
          " variables")

    predictor_p_values = stats.coef_pval(
        self.regr_dict[type_of_reg], self.X_test, self.y_test)[1:]
    print(
        "It picked these columns with these p-values \n",
        pd.concat([
            pd.DataFrame(self.X.columns[picked]),
            pd.DataFrame(predictor_p_values[picked])
        ], axis=1))
def LeastR(bt, PID, count):
    """Run one Lasso update of the coefficient estimate for the given patients.

    Reads the module-level tables ``DrugUse``, ``LabTest``, ``Xava`` and
    ``Yava`` for the rows selected by ``PID``, centres predictors and
    response with the ``Xbar`` / ``Ybar`` values mapped through the matrix
    returned by ``buildZ``, subtracts standard-normal noise from the
    centred response, and fits a 5-fold ``LassoCV`` with a single fixed
    alpha.

    Parameters
    ----------
    bt : numpy.ndarray
        Running coefficient estimate (column vector) from previous calls.
    PID : sequence
        Index labels selecting the rows to use; ``len(PID)`` fixes the
        second dimension handed to ``buildZ``.
    count : int
        Number of estimates already folded into ``bt``.

    Returns
    -------
    bt_new : numpy.ndarray
        Running average ``(bt * count + new_coef) / (count + 1)``.
    p_value : numpy.ndarray
        Column vector of the predictors' p-values from ``stats.coef_pval``
        (the leading intercept entry is dropped).
    """
    X = DrugUse.loc[PID][DrugUse.columns]
    # Number of rows per index label, ordered by label.
    # Index.value_counts replaces the deprecated top-level pd.value_counts.
    nn = X.index.value_counts().to_frame().sort_index().values
    X = X.values

    Y = np.array(LabTest.loc[PID]['Lab Test Value']).reshape(-1, 1)
    Xbar = Xava.loc[PID].values
    Ybar = Yava.loc[PID].values.reshape(-1, 1)

    # Random perturbation of the response; drawn fresh on every call.
    t = np.random.randn(Y.shape[0], 1)
    Z = buildZ(nn, (Y.shape[0], len(PID)))
    # (A differencing variant based on a `buildD` matrix was sketched here
    # in comments but never enabled.)
    delta = X - Z.dot(Xbar)
    Phi = (Y - Z.dot(Ybar) - t).reshape(-1)

    reg = linear_model.LassoCV(alphas=[0.0039], cv=5)
    reg.fit(delta, Phi)
    bt_new = reg.coef_.reshape(-1, 1)
    p_value = stats.coef_pval(reg, delta, Phi)[1:].reshape(-1, 1)

    # Fold the new coefficients into the running mean.
    bt_new = (bt * count) / (count + 1) + bt_new / (count + 1)
    return bt_new, p_value
def get_coefficients(data: pd.DataFrame):
    """Regress the 1-day-lagged cost on three Facebook Ads action counts.

    Per ``campaign_name`` (rows ordered by ``date``) the ``common_cost``
    column is shifted by 7, 1, 2 and 3 rows; rows lacking any of these lags
    are dropped, and an ordinary least squares model of ``data_lagged1`` on
    the three action columns is fitted.  Intercept, coefficients and their
    p-values are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``date``, ``campaign_name``, ``common_cost`` and the
        ``facebookads_actions_*`` columns selected below.  The frame is not
        modified; the work happens on a copy.

    Returns
    -------
    dict
        Maps each of the three action column names to its fitted
        coefficient.
    """
    feature_cols = [
        'facebookads_actions_post_reaction',
        'facebookads_actions_landing_page_view',
        'facebookads_actions_link_click',
    ]

    # Work on a copy so the caller's frame does not silently gain columns.
    d3 = data.copy()

    # The lags only differ by the shift distance, so sort and group once.
    cost_by_campaign = (d3.sort_values(by=['date'], ascending=True)
                        .groupby(['campaign_name'])['common_cost'])
    for lag in (7, 1, 2, 3):
        d3['data_lagged{}'.format(lag)] = cost_by_campaign.shift(lag)

    d3 = d3.dropna(axis=1, how='all')
    d3 = d3.dropna(subset=['data_lagged7', 'data_lagged1',
                           'data_lagged2', 'data_lagged3'])
    d_final = d3[['campaign_name', 'date', 'data_lagged1', 'data_lagged7',
                  'data_lagged2', 'facebookads_actions_post_reaction',
                  'facebookads_actions_landing_page_view',
                  'facebookads_actions_link_click',
                  'facebookads_actions_leadgen_other']]

    # Select predictors explicitly and in a fixed order.  The previous
    # `data.columns & [...]` relied on an Index set operation that newer
    # pandas no longer supports, and it did not guarantee the column order
    # that the coefficient-to-name mapping below depends on.
    X = d_final[feature_cols]
    Y = d_final['data_lagged1']

    regr = linear_model.LinearRegression()
    regr.fit(X, Y)

    print('Intercept: \n', regr.intercept_)
    print('Coefficients: \n', regr.coef_)

    from regressors import stats
    print("coef_pval:\n", stats.coef_pval(regr, X, Y))

    return dict(zip(feature_cols, regr.coef_))
# Average the leading Ridge p-values over repeated random train/test splits.
# The number of repetitions is named once so that the loop and the final
# averaging step cannot drift apart.
n_runs = 1000
sum_values = [0] * 10
for _ in range(n_runs):
    X_train, X_test = train_test_split(data, test_size=0.3)

    # Put the dependent variable into its own data structures
    y_train = X_train["Generation [kWh]"]
    y_test = X_test["Generation [kWh]"]

    # Remove the dependent variable from the X sets
    X_train = X_train.drop(columns="Generation [kWh]")
    X_test = X_test.drop(columns="Generation [kWh]")

    # NOTE(review): the `normalize` argument appears to have been removed
    # from Ridge in newer scikit-learn releases -- confirm the pinned version.
    ridge = Ridge(alpha=.005, normalize=True).fit(X_train, y_train)
    columns = list(X_train.columns)

    # Keep only as many p-values as there are accumulator slots.
    # NOTE(review): coef_pval seems to report the intercept first, so slot 0
    # would be the intercept rather than the first column -- confirm intent.
    p_vals = stats.coef_pval(ridge, X_train, y_train)[:len(sum_values)]
    for slot, p_val in enumerate(p_vals):
        sum_values[slot] += p_val

# Turn the accumulated sums into means (in place, same list object).
sum_values[:] = [total / n_runs for total in sum_values]
# Fit the current candidate model and print its score on the training,
# validation and out-of-sample splits.
regr = regression
regr.fit(train_X, train_y)
print(
    f'--------------------------------- {regression_name} ---------------------------------'
)
print('Train:', regr.score(train_X, train_y))
print('Val:', regr.score(val_X, val_y))
print('OOS:', regr.score(oos_X, oos_y))

from regressors import stats

# Coefficient summary computed on the out-of-sample split.
# (Training-split variant, kept for reference:
#  stats.summary(regr, train_X, train_y.squeeze(), train_X.columns.tolist()))
# NOTE(review): if stats.summary prints its report and returns None, the
# surrounding print() also emits a trailing "None" -- confirm.
print(
    stats.summary(regr, oos_X,
                  oos_y.squeeze().reshape(-1, 1), oos_X.columns.tolist()))

# Out-of-sample p-values as a Series labelled with the intercept followed by
# the predictor names, named after the current iteration step, and tagged
# with the model name before being appended as a new column of the summary.
p_vals = pd.Series(stats.coef_pval(regr, oos_X,
                                   oos_y.squeeze().reshape(-1, 1)),
                   index=['_intercept'] + X_cols,
                   name=out['iter_step'])
p_vals['Regression Model'] = regression_name
p_value_summary = pd.concat((p_value_summary, p_vals), axis=1)

# Coefficients are only collected for the model designated as "best".
best_model = 'LinearRegression'
if regression_name == best_model:
    # Stack intercept and coefficients into one flat vector, rounded to
    # six decimals, using the same labelling as the p-values above.
    coef = np.concatenate(
        (np.round(np.array([regr.intercept_]).reshape(-1, 1), 6),
         np.round((regr.coef_.reshape(-1, 1)), 6))).squeeze()
    coef = pd.Series(coef,
                     index=['_intercept'] + X_cols,
                     name=out['iter_step'])
    coef['Regression Model'] = regression_name
# Exploratory, notebook-style cell sequence: bare expressions such as
# `reg.coef_` only display a value when run interactively.
reg.score(Y, contents_value)

# Intercept-free OLS of contents value on dwelling type.
reg = LinearRegression(fit_intercept = False).fit(dwellings_type, contents_value)
reg.coef_
reg.score(dwellings_type, contents_value)

# Intercept-free OLS of contents value on current income only.
reg = LinearRegression(fit_intercept = False).fit(Y[['current_income']], contents_value)
reg.coef_
reg.score(Y[['current_income']], contents_value)

# Lasso with a near-zero penalty and positive=True, i.e. coefficients are
# constrained to be non-negative; no intercept.
lin = Lasso(alpha=0.0000000000001,precompute=True,max_iter=10000, positive=True, random_state=9999, selection='random', fit_intercept = False)
lin.fit(dwellings_type, contents_value)
lin.coef_

# NOTE(review): rr_scaled, X_train and Y_train are not defined in this
# fragment -- presumably left over from another cell; confirm.
stats.coef_pval(rr_scaled, X_train, Y_train)

# Define the Model
# Three alternative formulations are bound to `model` in turn; each
# assignment replaces the previous one, so only the last is used below.
model = lambda b, X: b[0] * X.iloc[:, 0] + b[1] * X.iloc[:, 1] + b[2] * X.iloc[:, 2]
model = lambda b, X: b[0] * X.iloc[:, 0] + b[1] * X.iloc[:, 1] + b[2] * X.iloc[:, 2] + b[3] * X.iloc[:, 3]
model = lambda b, X: b[0] * X.iloc[:, 0] * X.iloc[:, 3] + b[1] * X.iloc[:, 1] * X.iloc[:, 3] + b[2] * X.iloc[:, 2]

# Objective: sum of squared residuals between Y and the model output.
obj = lambda b, Y, X: np.sum(np.abs(Y-model(b, X))**2)

# Every parameter is bounded below by zero.  Four parameters are optimised
# although the active model only reads b[0]..b[2]; only those three are printed.
bnds = [(0, None), (0, None), (0, None), (0, None)]
xinit = np.array([0, 0, 0, 0])
res = minimize(obj, args=(contents_value, dwellings_type), x0=xinit, bounds = bnds)
print(f"b1={res.x[0]}, b2={res.x[1]}, b3={res.x[2]}")

import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn import linear_model
from regressors import stats

# Predictor columns.  Defined once and reused both to select the design
# matrix and to label the summary, so the two can never get out of sync.
xlabels = [
    'col', 'abert', 'aut', 'comp', 'conf', 'disp', 'freq', 'ident', 'visao',
    'val', 'op', 'org', 'rel'
]

data = pd.read_csv("engajamento.csv", sep=';')
X = data[xlabels]
y = np.array(data['eng'])

# Ridge fit (its results are not reported further down in this script).
reg = linear_model.Ridge(alpha=.5)
reg.fit(X, y)

# Ordinary least squares fit, used for the p-values and the summary.
ols = linear_model.LinearRegression()
ols.fit(X, y)

print("coef_pval:\n", stats.coef_pval(ols, X, y))

print("\n=========== SUMMARY ===========")
stats.summary(ols, X, y, xlabels)
def _fill_nan_with_column_mean(values):
    """Replace NaNs in a 2-D array, in place, with the NaN-ignoring column mean."""
    column_means = np.nanmean(values, axis=0)
    nan_rows, nan_cols = np.where(np.isnan(values))
    values[nan_rows, nan_cols] = np.take(column_means, nan_cols)
    return values


def alpha_check(strategy, index_ticker="BSESN"):
    """Estimate a strategy's alpha and beta against an index and store them.

    The strategy's excess returns are regressed on the index's excess
    returns (both relative to the module-level ``risk_free_rate``) with
    ordinary least squares.  Intercept (alpha), slope (beta) and their
    p-values are written to ``strategy`` and saved.

    Parameters
    ----------
    strategy :
        Strategy object whose ``StrategyReturns`` rows are analysed and on
        which ``alpha``, ``alpha_significance``, ``beta`` and
        ``beta_significance`` are set.  A falsy value is reported as an error.
    index_ticker : str
        Ticker of the index whose ``IndexDailyReturn`` rows are used.
        A falsy value is reported as an error.

    Returns
    -------
    dict
        Keys ``output``, ``message``, ``error``, ``error_message_list`` and
        ``success``.
    """
    error_message_list = []
    output = ""

    if strategy:
        strategy_returns = StrategyReturns.objects.filter(
            strategy=strategy).order_by('date')
    else:
        error_message_list.append("Strategy missing!")

    if index_ticker:
        index_returns = IndexDailyReturn.objects.filter(
            index__ticker=index_ticker).order_by('date')
    else:
        error_message_list.append("Index Code missing!")

    if error_message_list:
        return {
            'output': output,
            'message': "Function input incorrect",
            'error': True,
            'error_message_list': error_message_list,
            'success': False
        }

    df_strategy = pd.DataFrame(
        {
            'Date': [row.date for row in strategy_returns],
            'Return Strategy': [row.return_strategy
                                for row in strategy_returns]
        },
        columns=['Date', 'Return Strategy'])
    df_strategy['Excess Return Strategy'] = (
        df_strategy['Return Strategy'] - risk_free_rate)

    df_index = pd.DataFrame(
        {
            'Date': [row.date for row in index_returns],
            'Return Index': [row.return_1d for row in index_returns]
        },
        columns=['Date', 'Return Index'])
    df_index['Excess Return Index'] = (
        df_index['Return Index'] - risk_free_rate)

    # Left join: strategy dates without an index return get NaN, which is
    # imputed below.
    df_final = pd.merge(
        df_strategy,
        df_index[['Date', 'Return Index', 'Excess Return Index']],
        on='Date',
        how='left')

    # Copy so the in-place imputation never writes into (or is blocked by)
    # the DataFrame's own buffers.
    X = df_final['Excess Return Index'].to_numpy(copy=True).reshape(-1, 1)
    Y = df_final['Excess Return Strategy'].to_numpy(copy=True).reshape(-1, 1)

    # Impute each series with its own mean.  Previously the NaN positions of
    # Y were filled *into X*, corrupting the regressor and leaving the NaNs
    # in Y untouched.
    _fill_nan_with_column_mean(X)
    _fill_nan_with_column_mean(Y)

    regressor = LinearRegression()
    regressor.fit(X, Y)

    # NOTE(review): with a 2-D target these are numpy arrays rather than
    # plain floats -- confirm the fields on `strategy` accept that.
    beta = regressor.coef_
    alpha = regressor.intercept_

    # coef_pval reports the intercept first, then the slope.
    p_values = stats.coef_pval(regressor, X, Y)
    alpha_significance = p_values[0]
    beta_significance = p_values[1]

    strategy.alpha = alpha
    strategy.alpha_significance = alpha_significance
    strategy.beta = beta
    strategy.beta_significance = beta_significance
    strategy.save()

    return {
        'output': output,
        'message': "Alpha, Beta calculated",
        'error': False,
        'error_message_list': error_message_list,
        'success': True
    }
def calculate_p_values(self):
    """Return what ``stats.coef_pval`` computes for this object's model.

    The model is ``self.model``; ``self.params_df`` is passed as the
    predictor data and ``self.result_nd`` as the target values.
    """
    fitted_model = self.model
    predictors = self.params_df
    targets = self.result_nd
    return stats.coef_pval(fitted_model, predictors, targets)
trainy.append(x) #We are not using random sampling here. Since we have multiple seasons for NBA Teams #We are splitting the ten years of data to see how years 2009-2014 compare to 2014-19 #Then flip the script and see what happens to our fit. #Do any variables demonstrate temporal model fit differences? NBAX_train = NBA_x.iloc[trainx, :] NBAX_test = NBA_x.iloc[trainy, :] NBAY_train = NBA_y.iloc[trainx, :] NBAY_test = NBA_y.iloc[trainy, :] regressor = LinearRegression() regressor.fit(NBAX_train, NBAY_train) #training the algorithm ols = linear_model.LinearRegression() ols.fit(NBAX_train, NBAY_train) print("coef_pval:\n", stats.coef_pval(ols, NBAX_train, NBAY_train)) Data1 = [ stats.coef_tval(ols, NBAX_train, NBAY_train), stats.coef_pval(ols, NBAX_train, NBAY_train) ] Data2 = [ stats.coef_tval(ols, NBAX_test, NBAY_test), stats.coef_pval(ols, NBAX_test, NBAY_test) ] stats.adj_r2_score(ols, NBAX_train, NBAY_train) stats.adj_r2_score(ols, NBAX_test, NBAY_test) #To retrieve the intercept: print(regressor.intercept_) #For retrieving the slope: print(regressor.coef_)
)
# Alternative estimators that were tried at this point:
# tree.DecisionTreeClassifier(), LogisticRegression(), SVC(kernel="linear")

# Recursive feature elimination, removing one feature per step and scoring
# each subset with 2-fold stratified cross-validation.
rfecv = RFECV(estimator, step=1, cv=StratifiedKFold(2))
rfecv.fit(x_data, y_data)
print('number of features selected:', rfecv.n_features_)

# Reduce the data to the selected features and recover their names.
x_new = rfecv.transform(x_data)
selected_inds = rfecv.get_support(indices=True)
selected_ranks = rfecv.ranking_
selected_feats = [training_head[ind] for ind in selected_inds]
#print(selected_feats)
#print(rfecv.estimator_.coef_)

# p-values for the estimator refitted on the selected features.
# NOTE(review): stats.coef_pval appears to be designed for linear regression
# models -- confirm it is meaningful for the estimator used here.
pvals = stats.coef_pval(rfecv.estimator_, x_new, y_data)
#print(pvals)

# Cross-validated, L2-penalised logistic regression on the full feature set.
cl1 = LogisticRegressionCV(cv=10, penalty='l2', fit_intercept=True)
cl1.fit(x_data, y_data)
coefs = cl1.coef_
intercept = cl1.intercept_[0]
pvals_cur = stats.coef_pval(cl1, x_data, y_data)

# Header row: the intercept label followed by every feature name.
all_headers = []
all_headers.append('intercept')
all_headers.extend(training_head)