def get_z_LinearRegression(self, xo, yo):
    """Predict z at (xo, yo) with a distance-weighted least-squares fit of the knots."""
    print('linear regression')
    dist_sigma = 1000
    xx = self.Knots[:, 0]
    yy = self.Knots[:, 1]
    dd = np.sqrt((xx - xo)**2 + (yy - yo)**2)
    print("dd", dd)
    exponent = -(dd**2) / (2 * (dist_sigma**2))
    print("exponent", exponent)
    weights = np.exp(exponent)
    print("weights", weights)
    X = self.Knots[:, 0:2]
    X = sm.add_constant(X)
    y = self.Knots[:, 2]
    mod_wls = sm.WLS(y, X, weights=weights)
    res_wls = mod_wls.fit()
    print(res_wls.summary())
    # build a single prediction row; has_constant='add' forces the intercept column
    # even though the one-row input looks "constant" to add_constant
    p = np.array([[xo, yo]], dtype=X.dtype)
    p = sm.add_constant(p, has_constant='add')
    z = res_wls.predict(p)
    print("z", z)
    return z[0]
def detailedMultipleRegression(y, x):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results  # WARNING: coefficients are not in the original order! (an order-preserving variant is sketched below)
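# A minimal order-preserving sketch (not from the original code; the helper name
# multiple_regression is hypothetical): stacking all regressors in one call keeps
# the fitted coefficients in the same order as the inputs, i.e. [const, x[0], x[1], ...].
import numpy as np
import statsmodels.api as sm

def multiple_regression(y, x):
    # x: sequence of 1-D regressor arrays; column_stack preserves their order and
    # add_constant prepends the intercept column.
    X = sm.add_constant(np.column_stack(x))
    return sm.OLS(y, X).fit()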
def test_plot_influence(self, close_figures): infl = self.res.get_influence() fig = influence_plot(self.res) assert_equal(isinstance(fig, plt.Figure), True) # test that we have the correct criterion for sizes #3103 try: sizes = fig.axes[0].get_children()[0]._sizes ex = sm.add_constant(infl.cooks_distance[0]) ssr = sm.OLS(sizes, ex).fit().ssr assert_array_less(ssr, 1e-12) except AttributeError: import warnings warnings.warn('test not compatible with matplotlib version') fig = influence_plot(self.res, criterion='DFFITS') assert_equal(isinstance(fig, plt.Figure), True) try: sizes = fig.axes[0].get_children()[0]._sizes ex = sm.add_constant(np.abs(infl.dffits[0])) ssr = sm.OLS(sizes, ex).fit().ssr assert_array_less(ssr, 1e-12) except AttributeError: pass assert_raises(ValueError, influence_plot, self.res, criterion='unknown')
def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_set=None): """Report error rate on test_doc sentiments, using supplied model and train_docs""" train_targets, train_regressors = \ zip(*[( doc.sentiment, test_model.docvecs[doc.tags[0]] ) for doc in train_set ]) train_regressors = sm.add_constant(train_regressors) predictor = logistic_predictor_from_data(train_targets, train_regressors) test_data = test_set if infer: #if infer_subsample < 1.0: # test_data = sample(test_data, # int(infer_subsample * len(test_data))) #test_regressors = [test_model.infer_vector(doc.words, # steps=infer_steps, alpha=infer_alpha) # for doc in test_data] test_data = [SentimentDocument(None, None, None, s) for (v, s) in infer_set] test_regressors = [v for (v, s) in infer_set] else: test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_set] test_regressors = sm.add_constant(test_regressors) # predict & evaluate test_predictions = predictor.predict(test_regressors) corrects = ( sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data] )) errors = len(test_predictions) - corrects error_rate = float(errors) / len(test_predictions) return (error_rate, errors, len(test_predictions), predictor)
def regression(json_data, bandwidth):
    latency = []
    rtt_by_size = []
    # RTT object
    # rtt = {[avg_rtt1]: [rtt1, rtt2, rtt3, ..., rttx],
    #        [avg_rtt2]: [...]}
    for i in range(0, len(json_data)):
        latency.append(json_data[i]["latency"])
        rtt_by_size.append(json_data[i]["size"] * json_data[i]["rtt"])

    y = np.array(bandwidth).astype(float)
    z = np.array(latency).astype(float)
    r = np.array(rtt_by_size).astype(float)

    data = np.array([rtt_by_size, y])
    ones = np.ones(len(data[0]))
    X = sm.add_constant(np.column_stack((data[0], ones)))
    for ele in data[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(z, X).fit()
    print(results.summary())
def overfit_stocks():
    # Load one year's worth of pricing data for five different assets
    start = datetime.datetime(2013, 1, 1)
    end = datetime.datetime(2014, 1, 1)
    x1 = get_pricing('PEP', fields='price', start_date=start, end_date=end)
    x2 = get_pricing('MCD', fields='price', start_date=start, end_date=end)
    x3 = get_pricing('ATHN', fields='price', start_date=start, end_date=end)
    x4 = get_pricing('DOW', fields='price', start_date=start, end_date=end)
    y = get_pricing('PG', fields='price', start_date=start, end_date=end)

    # Build a linear model using only x1 to explain y
    slr = regression.linear_model.OLS(y, sm.add_constant(x1)).fit()
    slr_prediction = slr.params[0] + slr.params[1] * x1

    # Run multiple linear regression using x1, x2, x3, x4 to explain y
    mlr = regression.linear_model.OLS(y, sm.add_constant(np.column_stack((x1, x2, x3, x4)))).fit()
    mlr_prediction = (mlr.params[0] + mlr.params[1] * x1 + mlr.params[2] * x2
                      + mlr.params[3] * x3 + mlr.params[4] * x4)

    # Compute adjusted R-squared for the two different models
    print('SLR R-squared: %.5f' % slr.rsquared_adj)
    print('SLR p-value: %.5f' % slr.f_pvalue)
    print('MLR R-squared: %.5f' % mlr.rsquared_adj)
    print('MLR p-value: %.5f' % mlr.f_pvalue)

    # Plot y along with the two different predictions
    y.plot()
    slr_prediction.plot()
    mlr_prediction.plot()
    plt.ylabel('Price')
    plt.xlabel('Date')
    plt.legend(['PG', 'SLR', 'MLR'])
def make_g_model(daily_results, daily_projections):
    daily_results_common = unify_dfs(daily_results)
    dfm = create_master(daily_results_common)
    dfm['NF'] = dfm['NF'].astype(float)
    dfm = eliminate_zeros(dfm)
    X = pd.get_dummies(dfm[['Salary', 'RG', 'NF', 'RW', 'POS', 'Depth']])
    X = pd.concat([X.drop('Depth', axis=1), pd.get_dummies(X['Depth'])], axis=1)
    if 'POS_' in X.columns:
        X.drop('POS_', axis=1, inplace=True)
    # if 3 in X.columns:
    #     X.drop(3, axis=1, inplace=True)
    print(X.columns)
    X = sm.add_constant(X)
    info = dfm[['Player', 'Date', 'Time']]
    y = dfm['FD']
    model = sm.OLS(y, X).fit()

    today = daily_projections[date_string]
    X = pd.get_dummies(today[['Salary', 'RG', 'NF', 'RW', 'POS', 'Depth']])
    X = pd.concat([X.drop('Depth', axis=1), pd.get_dummies(X['Depth'])], axis=1)
    print(X.columns)
    if 'POS_' in X.columns:
        X.drop('POS_', axis=1, inplace=True)
    X = sm.add_constant(X)
    g_model = model.predict(X)
    return g_model
def weak_instruments(self, n_sims=20):
    np.random.seed(1692)
    model = feedforward.FeedForwardModel(19, 1, dense_size=60, n_dense_layers=2)
    treatment_effects = []
    ols_betas, ols_ses = [], []
    old_corrs, new_corrs = [], []
    for _ in range(n_sims):
        df = self.treatment_gen.simulate_data(False)
        X = np.hstack((self.x, df['new_treat'].values[:, None]))
        Z = np.hstack((self.x, df['instrument'].values[:, None]))
        ols_beta, ols_se = self.fit_ols(df['treatment_effect'], X)
        ols_betas.append(ols_beta)
        ols_ses.append(ols_se)
        old_corr = df[['instrument', 'new_treat']].corr().values[0, 1]
        new_instrument, new_corr = model.fit_instruments(
            X, Z, df['treatment_effect'].values, batchsize=128)
        new_corrs.append(new_corr)
        old_corrs.append(old_corr)
        Z2 = Z.copy()
        Z2[:, -1] = new_instrument[:, 0]
        # NOTE: the IV2SLS model is constructed but never fit here, so
        # treatment_effects stays empty below.
        iv = IV2SLS(df['treatment_effect'].values.flatten(),
                    add_constant(X), add_constant(Z2))
        model.reset_params()
        if new_corr:
            logger.info("Old corr: %.2f, New corr: %.2f",
                        np.mean(old_corrs), np.mean(new_corrs))
        logger.info("Treatment effect (OLS): %.3f (%.4f)",
                    np.mean(ols_betas), np.mean(ols_ses))
        logger.info("Treatment effect: %.3f (%.4f)",
                    np.mean(treatment_effects), np.std(treatment_effects))
def scatter(filename, x, y, line=True, xr=None, yr=None, x_title='', y_title='', title=None): if title is None: title = filename plt.figure(figsize=(24,18), dpi=600) plt.scatter(x, y) if xr is not None: plt.xlim(xr) if yr is not None: plt.ylim(yr) if line: est = sm.OLS(y, sm.add_constant(x)).fit() x_prime = np.linspace(min(x), max(x), 100)[:, np.newaxis] x_prime = sm.add_constant(x_prime) y_hat = est.predict(x_prime) line_plot1 = plt.plot(x_prime[:, 1], y_hat, 'r', alpha=0.9, label='r^2 = %s' % est.rsquared) #res = linregress(x,y) #line_plot2 = plt.plot([min(x), max(x)], [res[0]*min(x)+res[1], res[0]*max(x)+res[1]], # 'g', alpha=0.9, label='r^2 = %s' % res[2]) plt.legend(['r^2 = %s' % est.rsquared]) plt.xlabel(x_title) plt.ylabel(y_title) plt.title(title) plt.savefig('%s.png' % filename, format='png') plt.savefig('%s.eps' % filename, format='eps') plt.close()
def reg_m(y, x):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results
def predict(self, test_X): dataset = self.__dataset intercept = self.__intercept XX_inv = self.__XX_inv beta = self.__beta train_X = sm.add_constant(dataset[:, :-1]) if intercept else dataset[:, :-1] test_X = sm.add_constant(vec(test_X)) if intercept else vec(test_X) train_Y = dataset[:, -1:] train_pred = np.dot(train_X, beta) # Confidence interval sig = (np.linalg.norm(train_Y-train_pred)**2/(train_X.shape[0]-train_X.shape[1]+1))**0.5 s = [] for row in range(test_X.shape[0]): x = test_X[[row], :] s.append(sig*(1 + np.dot(np.dot(x, XX_inv), x.T))**0.5) s = np.reshape(np.asarray(s), (test_X.shape[0], 1)) test_pred = np.dot(test_X, beta) hi_ci = test_pred + 2*s lo_ci = test_pred - 2*s return test_pred, hi_ci, lo_ci
def get_r_stat(sim_data):
    try:
        x = sm.add_constant(sim_data.rt_sampled.sort_values().reset_index().rt_sampled)
        y = sim_data.rt.sort_values().reset_index().rt
        fitted = sm.OLS(y, x).fit()
        log_x = sm.add_constant(sim_data.log_rt_sampled.sort_values().reset_index().log_rt_sampled)
        log_y = sim_data.log_rt.sort_values().reset_index().log_rt
        log_fitted = sm.OLS(log_y, log_x).fit()
        sub_out = pd.DataFrame([{'int_val': fitted.params.const,
                                 'int_pval': fitted.pvalues.const,
                                 'slope_val': fitted.params.rt_sampled,
                                 'slope_pval': fitted.pvalues.rt_sampled,
                                 'rsq': fitted.rsquared,
                                 'rsq_adj': fitted.rsquared_adj,
                                 'log_int_val': log_fitted.params.const,
                                 'log_int_pval': log_fitted.pvalues.const,
                                 'log_slope_val': log_fitted.params.log_rt_sampled,
                                 'log_slope_pval': log_fitted.pvalues.log_rt_sampled,
                                 'log_rsq': log_fitted.rsquared,
                                 'log_rsq_adj': log_fitted.rsquared_adj}], index=[0])
    except Exception:
        sub_out = pd.DataFrame([{'int_val': np.nan, 'int_pval': np.nan,
                                 'slope_val': np.nan, 'slope_pval': np.nan,
                                 'rsq': np.nan, 'rsq_adj': np.nan,
                                 'log_int_val': np.nan, 'log_int_pval': np.nan,
                                 'log_slope_val': np.nan, 'log_slope_pval': np.nan,
                                 'log_rsq': np.nan, 'log_rsq_adj': np.nan}], index=[0])
    return sub_out
def GetCoef(start_train, end_train, StockReturns, CarhartDaily, SP500Returns, DataFolder):
    coef_file = r'%s\Coef_%s_%s.csv' % (DataFolder, start_train.date(), end_train.date())
    if os.path.isfile(coef_file):
        Coef = pd.read_csv(coef_file)
        return Coef
    else:
        Coef = pd.DataFrame()
        for ticker in StockReturns.ticker.unique():
            print("Getting regression coefficient for %s" % ticker)
            tmpReturn = StockReturns[(StockReturns.ticker == ticker)]
            if not tmpReturn.empty:
                tmpData = tmpReturn.merge(CarhartDaily, left_on='endDate', right_on='date')
                tmpData = tmpData.merge(SP500Returns, on='endDate')
                tmpData['SP500-RF'] = tmpData['SP500Return'] * 100 - tmpData['RF']
                y = tmpData['return'] * 100 - tmpData['RF']
                X1 = sm.add_constant(tmpData[['Mkt-RF', 'SMB', 'HML', 'UMD']])
                X2 = sm.add_constant(tmpData[['Mkt-RF']])
                X3 = sm.add_constant(tmpData[['SP500-RF']])
                model1 = sm.OLS(y, X1).fit()
                model2 = sm.OLS(y, X2).fit()
                model3 = sm.OLS(y, X3).fit()
                tmpDF1 = pd.DataFrame(model1.params).T
                tmpDF1.rename(columns={'const': 'alphaFF'}, inplace=True)
                tmpDF2 = pd.DataFrame(model2.params).T
                tmpDF2.rename(columns={'const': 'alphaCAPM', 'Mkt-RF': 'Mkt-RF_only'}, inplace=True)
                tmpDF3 = pd.DataFrame(model3.params).T
                tmpDF3.rename(columns={'const': 'alphaSP500'}, inplace=True)
                tmpDF = pd.concat((tmpDF1, tmpDF2, tmpDF3), axis=1)
                tmpDF['ticker'] = ticker
                Coef = pd.concat([Coef, tmpDF], ignore_index=True)  # DataFrame.append is deprecated
        Coef.to_csv(coef_file, index=False)
        print('Finished saving regression coefficients to: %s' % coef_file)
        return Coef
def __init__(self):
    self.researched = util.source.read('F-F_Research_Data_5_Factors_2x3')
    self.portfolios = util.source.read('25_Portfolios_5x5')
    self.simpleFactor = sm.add_constant(self.researched.Mkt_RF)
    self.threeFactor = sm.add_constant(self.researched[['Mkt_RF', 'SMB', 'HML']])
    self.fourFactor = sm.add_constant(self.researched[['Mkt_RF', 'SMB', 'RMW', 'CMA']])
    self.fiveFactor = sm.add_constant(self.researched[['Mkt_RF', 'SMB', 'HML', 'RMW', 'CMA']])
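# Hypothetical usage sketch for the design matrices above (the helper name, the
# portfolio layout, and the 'const' column naming are assumptions, not part of the
# original code): regress a portfolio's excess return on one of the factor matrices
# and read the intercept off as the alpha.
import statsmodels.api as sm

def factor_alpha(portfolio_excess_returns, factor_design):
    """factor_design is one of the sm.add_constant(...) matrices built in __init__."""
    res = sm.OLS(portfolio_excess_returns, factor_design, missing='drop').fit()
    alpha = res.params['const']       # pricing error left unexplained by the factors
    betas = res.params.drop('const')  # factor loadings
    return alpha, betas, res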
def test_mvl_fuse_function(self):
    Y, D, P, T, G = generate_raw_samples()
    T = sm.add_constant(T, prepend=False)
    P = sm.add_constant(P, prepend=False)
    D = sm.add_constant(D, prepend=False)
    G = sm.add_constant(G, prepend=False)
    loo = LeaveOneOut(len(Y))
    er = []
    for train_idx, test_idx in loo:
        tm = taxi_view_model(train_idx, Y, T)
        pm = poi_view_model(train_idx, Y, P)
        gm = geo_view_model(train_idx, Y, G)
        dm = demo_view_model(train_idx, Y, D)
        models = [tm, pm, gm, dm]
        lm = mvl_fuse_function(models, train_idx, Y)

        tm_test = tm[0].predict(T[test_idx])
        pm_test = pm[0].predict(P[test_idx])
        gm_test = gm[0].predict(G[test_idx])
        dm_test = dm[0].predict(D[test_idx])
        newX_test = np.array([1, tm_test, pm_test, gm_test, dm_test])
        ybar = lm.predict(newX_test)
        y_error = ybar - Y[test_idx]
        # if np.abs(y_error / Y[test_idx]) > 0.8:
        #     print(test_idx, ybar, Y[test_idx], newX_test)
        er.append(y_error)
    mre = np.mean(np.abs(er)) / np.mean(Y)
    print("MVL with linear fusion function MRE: {0}".format(mre))
    self.visualize_prediction_error(er, Y, "MVL linear combination")
def models_pattern(data, matrix):
    y = np.array(data['survived'])
    ones = np.ones(len(matrix[0]))
    X = sm.add_constant(np.column_stack((matrix[0], ones)))
    for ele in matrix[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))

    logit_model = sm.Logit(y, X)
    logit_res = logit_model.fit(maxiter=2000)
    print(logit_res.summary())
    print(logit_res.wald_test('1*x1 + 1*x2 + 1*x3'))
    print()

    probit_model = sm.Probit(y, X)
    probit_res = probit_model.fit(maxiter=2000)
    print(probit_res.summary())
    print(probit_res.wald_test('1*x1 + 1*x2 + 1*x3'))
    print()

    linear_model = sm.OLS(y, X)
    linear_res = linear_model.fit(maxiter=2000)
    result = 0.
    for array in X:
        for i, item in enumerate(array):
            result += linear_res.params[i] * item
    result /= len(X)
    print('Linear function value: {}'.format(result))
    print(linear_res.summary())
    print(linear_res.wald_test('1*x1 + 1*x2 + 1*x3'))
    print()
def linear_model_plot(x_variable, y_variable):
    '''Develops a linear model for the x and y variable inputs and plots the
    regression line on top of a scatter plot'''
    assert len(x_variable) > 1, 'length of x_variable should be larger than 1'
    assert len(y_variable) > 1, 'length of y_variable should be larger than 1'

    # assigning function variables to response and predictor variables
    y = y_variable  # response variable
    X = x_variable  # predictor variable
    X = sm.add_constant(X)  # adds a constant term to the predictor (essential to obtain the constant in the formula)

    # Calculating the linear model for the two variables
    lm = sm.OLS(y, X).fit()

    # Developing the plot of the linear model
    # making a range of the x variable to pass to the y prediction
    x_pred = np.linspace(x_variable.min(), x_variable.max())
    # Adding a constant to this range of x values (essential to obtain the constant in the formula)
    x_pred2 = sm.add_constant(x_pred)
    # Passing the linear model the range of x values to predict over
    y_pred = lm.predict(x_pred2)
    # Plotting these predictions on the graph
    plt.plot(x_pred, y_pred, color='k', linewidth=2)
    # Obtaining linear regression
    return plt.plot()
def inbound_forcast(target, exchange, geo, exchange_test, geo_test, submit, i):
    for col in col_list:
        # specify the column name holding the number of overnight guests
        target_col = col + suff
        target.index = range(0, 365)
        X = sm.add_constant(exchange, prepend=False)
        X_test = sm.add_constant(exchange_test, prepend=False)
        X.index = range(0, 365)
        for g in range(0, len(target)):
            if target[target_col][g] == 0:
                target[target_col][g] = 1
        y = target[target_col].apply(np.log)
        model = sm.OLS(y, X)
        results = model.fit()
        print(results.summary())
        pred = results.predict()
        Y = y - pred
        L = len(Y)
        fftY = fft.fft(Y)
        freqs = fft.fftfreq(L)
        power = np.abs(fftY)
        phase = [np.arctan2(float(c.imag), float(c.real)) for c in fftY]
        wave = newwave_i(L, results, pred, power, freqs, phase, X_test)
        submit[i] = wave
        i += 1
    return submit, i
def calcSScoreAgainstExisting(beta_mfac, beta_aux, df, n_aux):
    y = df.iloc[:, 0]
    X = df.iloc[:, 1:]
    resid_mfac = y - sm.add_constant(X).dot(beta_mfac)
    aux = calcAuxilaryArray(resid_mfac, n_aux)
    resid_aux = aux[1:] - sm.add_constant(aux[:-1]).dot(beta_aux)
    return calcSScore(beta_aux, resid_aux, aux[-1])
def get_recommend(): # repeat process above to get data for recommendations data = flask.request.json headline = data["headline"] content = data["content"].encode('utf8') tags = data["tags"] day_published = data["day_pub"] channel = data["channel"] num_imgs = int(data["num_imgs"]) index = [0] columns = [u'LDA_0_prob', u'LDA_1_prob', u'LDA_2_prob', u'LDA_3_prob', u'LDA_4_prob', u'LDA_5_prob', u'LDA_6_prob', u'LDA_7_prob', u'LDA_8_prob', u'LDA_9_prob', u'average_token_length_content', u'average_token_length_title', u'avg_negative_polarity', u'avg_positive_polarity', u'data_channel_is_bus', u'data_channel_is_entertainment', u'data_channel_is_lifestyle', u'data_channel_is_socmed', u'data_channel_is_tech', u'data_channel_is_world', u'global_grade_level', u'global_rate_negative_words', u'global_rate_positive_words', u'global_reading_ease', u'global_sentiment_abs_polarity', u'global_sentiment_polarity', u'global_subjectivity', u'is_weekend', u'max_abs_polarity', u'max_negative_polarity', u'max_positive_polarity', u'min_negative_polarity', u'min_positive_polarity', u'n_tokens_content', u'n_tokens_title', u'num_imgs', u'num_tags', u'num_videos', u'r_non_stop_unique_tokens', u'r_non_stop_words', u'r_unique_tokens', u'rate_negative_words', u'rate_positive_words', u'title_sentiment_abs_polarity', u'title_sentiment_polarity', u'title_subjectivity', u'weekday_is_friday', u'weekday_is_monday', u'weekday_is_saturday', u'weekday_is_sunday', u'weekday_is_thursday', u'weekday_is_tuesday', u'weekday_is_wednesday'] data_df = pd.DataFrame(index=index, columns=columns) create_metadata_fields(data_df, num_imgs, tags, day_published, channel) create_NLP_features(data_df, headline, content) create_lda_features(data_df, content) results = {} create_metadata_fields(results, num_imgs, tags, day_published, channel) create_NLP_features(results, headline, content) create_lda_features(results, content) results['est_shares'] = round(pois_reg.predict(sm.add_constant(data_df))[0],-2) results['est_prob'] = round(RF_class.predict_proba(sm.add_constant(data_df))[0][1],2) # change day of week data to sunday data_df['weekday_is_monday'] = 0 data_df['weekday_is_tuesday'] = 0 data_df['weekday_is_wednesday'] = 0 data_df['weekday_is_thursday'] = 0 data_df['weekday_is_friday'] = 0 data_df['weekday_is_saturday'] = 0 data_df['weekday_is_sunday'] = 1 data_df['is_weekend'] = 1 # get results if change to sunday results['est_shares_sun'] = round(pois_reg.predict(sm.add_constant(data_df))[0],-2) results['est_prob_sun'] = round(RF_class.predict_proba(sm.add_constant(data_df))[0][1],2) # return results to javascript return flask.jsonify(results)
def main():
    data = sm.datasets.star98.load()
    print(sm.datasets.star98.NOTE)

    # Load the data and add a constant to the exogenous (independent) variables
    data.exog = sm.add_constant(data.exog, prepend=False)
    # The dependent variable is N by 2 (Success: NABOVE, Failure: NBELOW):
    print(data.endog[:5, :], '\n')
    print(data.exog[:2, :], '\n')

    # Fit and summary
    glm_binom = sm.GLM(data.endog, data.exog, family=sm.families.Binomial())
    res = glm_binom.fit()
    print(res.summary(), '\n')

    # We extract information that will be used to draw some interesting plots
    nobs = res.nobs
    y = data.endog[:, 0] / data.endog.sum(1)
    yhat = res.mu

    # Plot yhat vs y
    plt.figure()
    plt.scatter(yhat, y)
    line_fit = sm.OLS(y, sm.add_constant(yhat, prepend=False)).fit().params
    fit = lambda x: line_fit[1] + line_fit[0] * x  # better way in scipy?
    plt.plot(np.linspace(0, 1, nobs), fit(np.linspace(0, 1, nobs)))
    plt.title('Model Fit Plot')
    plt.ylabel('Observed Values')
    plt.xlabel('Fitted Values')

    # Plot yhat vs. Pearson residuals
    plt.figure()
    plt.scatter(yhat, res.resid_pearson)
    plt.plot([0.0, 1.0], [0.0, 0.0], 'k-')
    plt.title('Residual Dependence Plot')
    plt.ylabel('Pearson Residuals')
    plt.xlabel('Fitted values')

    # Histogram of standardized deviance residuals
    plt.figure()
    resid = res.resid_deviance.copy()
    resid_std = (resid - resid.mean()) / resid.std()
    plt.hist(resid_std, bins=25)
    plt.title('Histogram of standardized deviance residuals')

    # QQ Plot of Deviance Residuals
    from statsmodels import graphics
    graphics.gofplots.qqplot(resid, line='r')

    test = 1
def regrFromDataframe(df, y_attr, X_attrs, date, n_aux, n_mfac):
    df = getLookbackPeriod(df, date, n_mfac)
    model_mfac = sm.OLS(df[y_attr], sm.add_constant(df[X_attrs])).fit()
    beta_mfac = model_mfac.params
    resid_mfac = model_mfac.resid
    aux = calcAuxilaryArray(resid_mfac, n_aux)
    aux_res = sm.OLS(aux[1:], sm.add_constant(aux[:-1])).fit()
    return calcSScore(aux_res.params, aux_res.resid, aux[-1]), beta_mfac, aux_res.params
def main(): # Process input data #json_data=open('C:/Users/rthomas/Documents/DemandPrediction/demand_prediction.json') json_data = open(sys.argv[1]) x, y, last, total_hours = process_input(json_data) FUTURE_DAYS = 15 # will make prediciton 15 days into future # I looked at a few different regression families in sm.GLM but found very similar rms errors so I chose to use a simple linear regression trend_model = sm.OLS(y, sm.add_constant(range(len(x)), prepend=True)).fit() trend = trend_model.fittedvalues # y1 is y with the trend line (growth over time) removed y1 = y - trend # y2 is y1 with weekly trends removed days = [w.weekday() for w in x] days_mean = [np.mean([y1[i] for i in range(0,len(y1)) if days[i] == k]) for k in range(7)] y2 = [y1[i] - days_mean[days[i]] for i in range(len(y1))] # y3 is y2 with daily trends removed hours = [w.hour for w in x] hours_mean = [np.mean([y2[i] for i in range(0,len(y2)) if hours[i] == k]) for k in range(24)] y3 = [y2[i] - hours_mean[hours[i]] for i in range(len(y2))] trend_y = [days_mean[days[i]] + hours_mean[hours[i]] + trend[i] for i in range(len(trend))] # The predict function is not working the way I expected for future times # construct an ARMA model on the residuals once all trends have been removed #arma_model = sm.tsa.ARMA(y3) #arma_fit = arma_model.fit(order=(6,6)) #arma_res = arma_fit.predict() #arma_y = [arma_res[i] + trend_y[i] for i in range(len(trend))] future_hours = FUTURE_DAYS*24 + total_hours future_trend = trend_model.predict(sm.add_constant(range(total_hours, future_hours), prepend=True)) future_x = [last + datetime.timedelta(hours=k) for k in range(1,FUTURE_DAYS*24+1)] future_hours = [w.hour for w in future_x] future_days = [w.weekday() for w in future_x] future_hours_trend = [hours_mean[future_hours[i]] for i in range(len(future_x))] future_days_trend = [days_mean[future_days[i]] for i in range(len(future_x))] future_all_trends = [sum(tuple) for tuple in zip(future_trend, future_hours_trend, future_days_trend)] #plot_series(x, [y, trend_y, arma_y], ['Time Series', 'All trends', 'All trends + ARMA']) #rms_arr = [] #for predict in [trend_y, arma_y]: # rms_arr.append(rms(y, predict)) app = flask.Flask(__name__) app.run()
def classify(self, mp, x_train, y_train, x_test):
    x_train_reg = sm.add_constant(x_train)
    x_test_reg = sm.add_constant(x_test)
    logit = sm.Logit(y_train, x_train_reg)
    clf = logit.fit(disp=0)
    # print(clf.summary())
    log_to_info('Fitting a Logistic Regression to labeled training data...')
    log_to_info('Predicting test value')
    y_test = clf.predict(x_test_reg)
    log_to_info('Done!')
    return numpy.rint(y_test)
def estender_serie_anos(serie, df, grau_polinomio=2):
    mask = np.isfinite(df.index) & np.isfinite(df[serie])
    polinomio = np.array([np.array(np.power(df.index[mask], i))
                          for i in range(1, grau_polinomio + 1)])
    polinomio = sm.add_constant(polinomio.T)
    reg = sm.OLS(df[serie][mask], polinomio).fit()
    polifit = np.array([np.array(np.power(df.index, i))
                        for i in range(1, grau_polinomio + 1)])
    polifit = sm.add_constant(polifit.T)
    return reg.predict(polifit)
def __init__(self,args): self.bed = Bed(args.bfile) # self.N = self.bed.iid_count if args.covfile is not None: cov = pd.read_table(args.covfile,header=None) self.cov = sm.add_constant(ju._reorder(cov,self.bed.iid)) self.ncov = self.cov.shape[1] # + constant else: self.cov = np.ones((self.N,1)) self.ncov = 1 # Constant if args.phenofile is not None: Y = pd.read_table(args.phenofile,header=None,na_values='-9') else: try: Y = pd.read_table(args.bfile+'.pheno',header=None,na_values='-9') except IOError: print("Phenotype file not found.") exit(1) self.Y = ju._reorder(Y,self.bed.iid) af = ju.get_allele_frequency(self.bed,args) # snps = (af>args.maf)&(af<1-args.maf) # if (args.from_bp is not None) and (args.to_bp is not None): k = (bed.pos[:,2]>args.from_bp)&(bed.pos[:,2]<args.to_bp) snp1 = snps&k snps_to_use = self.bed.sid[snps] if args.extract is not None: keep = np.array([l.strip() for l in open(args.extract,'r')]) snps_to_use = np.intersect1d(snps_to_use,keep) self.bed_index = np.sort(self.bed.sid_to_index(snps_to_use)) # pos = self.bed.pos[self.bed_index] # bim=pd.read_table(self.bed.filename+'.bim',header=None, names=['chm','id','pos_mb','pos_bp','a1','a2']) self.af = af[self.bed_index] # self.M = len(self.bed_index) # self.windows = ju.get_windows(pos,self.M,args.window_size,args.window_type) self.pos = pos[:,2] self.chr = pos[:,0] self.id = self.bed.sid[self.bed_index] self.A1 = bim['a1'].loc[self.bed_index] self.A2 = bim['a2'].loc[self.bed_index] self.logistic = False self.chimin = stats.chi2.ppf(1-args.minp,2) # Fit null if (not args.linear) and (self.Y.min() >= 0 and self.Y.max() <= 1): self.null = sm.Logit(self.Y, self.cov, missing='drop').fit(disp=0) self.logistic = True else: self.null = sm.OLS(self.Y, self.cov, missing='drop').fit(disp=0) if self.ncov > 1: self.cov = sm.add_constant(self.null.fittedvalues) self.marg_res, self.joint_res = self.compute(args)
def scatterColor(x0, y, w): """Creates scatter plot with points colored by variable. All input arrays must have matching lengths Arg: x0 (array): array of x values y (array): array of y values w (array): array of scalar values Returns: slope and intercept of best fit line """ import matplotlib as mpl import matplotlib.cm as cm import statsmodels.api as sm from scipy.stats import linregress cmap = plt.cm.get_cmap('RdYlBu') norm = mpl.colors.Normalize(vmin=w.min(), vmax=w.max()) m = cm.ScalarMappable(norm=norm, cmap=cmap) m.set_array(w) sc = plt.scatter(x0, y, label='', color=m.to_rgba(w)) xa = sm.add_constant(x0) est = sm.RLM(y, xa).fit() r2 = sm.WLS(y, xa, weights=est.weights).fit().rsquared slope = est.params[1] x_prime = np.linspace(np.min(x0), np.max(x0), 100)[:, np.newaxis] x_prime = sm.add_constant(x_prime) y_hat = est.predict(x_prime) const = est.params[0] y2 = [i * slope + const for i in x0] lin = linregress(x0, y) x1 = np.arange(np.min(x0), np.max(x0), 0.1) y1 = [i * lin[0] + lin[1] for i in x1] y2 = [i * slope + const for i in x1] plt.plot(x1, y1, c='g', label='simple linear regression m = {:.2f} b = {:.0f}, r^2 = {:.2f}'.format(lin[0], lin[1], lin[2] ** 2)) plt.plot(x1, y2, c='r', label='rlm regression m = {:.2f} b = {:.0f}, r2 = {:.2f}'.format(slope, const, r2)) plt.legend() cbar = plt.colorbar(m) cbar.set_label('use cbar.set_label("label") to label this axis') return slope, const
def reg_m(y, x1, x2, x3):
    y = y.reshape(-1)
    x = np.array([x1.reshape(-1), x2.reshape(-1), x3.reshape(-1)])
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    # the columns of X end up as [x3, x2, x1, const], so match params accordingly
    yy = results.params[0]*x3 + results.params[1]*x2 + results.params[2]*x1 + results.params[3]
    yy = yy / yy.max()
    print(results.params)
    print(results.summary())
    return results, yy
def classify(self, mp, x_train, y_train, x_test):
    x_train = sm.add_constant(x_train)
    x_test = sm.add_constant(x_test)
    clf = LogisticRegressionCV(verbose=1, cv=5)
    log_to_info('Fitting a Logistic Regression to labeled training data...')
    clf = clf.fit(x_train, y_train)
    log_to_info('Training details')
    log_to_info('Classifier parameters: {}'.format(clf.get_params()))
    log_to_info('On training: {}'.format(clf.score(x_train, y_train) * 100.0))
    log_to_info('Predicting test value')
    y_test = clf.predict(x_test)
    log_to_info('Done!')
    return y_test
def setup_class(cls):
    data = sm.datasets.randhie.load(as_pandas=False)
    cls.endog = data.endog
    exog = sm.add_constant(data.exog[:, 1:4], prepend=False)
    exog_infl = sm.add_constant(data.exog[:, 0], prepend=False)
    cls.res1 = sm.ZeroInflatedGeneralizedPoisson(
        data.endog, exog, exog_infl=exog_infl, p=1
    ).fit(method='newton', maxiter=500, disp=0)
    # for llnull test
    cls.res1._results._attach_nullmodel = True
    cls.init_keys = ['exog_infl', 'exposure', 'inflation', 'offset', 'p']
    cls.init_kwds = {'inflation': 'logit', 'p': 1}
    res2 = RandHIE.zero_inflated_generalized_poisson
    cls.res2 = res2
trt1 = data['weight'][data.group == 'trt1'] trt2 = data['weight'][data.group == 'trt2'] from scipy import stats F, p = stats.f_oneway(ctrl, trt1, trt2) F p import statsmodels.api as sm from sklearn.preprocessing import LabelEncoder le = LabelEncoder() data.group = le.fit_transform(data.group) x = data.group y = data.weight x = sm.add_constant(x) model = sm.OLS(y, x).fit() model.summary() sm.stats.anova_lm(model, typ=2) #cravens example df = pd.read_excel('cravens.xlsx') from scipy.stats import skew #sales sales = df.Sales #mean sales.mean() #median sales.median() #mode sales.mode()[0]
import numpy as np import statsmodels.api as sm import matplotlib.pyplot as plt # Prepare Data to Plot x = np.arange(1, 10, 0.1) err = np.random.randn(len(x)) y = x**2 + err # X = (1 | x) X = np.matrix(x).T # To Column matrix X = sm.add_constant(X) # Add constant Y = np.matrix(y).T # To Column matrix def find_beta_hat(X, y): # X should be np.matrix return np.linalg.pinv(X) * y def find_y_hat(X, y): beta_hat = find_beta_hat(X, y) return X * beta_hat Y_hat = find_y_hat(X, Y) with plt.xkcd(): # Prepare Plot plt.figure(figsize=(10,6), dpi=300) plt.title(r"OLS...?", fontsize=16) plt.xlabel(r'x', fontsize=14) plt.ylabel(r'y', fontsize=14) # Plot with Legends
data = data[data.proccessor_turbo != "Not found"]
data["proccessor"] = to_numeric(data["proccessor"])
data["proccessor_turbo"] = to_numeric(data["proccessor_turbo"])
# print(data.info())

x = data[["size", "proccessor", "proccessor_turbo", "ram", "hdd"]]
y = data["price"]

regr = linear_model.LinearRegression()
regr.fit(x, y)
print("Intercept: ", regr.intercept_)
print("Coeff: ", regr.coef_)
print("Score: ", regr.score(x, y))

new_size = 15.6
new_proccessor = 1.6
new_proccessor_turbo = 3.9
new_ram = 12
new_hdd = 1250
predicted = regr.predict(
    [[new_size, new_proccessor, new_proccessor_turbo, new_ram, new_hdd]])
print("Predicted: ", predicted)

x = add_constant(x)
model = OLS(y, x).fit()
predicted = model.predict(
    [[1, new_size, new_proccessor, new_proccessor_turbo, new_ram, new_hdd]])
print(model.summary())
from sklearn.metrics import roc_curve, auc
from pylab import mpl

if __name__ == '__main__':
    mpl.rcParams['font.sans-serif'] = ['SimHei']  # set the default font (supports Chinese characters)
    mpl.rcParams['axes.unicode_minus'] = False    # keep the minus sign from rendering as a box in saved figures
    data = pd.read_csv('WoeData.csv')
    Y = data['SeriousDlqin2yrs']
    X = data.drop([
        'SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome',
        'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines',
        'NumberOfDependents'
    ], axis=1)
    X1 = sm.add_constant(X)
    logit = sm.Logit(Y, X1)
    result = logit.fit()
    print(result.params)
    print(result.summary())

    test = pd.read_csv('TestWoeData.csv')
    Y_test = test['SeriousDlqin2yrs']
    X_test = test.drop([
        'SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome',
        'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines',
        'NumberOfDependents'
    ], axis=1)
    X3 = sm.add_constant(X_test)
ax.set_ylabel('Average procurement cost savings [USD/MWh]') ppt.savefig(run + '/41_WSpricestd_savings.png', bbox_inches='tight') ppt.savefig(run + '/41_WSpricestd_savings.pdf', bbox_inches='tight') print('Savings as a function of price std') print(str(reg2.intercept_[0]) + ' + ' + str(reg2.coef_[0][0]) + '* std') print('R2: ' + str( reg2.score( np.sqrt(df_results['var_p'].to_numpy()).reshape(len(df_results), 1), (df_results['difference']).to_numpy().reshape(len(df_results), 1)))) # Y = (df_results['difference']).to_numpy().reshape(len(df_results), 1) X = np.sqrt(df_results['var_p']) X = sm.add_constant(X) model = sm.OLS(Y, X) results = model.fit() print(results.summary()) df_results['std_p'] = np.sqrt(df_results['var_p']) df_results_short = df_results.loc[df_results['std_p'] < 90.] Y2 = (df_results_short['difference']).to_numpy().reshape( len(df_results_short), 1) X2 = df_results_short['std_p'].to_numpy().reshape(len(df_results_short), 1) X2 = sm.add_constant(X2) model2 = sm.OLS(Y2, X2) results2 = model2.fit() print(results2.summary()) import pdb
Build A Predictive Model And Conclude If Both Predictors (Independent Variables)
Are Significant For A Student's Height Or Not.

When Father's Height Is Held Constant, The Average Student Height Increases By
How Many Inches For Each One-Inch Increase In Mother's Height?

When Mother's Height Is Held Constant, The Average Student Height Increases By
How Many Inches For Each One-Inch Increase In Father's Height?
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

fs_df = pd.read_csv('Female_Stats.csv')

features = fs_df.iloc[:, 1:].values
labels = fs_df.iloc[:, 0].values

# This is done because the statsmodels library requires an explicit constant column.
# features = np.append(arr=np.ones((30, 1)), values=features, axis=1)
features = sm.add_constant(features)  # adds a constant column to the input data set

features_opt = features[:, [0, 1, 2]]
regressor_OLS = sm.OLS(endog=labels, exog=features_opt).fit()
regressor_OLS.summary()

"""So we conclude that both predictors (independent variables) are significant
for a student's height."""

regressor_OLS.pvalues

print("Effect of mom's height", regressor_OLS.params[1])
print("Effect of dad's height", regressor_OLS.params[2])
def reconstruct(): """ run KFOLD method for regression """ #import packages import os import pandas as pd import statsmodels.api as sm from datetime import datetime from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler #defining directories dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged" dir_out = "/lustre/fs0/home/mtadesse/mlrReconstruction" surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef" #cd to the lagged predictors directory os.chdir(dir_in) x = 79 y = 80 #looping through for tg in range(x, y): os.chdir(dir_in) tg_name = os.listdir()[tg] print(tg, tg_name) #load predictor pred = pd.read_csv(tg_name) pred.drop('Unnamed: 0', axis=1, inplace=True) #add squared and cubed wind terms (as in WPI model) pickTerms = lambda x: x.startswith('wnd') wndTerms = pred.columns[list(map(pickTerms, pred.columns))] wnd_sqr = pred[wndTerms]**2 wnd_cbd = pred[wndTerms]**3 pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1) #standardize predictor data dat = pred.iloc[:, 1:] scaler = StandardScaler() print(scaler.fit(dat)) dat_standardized = pd.DataFrame(scaler.transform(dat), \ columns = dat.columns) pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1) #load surge data os.chdir(surge_path) surge = pd.read_csv(tg_name) surge.drop('Unnamed: 0', axis=1, inplace=True) #remove duplicated surge rows surge.drop(surge[surge['ymd'].duplicated()].index, axis=0, inplace=True) surge.reset_index(inplace=True) surge.drop('index', axis=1, inplace=True) #adjust surge time format to match that of pred time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d')) surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns=['date']) time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis=1) #merge predictors and surge to find common time frame pred_surge = pd.merge(pred_standardized, surge_new.iloc[:, :2], on='date', how='right') pred_surge.sort_values(by='date', inplace=True) #find rows that have nans and remove them row_nan = pred_surge[pred_surge.isna().any(axis=1)] pred_surge.drop(row_nan.index, axis=0, inplace=True) pred_surge.reset_index(inplace=True) pred_surge.drop('index', axis=1, inplace=True) #in case pred and surge don't overlap if pred_surge.shape[0] == 0: print('-' * 80) print('Predictors and Surge don' 't overlap') print('-' * 80) continue pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \ pred_surge['date'])), \ columns = ['date']) #prepare data for training/testing X = pred_surge.iloc[:, 1:-1] y = pd.DataFrame(pred_surge['surge']) y = y.reset_index() y.drop(['index'], axis=1, inplace=True) #apply PCA pca = PCA(.95) pca.fit(X) X_pca = pca.transform(X) { # #apply 10 fold cross validation # kf = KFold(n_splits=10, random_state=29) # metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs']) # for train_index, test_index in kf.split(X): # X_train, X_test = X_pca[train_index], X_pca[test_index] # y_train, y_test = y['surge'][train_index], y['surge'][test_index] # #train regression model # lm = LinearRegression() # lm.fit(X_train, y_train) # #predictions # predictions = lm.predict(X_test) # # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \ # # pd.DataFrame(np.array(y_test))], \ # # axis = 1) # # pred_obs.columns = ['pred', 'obs'] # # combo = pd.concat([combo, pred_obs], axis = 0) # #evaluation matrix - check p value # if stats.pearsonr(y_test, predictions)[1] >= 0.05: # print("insignificant correlation!") # continue 
# else: # #print(stats.pearsonr(y_test, predictions)) # metric_corr.append(stats.pearsonr(y_test, predictions)[0]) # #print(np.sqrt(metrics.mean_squared_error(y_test, predictions))) # metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions))) # # #number of years used to train/test model # num_years = np.ceil((pred_surge['date'][pred_surge.shape[0]-1] -\ # pred_surge['date'][0]).days/365) # longitude = surge['lon'][0] # latitude = surge['lat'][0] # num_pc = X_pca.shape[1] #number of principal components # corr = np.mean(metric_corr) # rmse = np.mean(metric_rmse) # print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',\ # np.mean(metric_corr), ' - avg_rmse (m) = ', \ # np.mean(metric_rmse), '\n') } num_pc = X_pca.shape[1] #number of principal components longitude = surge['lon'][0] latitude = surge['lat'][0] #surge reconstruction pred_for_recon = pred[~pred.isna().any(axis=1)] pred_for_recon = pred_for_recon.reset_index().drop('index', axis=1) #standardize predictor data dat = pred_for_recon.iloc[:, 1:] scaler = StandardScaler() print(scaler.fit(dat)) dat_standardized = pd.DataFrame(scaler.transform(dat), \ columns = dat.columns) pred_standardized = pd.concat( [pred_for_recon['date'], dat_standardized], axis=1) X_recon = pred_standardized.iloc[:, 1:] #apply PCA pca = PCA(num_pc) #use the same number of PCs used for training pca.fit(X_recon) X_pca_recon = pca.transform(X_recon) #model preparation #first train model using observed surge and corresponding predictors X_pca = sm.add_constant(X_pca) est = sm.OLS(y['surge'], X_pca).fit() #predict with X_recon and get 95% prediction interval X_pca_recon = sm.add_constant(X_pca_recon) predictions = est.get_prediction(X_pca_recon).summary_frame(alpha=0.05) #drop confidence interval and mean_se columns predictions.drop(['mean_se', 'mean_ci_lower','mean_ci_upper'], \ axis = 1, inplace = True) #final dataframe final_dat = pd.concat([pred_standardized['date'], predictions], axis=1) final_dat['lon'] = longitude final_dat['lat'] = latitude final_dat.columns = ['date', 'surge_reconsturcted', 'pred_int_lower',\ 'pred_int_upper', 'lon', 'lat'] { # plot - optional # time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) # final_dat['date'] = pd.DataFrame(list(map(time_stamp, final_dat['date'])), columns = ['date']) # surge['date'] = pd.DataFrame(list(map(time_stamp, surge['date'])), columns = ['date']) # sns.set_context('notebook', font_scale = 2) # plt.figure() # plt.plot(final_dat['date'], final_dat['mean'], color = 'green') # plt.scatter(surge['date'], surge['surge'], color = 'blue') # prediction intervals # plt.plot(final_dat['date'], final_dat['obs_ci_lower'], color = 'red', linestyle = "--", lw = 0.8) # plt.plot(final_dat['date'], final_dat['obs_ci_upper'], color = 'red', linestyle = "--", lw = 0.8) # confidence intervals # plt.plot(final_dat['date'], final_dat['mean_ci_upper'], color = 'black', linestyle = "--", lw = 0.8) # plt.plot(final_dat['date'], final_dat['mean_ci_lower'], color = 'black', linestyle = "--", lw = 0.8) } #save df as cs - in case of interruption os.chdir(dir_out) final_dat.to_csv(tg_name)
'night_charge'] + churn['intl_charge'] churn['intl_plan'] = np.where(churn['intl_plan'] == 'yes', 1, 0) churn['vmail_plan'] = np.where(churn['vmail_plan'] == 'yes', 1, 0) dependent_variable = churn['churn'] # independent_variables = churn[['account_length', 'custserv_calls', 'total_charges']] # independent_variables = churn[['account_length', 'area_code', 'intl_plan', 'vmail_plan' , # 'vmail_message', 'day_mins', 'day_calls', # 'day_charge', 'intl_mins', 'intl_charge', 'custserv_calls' ,'total_charges']] independent_variables = churn[[ 'account_length', 'area_code', 'intl_plan', 'vmail_plan', 'vmail_message', 'day_mins', 'day_calls', 'day_charge', 'intl_mins', 'intl_charge', 'custserv_calls' ]] independent_variables_with_constant = sm.add_constant(independent_variables, prepend=True) logit_model = sm.Logit(dependent_variable, independent_variables_with_constant).fit() new_observatios = churn.loc[churn.index.isin(range(20)), independent_variables.columns] new_observatios_with_constant = sm.add_constant(new_observatios, prepend=True) y_predicted = logit_model.predict(new_observatios_with_constant) y_predicted_rounded = [round(score, 0) for score in y_predicted] print(y_predicted_rounded) logistic_predicted_value_list = [] total_count = 0 index = 0 total_number = len(y_predicted_rounded) total_correct = 0
ax.set_xticks(0.5+np.arange(len(top_comb)), minor = False) ax.set_xticklabels(top_comb, ha='center', rotation = 20) ax.set_yticks(0.5+np.arange(len(top_comb)), minor = False) ax.set_yticklabels(top_comb[::-1], va='center', rotation = 30) #cax = fig.add_subplot(gs[6, :]) cax = plt.axes([0.1, 0.1, 0.8, 0.05]) cb = plt.colorbar(gigifig, cax=cax, orientation='horizontal', extend = 'both') cb.ax.tick_params(labelsize=18) cb.set_label('Cross correlation', fontsize=20) plt.subplots_adjust(left=0.1, bottom=0.17, right=0.98, top=0.92, wspace=0.05, hspace=0.20) fig.savefig(cart_out + 'Crosscorr_{}_{}_v2_{}driv_{}_{}.pdf'.format(reg, namti, nu, ensmod, katullo)) Xgi = sm.add_constant(X) pvals = [] params = [] rsq = [] for ko in Y.T: est = sm.OLS(ko, Xgi) est2 = est.fit() est3 = est.fit_regularized(method='elastic_net', alpha=0.0) # This is a Ridge regression. if np.any(est2.params - est3.params > 1.e-3): print('AAAAA - 3') print(est2.params) print(est3.params) elif np.any(est2.params - est3.params > 1.e-2): raise ValueError('AAAAAAAAAAAAAA - 2') pvals.append(est2.pvalues)
df = DataFrame(Data, columns=[ 'country', 'year', 'status', 'life_expectancy', 'alcohol', 'percentage_expenditure', 'measles', 'bmi', 'polio', 'total_expenditure', 'gdp', 'population', 'schooling' ]) ax = df.plot(x="schooling", y="life_expectancy", style="o") ax.set_ylabel("life_expectancy") # In[2]: import statsmodels.api as sm #load the library and assigns a nickname X = df[["schooling"]].values # the independent variable(s) X = sm.add_constant(X) # add the intercept term y = df["life_expectancy"].values # the dependent variable ols = sm.OLS(y, X).fit() # run the regression ols.summary() # In[3]: X = df[['alcohol', 'measles', 'gdp', 'schooling']].values # the independent variable(s) X = sm.add_constant(X) # add the intercept term y = df["life_expectancy"].values # the dependent variable ols = sm.OLS(y, X).fit() # run the regression ols.summary() # In[ ]:
start_date = y.index[0]
x.tail()
y.tail()
end_date = x.index[-1]

y = y[start_date:end_date]
x = x[start_date:end_date]
y.head()
x.head()
len(y)
len(x)

import statsmodels.api as sm

X = sm.add_constant(x)  # let's add an intercept (beta_0) to our model

# Note the difference in argument order
model = sm.OLS(y, X).fit()  # sm.OLS(output, input)
predictions = model.predict(X)

# Print out the statistics
model.summary()

import statsmodels.formula.api as smf

d = {"x": pd.Series(x), "y": pd.Series(y)}
df = pd.DataFrame(d)
mod = smf.ols('y ~ x', data=df).fit()
print(mod.summary())
# Our linear model with a single independent variable on the left-hand side assumes the following form: # # $$y = \beta_0 + \beta_1 X_1 + \epsilon$$ # # $\epsilon$ accounts for the deviations or errors that we will encounter when our data do not actually fit a straight line. When $\epsilon$ materializes, that is when we run the model of this type on actual data, the errors are called **residuals**. # #### Estimate a simple regression with statsmodels # The upper part of the summary displays the dataset characteristics, namely the estimation method, the number of observations and parameters, and indicates that standard error estimates do not account for heteroskedasticity. # # The middle panel shows the coefficient values that closely reflect the artificial data generating process. We can confirm that the estimates displayed in the middle of the summary result can be obtained using the OLS formula derived previously: # In[5]: X = sm.add_constant(data['X']) model = sm.OLS(data['Y'], X).fit() print(model.summary()) # #### Verify calculation # In[6]: beta = np.linalg.inv(X.T.dot(X)).dot(X.T.dot(y)) pd.Series(beta, index=X.columns) # #### Display model & residuals # In[7]: data['y-hat'] = model.predict()
import os import pandas as pd import statsmodels.api as sm os.chdir('/Users/tom/Dropbox/Economics/Econometrics/Homework/HW8') df = pd.read_csv('clean.csv') df = sm.add_constant(df, prepend=True) # a.) ln(wage) on educ, exper, expersq, black, south, smsa, reg661 # through reg668 and smsa66. endog_a = df['lwage'] exog = ['const', 'educ', 'exper', 'expersq', 'black', 'south', 'smsa', 'reg661', 'reg662', 'reg663', 'reg664', 'reg665', 'reg666', 'reg667', 'reg668', 'smsa66'] exog_a = df[exog] endog_a.head() exog_a.head() model_a = sm.OLS(endog_a, exog_a) results_a = model_a.fit() print(results_a.summary()) #b: educ on exog_a - educ + nearc4 endog_b = df['educ'] exog_b = exog_a.drop('educ', axis=1).join(df['nearc4']) model_b = sm.OLS(endog_b, exog_b)
# backward step model = sm.Logit(y, sm.add_constant(pd.DataFrame( X[included]))).fit(disp=0) # use all coefs except intercept pvalues = model.pvalues.iloc[1:] worst_pval = pvalues.max() # null if pvalues is empty if worst_pval > threshold_out: changed = True worst_feature = pvalues.argmax() included.remove(worst_feature) if verbose: print('Drop {:30} with p-value {:.6}'.format( worst_feature, worst_pval)) if not changed: break return included if __name__ == '__main__': from sklearn.datasets import load_breast_cancer data = load_breast_cancer(return_X_y=False) X = pd.DataFrame(data.data, columns=data.feature_names) y = pd.Series(data.target, name='response') selected_vars = stepwise_selection(X, y) print('resulting features:') print(selected_vars) model = sm.Logit(y, sm.add_constant(pd.DataFrame(X[selected_vars]))).fit()
regr = linear_model.LinearRegression() regr.fit(X, Y) print('Intercept: \n', regr.intercept_) print('Coefficients: \n', regr.coef_) #Stock_Index_Price = (Intercept) + (Interest_Rate coef)*X1 + (Unemployment_Rate coef)*X2 #Stock_Index_Price = (1798.4040) + (345.5401)*X1 + (-250.1466)*X2 # prediction with sklearn New_Interest_Rate = 2.75 New_Unemployment_Rate = 5.3 print ('Predicted Stock Index Price: \n', regr.predict([[New_Interest_Rate ,New_Unemployment_Rate]])) #Interest Rate = 2.75 (i.e., X1= 2.75) #Unemployment Rate = 5.3 (i.e., X2= 5.3) Stock_Index_Price = (1798.4040) + (345.5401)*(2.75) + (-250.1466)*(5.3) #1422.86 Stock_Index_Price # with statsmodels import statsmodels.api as sm #2nd method X = sm.add_constant(X) # adding a constant model = sm.OLS(Y, X).fit() predictions = model.predict(X) print_model = model.summary() print(print_model) #coefficients captured in this table match with the coefficients generated by sklearn. #we got consistent results by applying both sklearn and statsmodels. #ref #https://datatofish.com/multiple-linear-regression-python/ #end here
#partition data into training and test sets from sklearn.model_selection import train_test_split X = df_dum.drop('avg_salary', axis=1) y = df_dum.avg_salary.values X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42) """ Model 1: Mulitple Regression """ # Statsmodel version # link=https://www.statsmodels.org/devel/generated/statsmodels.regression.linear_model.OLS.html import statsmodels.api as sm X_sm = sm.add_constant(X_train) #make predictions using the training set model = sm.OLS(y_train, X_sm) model.fit().summary() # R-squared = 0.652 (e.g. model explains about 65% of the variance in salary estimate) ## SciKit version #link=https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html #link=https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html #link=https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html from sklearn.linear_model import LinearRegression, Lasso from sklearn.model_selection import cross_val_score from sklearn.metrics import mean_squared_error, r2_score lm = LinearRegression()
# Model 1
X = media[['Visitors', 'weekend']]
X
y = media['Views_show']

from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X, y)
lm.score(X, y)
lm.coef_

# different library: the constant term has to be added here
import statsmodels.api as sm  # need to fit constant
X = sm.add_constant(X)
lm_1 = sm.OLS(y, X).fit()
print(lm_1.summary())
# significant variables - weekend, Visitors. P>|t| : .05

# 2nd model: keep changing the combination of variables
X = media[['Visitors', 'weekend', 'Character_A']]
y = media['Views_show']
import statsmodels.api as sm
X = sm.add_constant(X)
lm_2 = sm.OLS(y, X).fit()
print(lm_2.summary())

# Like this, keep creating models; the model with the higher adjusted R-squared is better.
# See more here:
# https://www.kaggle.com/ashydv/media-company-case-study-linear-regression
reg = LinearRegression()
reg.fit(x, y)

# reg.coef_ and reg.intercept_ hold the values of the slope and intercept C
print("The linear regression model is Y={:.5}X+ {:.5}".format(reg.coef_[0][0], reg.intercept_[0]))

# now creating predictions
predictions = reg.predict(x)
plt.figure(figsize=(12, 6))
plt.scatter(data["User"], data["ratings"], c='red')
plt.plot(data["User"], predictions, c='red', linewidth=2)
plt.xlabel("USERS")
plt.ylabel("RATINGS OUT OF 5")
plt.show()

# Now assessing the fit using the R-squared of an OLS model
x = data['User']
y = data['ratings']
x2 = sm.add_constant(x)
est = sm.OLS(y, x2)
est2 = est.fit()
print(est2.summary())

print("Enter user no.")
p = int(input())
rate = p * reg.coef_[0][0] + reg.intercept_[0]
print("User will rate", rate)
aX = np.asarray(df_fradulent['InscClaimAmtReimbursed']) aY = np.asarray(df_fradulent['TotalClaimCost']) regr = linear_model.LinearRegression() regr.fit(aX.reshape(-1, 1), aY.reshape(-1, 1)) plt.scatter(aX, aY, marker='+') plt.plot(aX, regr.predict(aX.reshape(-1, 1))) plt.xlabel("InscClaimAmtReimbursed") plt.ylabel("TotalClaimCost") plt.show() # In[209]: aX = np.asarray(df_fradulent['InscClaimAmtReimbursed']) aY = np.asarray(df_fradulent['TotalClaimCost']) aX = sm.add_constant(aX) model = sm.OLS(aY, aX) results = model.fit() print(results.summary()) # In[210]: df = pd.DataFrame(results.resid) df.describe() # In[211]: plt.style.use('seaborn') plt.rc('font', size=14) plt.rc('figure', titlesize=18)
import numpy as np
import pandas_datareader.data as web

start = '1971/12/1'
end = '2016/8/31'
workpop = web.DataReader('LFWA64TTJPM647S', "fred", start, end).dropna()
gdp = web.DataReader('MKTGDPJPA646NWDB', "fred", start, end).dropna()
gdp = gdp.resample('A', loffset='-1d').last().dropna()
fx = web.DataReader('DEXJPUS', "fred", start, end).dropna()
fx = fx.resample('A', loffset='-1d').last().dropna()
workpop = workpop['1972':].resample('A', loffset='-1d').last().dropna()

gdpjpy = gdp.MKTGDPJPA646NWDB * fx.DEXJPUS
gdpjpy = np.log(gdpjpy).dropna()
workpop = np.log(workpop).dropna()

import statsmodels.api as sm
# regress log GDP (in JPY) on log working-age population
x = sm.add_constant(workpop)
model = sm.OLS(gdpjpy, x)
results = model.fit()
print(results.summary())
f2xcoef = np.array([[0.1, 3., 1., 0.],
                    [0., 0., 1.5, 0.1],
                    [3., 2., 1., 0.]])
x0 = np.dot(f0, f2xcoef)
x0 += 0.1 * np.random.normal(size=x0.shape)
ytrue = np.dot(f0, [1., 1., 1.])
y0 = ytrue + 0.1 * np.random.normal(size=ytrue.shape)

xred, fact, eva, eve = pca(x0, keepdim=0)
print(eve)
print(fact[:5])
print(f0[:5])

import statsmodels.api as sm

res = sm.OLS(y0, sm.add_constant(x0, prepend=False)).fit()
print('OLS on original data')
print(res.params)
print(res.aic)
print(res.rsquared)

# print('OLS on Factors')
# for k in range(x0.shape[1]):
#     xred, fact, eva, eve = pca(x0, keepdim=k, normalize=1)
#     fact_wconst = sm.add_constant(fact)
#     res = sm.OLS(y0, fact_wconst).fit()
#     print('k =', k)
#     print(res.params)
#     print('aic: ', res.aic)
#     print('bic: ', res.bic)
#     print('llf: ', res.llf)
raw_data = pd.read_csv('../data/logistic-regression/Admittance.csv')
print(raw_data.head())

data = raw_data.copy()
data['Admitted'] = data['Admitted'].map({'Yes': 1, 'No': 0})
print(data.head())

y = data['Admitted']
x1 = data['SAT']

# plot with logistic regression
x = sm.add_constant(x1)
reg_log = sm.Logit(y, x)
results_log = reg_log.fit()


def f(x, b0, b1):
    return np.array(np.exp(b0 + x * b1) / (1 + np.exp(b0 + x * b1)))


f_sorted = np.sort(f(x1, results_log.params[0], results_log.params[1]))
x_sorted = np.sort(np.array(x1))

plt.scatter(x1, y, color='C0')
plt.xlabel('SAT', fontsize=10)
plt.ylabel('Admitted', fontsize=10)
plt.plot(x_sorted, f_sorted, color='C0')
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers based on iqr
    iqr = np.subtract(*np.percentile(df.y, [75, 25]))
    median = np.median(df.y)
    df = df[abs(df.y - median) <= 3 * iqr]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s f2: {:.3f}'.format(lm.rsquared_adj))
cnd_nan = np.isnan(S_D_cand) S_D_cand[cnd_nan] = 0.5 # correct the NaN value of correlation if ((bi-ai)*(bj-aj) < (wint*2+1)*(wint*2+1)): # not an integrate window D_D_cand = 1.0+np.power(np.square(i-x_cand)+np.square(j-y_cand),0.5)/opts.hwid # spatial distance else: D_D_cand = D_D_all[cnd_cand] # integrate window C_D = (1.0-S_D_cand)*D_D_cand+0.0000001 # combined distance weight = (1.0/C_D)/np.sum(1.0/C_D) for iband in range(nb): # compute V fine_cand = np.hstack(((fine1[iband,aj:bj,ai:bi])[cnd_cand],(fine2[iband,aj:bj,ai:bi])[cnd_cand])) coarse_cand = np.hstack(((coarse1[iband,aj:bj,ai:bi])[cnd_cand],(coarse2[iband,aj:bj,ai:bi])[cnd_cand])) coarse_change = abs(np.nanmean((coarse1[iband,aj:bj,ai:bi])[cnd_cand])-np.nanmean((coarse2[iband,aj:bj,ai:bi])[cnd_cand])) if (coarse_change >= opts.dn_max*0.02): # to ensure changes in coarse image large enough to obtain the conversion coefficient regress_result = sm.OLS(fine_cand,sm.add_constant(coarse_cand)).fit() sig = 1.0-stats.f.cdf(regress_result.fvalue,1,number_cand*2-2) # correct the result with no significancy or inconsistent change or too large value if (sig <= 0.05) and (regress_result.params[1] > 0) and (regress_result.params[1] <= 5): V_cand = regress_result.params[1] else: V_cand = 1.0 else: V_cand = 1.0 # compute the temporal weight difc_pair1 = np.abs(np.nanmean((coarse0[iband,aj:bj,ai:bi])[cnd_wind_valid])-np.nanmean((coarse1[iband,aj:bj,ai:bi])[cnd_wind_valid]))+0.01**5 difc_pair2 = np.abs(np.nanmean((coarse0[iband,aj:bj,ai:bi])[cnd_wind_valid])-np.nanmean((coarse2[iband,aj:bj,ai:bi])[cnd_wind_valid]))+0.01**5 T_weight1 = (1.0/difc_pair1)/(1.0/difc_pair1+1.0/difc_pair2) T_weight2 = (1.0/difc_pair2)/(1.0/difc_pair1+1.0/difc_pair2)
# random search for MLE starting parameters. Because Markov switching models # are often characterized by many local maxima of the likelihood function, # performing an initial optimization step can be helpful to find the best # parameters. # # Below, we specify that 20 random perturbations from the starting # parameter vector are examined and the best one used as the actual starting # parameters. Because of the random nature of the search, we seed the random # number generator beforehand to allow replication of the result. mod_filardo = sm.tsa.MarkovAutoregression( dta_filardo.iloc[2:]['dlip'], k_regimes=2, order=4, switching_ar=False, exog_tvtp=sm.add_constant(dta_filardo.iloc[1:-1]['dmdlleading'])) np.random.seed(12345) res_filardo = mod_filardo.fit(search_reps=20) res_filardo.summary() # Below we plot the smoothed probability of the economy operating in a # low-production state, and again include the NBER recessions for # comparison. fig, ax = plt.subplots(figsize=(12, 3)) ax.plot(res_filardo.smoothed_marginal_probabilities[0]) ax.fill_between(usrec.index, 0,
# DO NOT EDIT # # Robust Linear Models from __future__ import print_function import numpy as np import statsmodels.api as sm import matplotlib.pyplot as plt from statsmodels.sandbox.regression.predstd import wls_prediction_std # ## Estimation # # Load data: data = sm.datasets.stackloss.load() data.exog = sm.add_constant(data.exog) # Huber's T norm with the (default) median absolute deviation scaling huber_t = sm.RLM(data.endog, data.exog, M=sm.robust.norms.HuberT()) hub_results = huber_t.fit() print(hub_results.params) print(hub_results.bse) print( hub_results.summary( yname='y', xname=['var_%d' % i for i in range(len(hub_results.params))])) # Huber's T norm with 'H2' covariance matrix hub_results2 = huber_t.fit(cov="H2")
def linear_fit(x, y, constant=True):
    if constant:
        x = _sm.add_constant(x)
    fit = _sm.OLS(y, x).fit()
    out = (fit.params, fit.fittedvalues, fit.resid)
    return out
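# A short usage sketch for linear_fit above (assumes statsmodels.api is imported
# as _sm and that x, y are 1-D NumPy arrays; the sample data here is made up):
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=100)
y = 2.0 + 3.0 * x + rng.normal(scale=0.5, size=100)

params, fitted, resid = linear_fit(x, y)
print(params)        # approximately [2.0, 3.0] -> intercept, slope
print(resid.std())   # residual spread around the fitted line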
# train test split
from sklearn.model_selection import train_test_split

X = df_dum.drop('avg_salary', axis=1)
y = df_dum.avg_salary.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# multiple linear regression
import statsmodels.api as sm

X_sm = sm.add_constant(X)
model = sm.OLS(y, X_sm)
model.fit().summary()

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score

lm = LinearRegression()
lm.fit(X_train, y_train)

np.mean(cross_val_score(lm, X_train, y_train, scoring='neg_mean_absolute_error', cv=3))
def linreg(x, y):
    x = sm.add_constant(x)
    model = regression.linear_model.OLS(y, x).fit()
    X = x[:, 1]
    return model.params[0], model.params[1]
def make_a_feature_list(data, feat):
    X = data[feat]
    Y = sm.add_constant(X)  # add a constant
    return Y