def _fit_backward(self):
    """Refit the stored model by backward elimination on p-values.

    Rebuilds the training data from the currently stored fit, then repeatedly
    drops the non-intercept regressor with the largest p-value and refits,
    until no remaining p-value exceeds ``self.sig_level_removal``. The final
    fit results replace ``self._model``.
    """
    obs_index = self._observations_idx
    fitted_model = self._model.model
    y_train = pd.Series(fitted_model.endog.copy(),
                        name=self.dependent_variable,
                        index=obs_index)
    X_train = pd.DataFrame(fitted_model.exog,
                           columns=fitted_model.exog_names,
                           index=obs_index)

    results = OLS(y_train, X_train, missing='drop').fit()
    while True:
        candidate_pvalues = results.pvalues.drop('Intercept')
        # Written as "not >" so a NaN maximum (no candidates left) ends the loop.
        if not candidate_pvalues.max() > self.sig_level_removal:
            break
        X_train = X_train.drop(candidate_pvalues.idxmax(), axis=1)
        results = OLS(y_train, X_train, missing='drop').fit()

    self._model = results
def remove_outliers(train, targetField, dropVal, studentResid, verbose=True):
    """
    Remove outliers from training data based on statsmodels OLS fit
    studentized residuals and a specified drop value across features.

    :param pandas.DataFrame train: data for training
    :param str targetField: name of the target column in ``train``
    :param obj dropVal: rows holding this value in any column are dropped;
        ``None`` disables this filter
    :param float studentResid: rows are kept only when the absolute value of
        their studentized residual is below this threshold; ``None`` disables
        this filter
    :param bool verbose: flag to print out OLS summary information and number
        of outliers removed
    :return: the filtered training data
    :rtype: pandas.DataFrame
    """
    train = train.dropna()
    if dropVal is not None:
        # ``.ix`` no longer exists in pandas; a boolean mask belongs in ``.loc``.
        train = train.loc[(train.T != dropVal).all()]

    design = train[[col for col in train if col != targetField]]
    target = train[targetField]
    design = StandardScaler().fit_transform(design)
    model = OLS(target, design)

    # Fit at most once; the original refit the model for the summary.
    results = model.fit() if (studentResid is not None or verbose) else None

    mask = np.ones(train.shape[0], dtype=bool)
    if studentResid is not None:
        # Use the caller's threshold (it was previously hard-coded to 2).
        mask = results.outlier_test()['student_resid'].abs() < studentResid

    if verbose:
        print(results.summary())
        print('Removed:' + str(train.shape[0] - int(mask.sum())))
    return train.loc[mask]
def calc_gwi(obs, obs_years, reg_type='mon', base_low=1850., base_high=1900, name=''):
    """Regress observed temperatures on modelled anthropogenic and natural warming.

    The observations are re-expressed relative to their mean over the base
    period ``[base_low, base_high + 1)``. Total and anthropogenic forcings are
    read from a fixed text file, run through ``fair_scm`` to obtain natural and
    anthropogenic temperature responses, re-based over the same period and
    interpolated onto ``obs_years``. An OLS fit with a constant then gives one
    scale factor per response.

    :param obs: observed temperature series
    :param obs_years: time axis of ``obs``
    :param reg_type: ``'mon'`` interpolates onto ``obs_years`` directly; any
        other value interpolates onto ``obs_years + 0.5``
    :param base_low: first year of the base period (inclusive)
    :param base_high: last year of the base period (inclusive)
    :param name: label printed in front of the scale factors
    :return: tuple ``(awi, nwi)`` of the scaled anthropogenic and natural responses
    """
    def _in_base_period(year_axis):
        return np.logical_and(year_axis >= base_low, year_axis < (base_high + 1))

    # Express the observations relative to the base period
    obs = obs - np.mean(obs[_in_base_period(obs_years)])

    # Load the forcing table: column 0 is the year, 13 the total forcing,
    # 14 the anthropogenic forcing
    forc_file = './Data/Annualforcings_Mar2014_GHGrevised.txt'
    data = np.genfromtxt(forc_file, skip_header=4)
    years = data[:, 0]
    tot_forc = data[:, 13]
    ant_forc = data[:, 14]

    # Integrate natural (total minus anthropogenic) and anthropogenic forcing
    C, t_nat = fair_scm(other_rf=tot_forc - ant_forc)
    C, t_anthro = fair_scm(other_rf=ant_forc)

    # Express the modelled responses relative to the same base period
    forcing_base = _in_base_period(years)
    t_nat = t_nat - np.mean(t_nat[forcing_base])
    t_anthro = t_anthro - np.mean(t_anthro[forcing_base])

    # Interpolate the annual responses (placed at mid-year) onto the
    # observation grid
    forcing_mid_year = years + 0.5
    if reg_type != 'mon':
        target_grid = obs_years + 0.5
    else:
        target_grid = obs_years
    t_nat = np.interp(target_grid, forcing_mid_year, t_nat)
    t_anthro = np.interp(target_grid, forcing_mid_year, t_anthro)

    # Linearly project the observation times lying beyond the last forcing
    # mid-year, continuing the slope of the last two covered points.
    # NOTE(review): the factor 12 presumably assumes monthly spacing - confirm.
    last_mid_year = years[-1] + 0.5
    beyond = obs_years > last_mid_year
    covered = obs_years <= last_mid_year
    for response in (t_anthro, t_nat):
        inside = response[covered]
        response[beyond] = (12 * (inside[-1] - inside[-2])
                            * (obs_years[beyond] - obs_years[covered][-1])
                            + inside[-1])

    # OLS regression of the observations on the anthropogenic and natural
    # responses plus a constant
    y = np.copy(obs)
    x = DataFrame({'x1': (t_anthro), 'x2': (t_nat)})
    x = statsmodels.tools.tools.add_constant(x)
    result = OLS(y, x).fit()

    # Output scaling factors for the two responses
    sf = result.params
    awi = t_anthro * sf['x1']   # scaled anthropogenic warming index
    nwi = t_nat * sf['x2']      # scaled natural warming index
    gwi = awi + nwi             # total forced warming index (computed, not returned)

    print(name, ' AWI scale factor: ', sf['x1'], '\n', name, ' NWI scale factor: ', sf['x2'])
    return awi, nwi
def alpha_beta(self):
    """Regress ``self.r - 1`` on the row-wise mean of ``self.X - 1``.

    Returns the fitted constant multiplied by 252 and the fitted slope, as the
    tuple ``(alpha, beta)``.
    """
    mean_excess = (self.X - 1).mean(1)
    endog = self.r - 1
    design = np.vstack([np.ones(len(self.r)), mean_excess]).T
    coefs = OLS(endog, design).fit().params
    return coefs.const * 252, coefs.x1
def linear_regression(data):
    """
    Fit a straight line to every series in ``data``.

    Each value of ``data`` is regressed on its own positions 0, 1, ...,
    len - 1 together with a column of ones.

    input parameter :
        - data : mapping (e.g. the content of a json file) from key to a
          sequence of values

    output :
        - dict mapping each key to ``[coefficient, intercept]``

    packages :
        - numpy (ones, arange, column_stack)
        - statsmodels.api (OLS)
    """
    dict_linreg = {}
    for key, series in data.items():
        n_obs = len(series)
        # Column 0 is the intercept term, column 1 the position index.
        design = np.column_stack((np.ones(n_obs), np.arange(0, n_obs)))
        coefs = OLS(series, design).fit().params
        dict_linreg[key] = [coefs[1], coefs[0]]
    return dict_linreg
def testPow(n):
    """Return the R-squared of regressing SalePrice on ``OverallQual ** n``.

    Reads the module-level ``trainData`` frame; a constant column is added to
    the single powered regressor before fitting.
    """
    quality = trainData.OverallQual.values.reshape(-1, 1)
    prices = trainData.SalePrice
    design = sm.add_constant(quality ** n)
    return OLS(prices.values, design).fit().rsquared
def _capm(self):
    """Fit the regression of adjusted returns on ``self.ucrp_r`` less the risk-free rate.

    The per-period risk-free rate is ``self.rf_rate / self.freq()``. The
    dependent variable is ``self.r - 1`` less that rate scaled by the non-cash
    share (``1 - CASH`` when ``self.B`` has a ``CASH`` column, otherwise 1).
    Returns the fitted OLS results.
    """
    rfr = self.rf_rate / self.freq()
    benchmark_excess = self.ucrp_r - rfr
    cash = self.B.CASH if 'CASH' in self.B.columns else 0

    endog = self.r - 1 - (1 - cash) * rfr
    exog = np.vstack([np.ones(len(self.r)), benchmark_excess - 1]).T
    return OLS(endog, exog).fit()
def stats_models(self, X_train, y_train, show_summary=False):
    '''
    Perform OLS with statsmodels and return the fitted results.

    A constant column is added to ``X_train`` before fitting.

    :param X_train: regressors, without a constant column
    :param y_train: dependent variable
    :param show_summary: when True, print the regression summary
    :return: the fitted statsmodels results object
    '''
    X = sm.add_constant(X_train)
    results_stats = OLS(y_train, X).fit()
    if show_summary:
        # The summary used to be built and thrown away; print it so the
        # flag has a visible effect.
        print(results_stats.summary())
    return results_stats
def intermediate():
    """Fit the node-local linear regression and store its intermediate results.

    Reads the dependent and independent variables from ``io_helper``, fits an
    OLS model with an explicit intercept column and saves a JSON document with
    the summary, column names, column means, ``X^T * X``, row count and scale.
    When no rows remain after null removal, zero-valued placeholders are saved
    instead.

    :raises errors.UserError: if no independent variable is selected or the
        dependent variable is nominal
    """
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    # Validate the request before fetching or featurizing anything, so the
    # user-facing error is not pre-empted by a failure on an empty variable list.
    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    # Distributed linear regression only works for continuous variables
    if utils.is_nominal(dep_var):
        raise errors.UserError(
            'Dependent variable must be continuous in distributed mode. Use SGD Regression for '
            'nominal variables instead.')

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data), columns=featurizer.columns, index=data.index)

    if data.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {
            'summary': {},
            'columns': [],
            'means': 0,
            'X^T * X': 0,
            'count': 0,
            'scale': 0,
        }
    else:
        # Compute linear-regression
        X.insert(loc=0, column='intercept', value=1.)
        flm = OLS(y, X).fit()
        logging.info(flm.summary())
        output = format_output(flm)
        result = {
            'summary': output,
            'columns': list(X.columns),
            'means': X.mean().values,
            'X^T * X': X.T.values.dot(X.values),
            'count': len(X),
            'scale': flm.scale,
        }

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
def est_via_ols(self):
    """
    Estimate average treatment effects with Linear Regression.

    The outcome ``self.data.Y`` is regressed on the treatment ``self.data.Z``
    (first column) followed by the covariates ``self.data.X``. The treatment
    coefficient is the effect estimate and its HC0 robust standard error is
    the reported uncertainty.

    :return: whatever ``self._get_results(ate, se)`` returns
    """
    regressor = np.zeros((self.data.n, 1 + self.data.X.shape[1]))
    regressor[:, 0] = self.data.Z
    regressor[:, 1:] = self.data.X
    reg_results = LinearRegression(self.data.Y, regressor).fit()
    ate = reg_results.params[0]
    # HC0_se is already a standard error (square root of the HC0 covariance
    # diagonal); taking another square root would distort it.
    se = reg_results.HC0_se[0]
    return self._get_results(ate, se)
def half_life(spread):
    """Estimate the mean-reversion half-life of ``spread`` in whole periods.

    The one-period change of the spread is regressed on its lagged level (with
    a constant); the half-life is ``-log(2) / slope``, rounded to the nearest
    integer and floored at 1.

    :param spread: pandas series of spread values
    :return: half-life as an int, at least 1; 1 is also returned when the
        slope is not negative (no mean reversion detected)
    """
    lag = spread.shift(1)
    lag.iloc[0] = lag.iloc[1]   # back-fill the value lost by shifting
    ret = spread - lag
    ret.iloc[0] = ret.iloc[1]
    res = OLS(ret, add_constant(lag)).fit()

    # Position 1 is the lag coefficient (position 0 is the constant). Use
    # .iloc: integer keys on a labelled Series are deprecated as positions.
    slope = res.params.iloc[1]
    if slope >= 0:
        # A positive slope already clamped to 1; a zero slope used to crash
        # on int(inf), so route it to the same floor.
        return 1
    return max(int(round(-log(2) / slope, 0)), 1)
def get_half_life_from_scratch(stockX, stockY, beta, df_is):
    """Return the truncated half-life ``-log(2) / slope`` of the series from ``get_z``.

    The series is lagged by one step (first lag set to 0) and its one-step
    change is regressed on that lag plus an intercept; ``slope`` is the
    coefficient on the lag. Called in get_df_coint.
    """
    z_array = get_z(stockX, stockY, beta, df_is)

    lagged = np.roll(z_array, 1)
    lagged[0] = 0   # np.roll wraps the last value round to the front; discard it
    change = z_array - lagged

    # add_constant puts the intercept first, so params[1] is the lag slope
    fitted = OLS(change, add_constant(lagged)).fit()
    return int(-np.log(2) / fitted.params[1])
def alpha_analysis(y, x, parameters, name_parameters, latex_name_parameters, name_fig, CI=True):
    """Fit one no-intercept OLS per row of ``y`` and plot the coefficient path.

    For every row ``y[k]`` the single coefficient on ``x`` is stored together
    with its p-value, adjusted R-squared and HC0 robust standard error. The
    coefficients are plotted against ``parameters`` (optionally with a 95%
    band) and the figure is saved under ``pictures\\cva``.

    :param y: 2-d array, one dependent series per row
    :param x: single regressor shared by all fits
    :param parameters: x-axis value for each row of ``y``
    :param name_parameters: column name for ``parameters`` in the result
    :param latex_name_parameters: x-axis label of the figure
    :param name_fig: file name (without extension) of the saved figure
    :param CI: draw the 95% confidence band when True
    :return: DataFrame with columns ``name_parameters``, ``alpha``,
        ``R_{adj}`` and ``p-value``
    """
    alphas = []
    pvalues = []
    rsquared_adj = []
    s = []
    for k in range(0, y.shape[0]):
        fitted = OLS(endog=y[k], exog=x).fit()  # no intercept by default
        alphas.append(*fitted.params)
        pvalues.append(*fitted.pvalues)
        rsquared_adj.append(fitted.rsquared_adj)
        # cov_HC0 holds variances; the band below needs a standard error.
        s.append(fitted.cov_HC0[0, 0] ** 0.5)

    df = DataFrame({
        name_parameters: parameters,
        'alpha': alphas,
        'p-value': pvalues,
        'R_{adj}': rsquared_adj
    })
    df = df[[name_parameters, 'alpha', 'R_{adj}', 'p-value']]

    alphas = array(alphas)
    s = array(s)
    fig = figure()
    plot(parameters, alphas, 'blue', label=r'$\alpha$')
    if CI:
        half_width = t.ppf(0.975, len(x) - 1) * s
        plot(parameters, alphas - half_width, color='red')
        plot(parameters, alphas + half_width, color='red', label='95% CI')
    legend()
    xlabel(latex_name_parameters, fontsize=14)
    ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
    ylabel(r'$\alpha$', fontsize=14)
    grid(True, linestyle='--')
    xlim(xmin=min(parameters), xmax=max(parameters))
    # Same path as before, with the backslash escaped explicitly ('\c' is an
    # invalid escape sequence).
    fig.savefig('pictures\\cva' + '\\' + name_fig + '.png')
    return df
def fit_efficiency_model(
    p_in, p_out, p_in_density, efficiency, use_monthly_dummies=False, use_time=False
):
    """Regress efficiency on input power density and remove that dependence.

    The regressors are ``p_in_density``, optionally a sequential counter
    (``use_time``) and month dummies (``use_monthly_dummies``), plus a
    constant. The returned series is the constant, plus the density
    coefficient times the mean density, plus the residuals, plus the fitted
    counter term when ``use_time`` is set.

    Returns the tuple ``(fit_result, efficiency_without_pin)``.
    """
    # local import to suppress warning in unit tests, see:
    # https://github.com/statsmodels/statsmodels/issues/7139
    from statsmodels.api import OLS
    from statsmodels.tools.tools import add_constant

    regressors = pd.DataFrame({"p_in_density": p_in_density})

    if use_time:
        # not really time, just a sequentially increasing number
        regressors["time"] = range(len(regressors))

    if use_monthly_dummies:
        # a constant is added below, so one month has to be dropped
        month_dummies = pd.get_dummies(p_in_density.time.dt.month, drop_first=True)
        regressors = regressors.join(month_dummies)

    # other possible parameters:
    #  - specific power
    #  - turbine age

    regressors = add_constant(regressors)
    fit_result = OLS(efficiency.values, regressors).fit()
    coefs = fit_result.params

    efficiency_without_pin = (
        coefs.const
        + coefs.p_in_density * p_in_density.mean().values
        + fit_result.resid
    )
    if use_time:
        efficiency_without_pin += coefs.time * regressors["time"]

    # note: this might be broken if lengths of p_in and p_out do not match up
    assert len(p_in) == len(efficiency_without_pin), "input lengths do not match"

    efficiency_without_pin = xr.ones_like(p_in) * efficiency_without_pin
    return fit_result, efficiency_without_pin
def fit(xyz, xlim=None, ylim=None, zlim=None, **kwargs):
    """Fit the plane ``z = a*x + b*y + c`` to the points in ``xyz`` by OLS.

    Rows are kept only when the absolute value of each coordinate is below its
    limit; a limit that is ``None`` (or otherwise falsy) places no restriction
    on that coordinate.

    :param xyz: 2-d array whose first three columns are x, y and z
    :param xlim: optional bound on ``|x|``
    :param ylim: optional bound on ``|y|``
    :param zlim: optional bound on ``|z|``
    :param kwargs: accepted and ignored
    :return: tuple ``(coeffs, stderr)`` with the fitted parameters (x slope,
        y slope, constant) and their HC1 robust standard errors
    """
    # Start from a genuinely all-True mask. The previous empty_like() left
    # uninitialised memory, so unrestricted axes dropped rows at random.
    keep = numpy.ones_like(xyz[:, 0], dtype=bool)
    for column, limit in enumerate((xlim, ylim, zlim)):
        if limit:
            keep = numpy.logical_and(keep, numpy.abs(xyz[:, column]) < limit)

    XYZ = xyz[keep, :]
    XY = add_constant(XYZ[:, :2], prepend=False)
    Z = XYZ[:, -1]
    result = OLS(Z, XY).fit()
    return result.params, result.HC1_se
def linear(data, **kwargs):
    '''linear regression model fitted with ordinary least squares

    Parameters
    ----------
    data : array
        first column is endogenous, the remaining columns are the exogenous
        data (per the original notes the second column is a column of ones)

    ** Keyword Arguments **

    prior_type : str
        'uniform' or 'collinear adjusted dilution'

    Returns
    -------
    rslts : array
        1-d array made of, in order: number of observations, posterior
        (``exp(log-likelihood) * prior``), R-squared, parameter estimates,
        p-values, standard errors and the per-parameter products of each
        estimate with its covariance-to-variance adjustment

    Raises
    ------
    ValueError
        if ``prior_type`` is not one of the supported names
    '''
    prior_type = kwargs.get('prior_type', 'uniform')

    endog, exog = data[:, [0]], data[:, 1:]
    model = OLS(endog=endog, exog=exog, missing='drop')

    # Covariance of every regressor with the endogenous column, scaled by
    # the endogenous variance.
    joint_cov = np.cov(np.hstack((model.wexog, endog)), rowvar=0)
    adj = (joint_cov[:-1, -1] / np.var(endog)).reshape((-1, 1))

    fit = model.fit()
    par_rsquared = fit.params.reshape((-1, 1)) * adj

    if prior_type == 'collinear adjusted dilution':
        prior = collinear_adj_prior(exog)
    elif prior_type == 'uniform':
        prior = 1.
    else:
        raise ValueError('prior {} not supported'.format(prior_type))

    posterior = math.exp(fit.llf) * prior
    pieces = (fit.nobs, posterior, fit.rsquared, fit.params,
              fit.pvalues, fit.bse, par_rsquared.flat)
    return np.hstack(pieces)
class OLSRegressor(BaseRegressor):
    """Polynomial least-squares regressor backed by statsmodels ``OLS``.

    ``calculate`` fits ``self.ys`` against the design matrix from
    ``self._get_X()`` and stores the model in ``self._ols`` and the fit
    results in ``self._result``.
    """
    degree = Property(depends_on='_degree')
    _degree = Int
    constant = None

    def __degree_changed(self):
        self.calculate()

    def calculate(self):
        '''
            vander is equivalent to sm.add_constant(np.column_stack((x**n,..x**2,x**1)))
            vander(x,n+1)

            Does nothing when either data series is empty, when the two series
            differ in length, or when no design matrix is available.
        '''
        if not len(self.xs) or \
                not len(self.ys):
            return

        if len(self.xs) != len(self.ys):
            return

        ys = asarray(self.ys)
        X = self._get_X()
        if X is not None:
            try:
                self._ols = OLS(ys, X)
                self._result = self._ols.fit()
            # "except Exception, e" / "print e" only parse on Python 2; this
            # spelling is valid on both Python 2.6+ and Python 3.
            except Exception as e:
                print(e)
def linear(data, **kwargs):
    '''linear regression model fitted with ordinary least squares

    Parameters
    ----------
    data : array
        first column is endogenous, the remaining columns are the exogenous
        data (per the original notes the second column is a column of ones)

    ** Keyword Arguments **

    prior_type : str
        'uniform' or 'collinear adjusted dilution'

    Returns
    -------
    rslts : array
        1-d array made of, in order: number of observations, posterior
        (``exp(log-likelihood) * prior``), R-squared, parameter estimates,
        p-values, standard errors and the per-parameter products of each
        estimate with its covariance-to-variance adjustment

    Raises
    ------
    ValueError
        if ``prior_type`` is not one of the supported names
    '''
    prior_type = kwargs.get('prior_type', 'uniform')

    endog, exog = data[:, [0]], data[:, 1:]
    model = OLS(endog=endog, exog=exog, missing='drop')

    # Covariance of every regressor with the endogenous column, scaled by
    # the endogenous variance.
    joint_cov = np.cov(np.hstack((model.wexog, endog)), rowvar=0)
    adj = (joint_cov[:-1, -1] / np.var(endog)).reshape((-1, 1))

    fit = model.fit()
    par_rsquared = fit.params.reshape((-1, 1)) * adj

    if prior_type == 'collinear adjusted dilution':
        prior = collinear_adj_prior(exog)
    elif prior_type == 'uniform':
        prior = 1.
    else:
        raise ValueError('prior {} not supported'.format(prior_type))

    posterior = math.exp(fit.llf) * prior
    pieces = (fit.nobs, posterior, fit.rsquared, fit.params,
              fit.pvalues, fit.bse, par_rsquared.flat)
    return np.hstack(pieces)
def est_via_dml(self, outcome_model=OLS(), treatment_model=OLS()):
    """Estimate the treatment effect per cluster size by double machine learning.

    The outcome and the treatment are each regressed on the covariates with
    the supplied models. The residuals are regrouped by cluster size through
    ``ClusterData``, and the outcome residual is regressed on the treatment
    residual and on ``g * treatment residual``.

    :param outcome_model: model whose ``fit(X, y)`` result offers
        ``insample_predict()``; used for the outcome
    :param treatment_model: same interface; used for the treatment
    :return: dict with arrays ``'beta(g)'`` and ``'se'`` of length
        ``max(cluster sizes)``, holding for ``g = 0 .. size_max - 1`` the
        effect ``params[0] + params[1] * g`` and its HC0 standard error
    """
    n_obs = self.data.n
    Y = np.zeros(n_obs)
    Xc = np.zeros((n_obs, self.data.covariate_dims))
    Z = np.zeros(n_obs)
    Labels = np.zeros(n_obs)
    size_max = max(self.data.data_by_size.keys())

    # Stack the per-size groups into flat arrays.
    idx = 0
    for y, z, g, xc, labels in self.data.data_by_size.values():
        rows = slice(idx, idx + len(y))
        Y[rows] = y
        Xc[rows] = xc
        Z[rows] = z
        Labels[rows] = labels
        idx += len(y)

    # First stage: partial the covariates out of outcome and treatment.
    outcome_reg = outcome_model.fit(Xc, Y)
    treatment_reg = treatment_model.fit(Xc, Z)
    y_res = Y - outcome_reg.insample_predict()
    z_res = Z - treatment_reg.insample_predict()

    data = ClusterData(y_res, z_res,
                       np.zeros((n_obs, self.data.X.shape[1])), Labels,
                       self.data.cluster_feature, self.data.n_moments, False)

    # Second stage: stack the residual groups again.
    z_g_res = np.zeros((n_obs, 2))
    y_res = np.zeros(n_obs)
    idx = 0
    for y, z, g, xc, labels in data.data_by_size.values():
        rows = slice(idx, idx + len(y))
        y_res[rows] = y
        z_g_res[rows, 0] = z
        z_g_res[rows, 1] = g * z
        # This increment was missing: every group overwrote the rows starting
        # at 0 and the remaining rows stayed zero.
        idx += len(y)

    result = LinearRegression(y_res, z_g_res).fit()
    ret = {'beta(g)': np.zeros(size_max), 'se': np.zeros(size_max)}
    cov_HC0 = result.cov_HC0
    for g in range(size_max):
        ret['beta(g)'][g] = result.params[0] + result.params[1] * g
        test_arr = np.array([1, g])
        ret['se'][g] = np.sqrt(test_arr.dot(cov_HC0[:2, :2]).dot(test_arr))
    return ret
def est_via_ols(self):
    """Estimate the treatment effect per cluster size with one linear regression.

    The outcome is regressed on the treatment ``Z``, the interaction
    ``G * Z`` and the covariates. Returns a dict with arrays ``'beta(g)'`` and
    ``'se'`` of length ``max(cluster sizes)``, holding for
    ``g = 0 .. size_max - 1`` the effect ``params[0] + params[1] * g`` and its
    HC0 standard error.
    """
    n_obs = self.data.n
    groups = self.data.data_by_size
    y = np.zeros(n_obs)
    regressor = np.zeros((n_obs, 2 + self.data.covariate_dims))
    size_max = max(list(groups.keys()))

    # Stack the per-size groups into one design matrix.
    start = 0
    for Y, Z, G, Xc, labels in groups.values():
        stop = start + len(Y)
        y[start:stop] = Y
        regressor[start:stop, 0] = Z
        regressor[start:stop, 1] = G * Z
        regressor[start:stop, 2:] = Xc
        start = stop

    result = LinearRegression(y, regressor).fit()

    betas = np.zeros(size_max)
    ses = np.zeros(size_max)
    ret = {'beta(g)': betas, 'se': ses}
    cov_HC0 = result.cov_HC0
    effect_cov = cov_HC0[:2, :2]
    for g in range(size_max):
        betas[g] = result.params[0] + result.params[1] * g
        contrast = np.array([1, g])
        ses[g] = np.sqrt(contrast.dot(effect_cov).dot(contrast))
    return ret
def alpha_analysis_hull(y, x, bS, bV, name_dataframe=''):
    """Fit one no-intercept OLS for every ``(b_S, b_V)`` cell of ``y``.

    ``y[k, l]`` is regressed on ``x``; the single coefficient, its HC0
    standard error, p-value, adjusted R-squared and 95% confidence bounds are
    collected in one row per cell, labelled with ``bS[k]`` and ``bV[l]``.

    When ``name_dataframe`` is not empty the table is also passed to
    ``save_dataframe``. Returns the DataFrame.
    """
    alphas, pvalues, rsquared_adj, s = [], [], [], []
    duplicate_b_S, duplicate_b_V = [], []

    n_rows, n_cols = y.shape[0], y.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            # no intercept by default
            fitted = OLS(endog=y[row, col, ], exog=x).fit()
            alphas.append(*fitted.params)
            pvalues.append(*fitted.pvalues)
            rsquared_adj.append(fitted.rsquared_adj)
            s.append(fitted.HC0_se[0])
            duplicate_b_S.append(bS[row])
            duplicate_b_V.append(bV[col])

    s = array(s)
    half_width = t.ppf(0.975, len(x) - 1) * s
    CI_up = alphas + half_width
    CI_low = alphas - half_width

    column_order = [
        'b_S', 'b_V', 'alpha', 'Standard Error', 'R_{adj}', 'p-value',
        'CI95_low', 'CI95_up'
    ]
    df = DataFrame({
        'b_S': duplicate_b_S,
        'b_V': duplicate_b_V,
        'alpha': alphas,
        'Standard Error': s,
        'p-value': pvalues,
        'R_{adj}': rsquared_adj,
        'CI95_up': CI_up,
        'CI95_low': CI_low
    })
    df = df[column_order]

    if name_dataframe != '':
        save_dataframe(name_dataframe, df)
    return df
def linear_regression(data):
    """Fit a straight line to every series in ``data``.

    Each value of ``data`` is regressed on its own positions 0, 1, ...,
    len - 1 together with a column of ones.

    :param data: mapping from key to a sequence of values
    :return: dict mapping each key to ``[coefficient, intercept]``
    """
    dict_linreg = {}
    # intercept value and coefficient calculation
    for k, v in data.items():
        mat_x = np.ones((len(v), 2))
        mat_x[:, 1] = np.arange(0, len(v))
        results = OLS(v, mat_x).fit()
        # Column 0 of mat_x is the constant, so params[0] is the intercept
        # and params[1] the slope. The unused accumulator lists and the bare
        # ``results.rsquared`` expression did nothing and were removed.
        dict_linreg[k] = [results.params[1], results.params[0]]
    return dict_linreg
sum_of_squares = df['difference'].apply(square).sum() return(sum_of_squares) x0 = [-20, .0008, 1.1] estimator(x0) optimize.minimize(estimator, x0, method='nelder-mead', options={'xtol': 1e-8, 'disp': True}) clf = linear_model.LinearRegression() x = df[['AADT', 'L']].as_matrix() y = df['Crashes'] clf.fit(x, y) clf.coef_ clf.intercept_ model = OLS(y, add_constant(x)) model_fit = model.fit() model_fit.summary() def estimator(x, row_in='Crashes'): estimated = lambda row: exp(x[0] + x[1] * row['AADT'] + x[2] * row['L']) df['estimated'] = df.apply(estimated, axis=1) #probability = lambda row: (row['estimated']**row[row_in] * exp(-row['estimated'])) / factorial(row[row_in]) probability = lambda row: poisson.pmf(row[row_in], row['estimated']) df['probability'] = df.apply(probability, axis=1) product = df['probability'].product() return(-product) x0 = [1.6, .0000026, .032] estimator(x0) optimize.minimize(estimator, x0, method='nelder-mead', options={'xtol': 1e-8, 'disp': True})
7711, 9692, 11791, 14380, 17205, 20438, 24324, 28018, 31161, 34546, 37198, ]) x = np.arange(len(confirmed)) x = add_constant(x) model = OLS(np.log(confirmed[:14]), x[:14]) result = model.fit() result.summary() plt.plot( np.exp(result.predict(x[:14])), label="Prédiction du fonction exp", ) plt.plot(confirmed[:14], ".", label="Cas réels, CN") plt.legend() plt.xlabel("jours") plt.ylabel("nombres de malades") plt.show() world_population = 7763252653 days = 0 infected = confirmed[14] while infected < world_population: days += 1
def _fit_regression(X, y):
    """Fit OLS of ``y`` on ``X`` and return its formatted output with metadata.

    The summary is logged at INFO level. Returns ``(format_output(fit),
    metadata)`` where ``metadata`` holds the string forms of ``summary()`` and
    ``summary2()``.
    """
    flm = OLS(y, X).fit()
    logging.info(flm.summary())

    metadata = dict()
    metadata['summary'] = str(flm.summary())
    metadata['summary2'] = str(flm.summary2())

    formatted = format_output(flm)
    return formatted, metadata
def m(x, nu=0, gamma=1):
    """Return ``(pi / 2) * sinh(gamma) / (cosh(gamma) - cos(x - nu))``."""
    return (0.5 * math.pi) * np.sinh(gamma) / (np.cosh(gamma) - np.cos(x - nu))


### Create data
X = np.random.multivariate_normal(np.ones(k), sigma, size=[
    N,
])
# The noise vectors need one entry per row of X; a fixed length of 500 only
# worked when N happened to be 500.
U = np.random.standard_normal(size=[
    N,
])
V = np.random.standard_normal(size=[
    N,
])
# D has to exist before Y, because Y is built from it.
D = m(np.dot(X, b)) + V
Y = np.dot(theta, D) + g(np.dot(X, b)) + U
OLS_model = OLS(Y, D)
result = OLS_model.fit()

### Naive double machine learning
naiveMl1 = RandomForestRegressor()  # X -> Y
naiveMl1.fit(X, Y)
Vhat1 = Y - naiveMl1.predict(X)
naiveMl2 = RandomForestRegressor()  # X -> D
naiveMl2.fit(X, D)
# Residualise D with the model trained on D (naiveMl1 was used here by mistake).
Vhat2 = D - naiveMl2.predict(X)
np.mean(np.dot(Vhat1, Vhat2)) / np.mean(np.dot(Vhat2, D))
def _model(self, X, y):
    """Fit OLS of ``y`` on ``X``, print the summary and return the results."""
    result = OLS(y, X).fit()
    # The function-call form parses on Python 3 and prints the same text on
    # Python 2 for a single argument.
    print(result.summary())
    return result
def linregress(df, X, y):
    """Return the OLS summary of regressing ``_value(df, y)`` on ``_items(df, X)``."""
    regressors = _items(df, X)
    response = _value(df, y)
    return OLS(response, regressors).fit().summary()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.api import OLS
from statsmodels.api import add_constant


def fit_linear(df, columns):
    '''
    Fit an OLS model of ``df.saleprice`` on the given columns plus a constant.

    Parameters:
    df: Dataframe holding the target in its ``saleprice`` column
    columns: List of columns to be used as predictors for ``saleprice``
    ----------------------------
    Returns:
    Prints summary and returns fit OLS model
    '''
    y = df.saleprice
    X = df[columns]
    # ``sm`` was never imported in this snippet; add_constant is imported
    # directly instead.
    X = add_constant(X)
    lr = OLS(y, X)
    model = lr.fit()
    print(model.summary())
    return model
R = dailyret.iloc[ t - lookback + 1:t + 1, ].T # here the columns of R are the different observations. hasData = np.where(R.notna().all(axis=1))[0] R.dropna(inplace=True) # avoid any stocks with missing returns avgR = R.mean(axis=1) R = R.values - avgR.values.reshape( (R.shape[0], 1)) # subtract mean from returns covR = pd.DataFrame( R.T).cov() # compute covariance matrix, with observations in rows. B, X = eig( covR ) # X is the factor exposures matrix, B the variances of factor returns X = X[:, 0:numFactors] # Retain only numFactors model = OLS(R[:, -1], X) results = model.fit() b = results.params # b are the factor returns for time period t-1 to t. Rexp = avgR + np.dot( X, b ) # Rexp is the expected return for next period assuming factor returns remain constant. idxSort = Rexp.argsort() positionsTable[t, hasData[idxSort.values[np.arange(0, topN)]]] = -1 positionsTable[t, hasData[idxSort.values[np.arange(-topN, 0)]]] = 1 capital = np.nansum(np.array(abs(positionsTable).shift()), axis=1) positionsTable[capital == 0, ] = 0 capital[capital == 0] = 1 ret = np.nansum( np.array(pd.DataFrame(positionsTable).shift()) * np.array(dailyret), axis=1) / capital
class CrossSectionalModelLinear(CrossSectionalModelBase):
    """Cross-sectional linear model fitted with statsmodels ``OLS``.

    Hyper parameters come from ``paraDict`` or, when given, from the JSON file
    at ``jsonPath`` (which takes precedence). The only parameter read here is
    ``fit_intercept`` (default True), which controls whether a constant column
    is added before fitting and predicting.
    """

    def __init__(self, jsonPath=None, paraDict=None):
        # A fresh dict per instance; a ``{}`` default would be shared by
        # every instance created without arguments.
        self.parameter = {} if paraDict is None else paraDict
        if jsonPath is not None:
            with open(jsonPath, 'r') as f:
                # json.load reads from a file object; json.loads expects a
                # string and raised TypeError here.
                self.parameter = json.load(f)
        self.fit_intercept = self.parameter.get('fit_intercept', True)
        self.model = None

    def fit(self, X_train, y_train):
        """Fit the model and return the statsmodels results object."""
        if self.fit_intercept:
            X_train = sm.add_constant(X_train)
        self.model = OLS(y_train, X_train)
        self.res = self.model.fit()
        return self.res

    def predict(self, X):
        """Predict for ``X``, adding the constant column when the model uses one."""
        if self.fit_intercept:
            X = sm.add_constant(X)
        return self.res.predict(X)

    def get_para(self):
        """Return the hyper parameters as a DataFrame, or print a notice and
        return None when none were supplied."""
        if self.parameter != {}:
            return pd.DataFrame.from_dict(self.parameter,
                                          orient='index',
                                          columns=['ParaValue'])
        else:
            print('Hyper parameters are default')

    def get_model(self):
        """Return the fit results, or print a notice and return None before ``fit``."""
        try:
            return self.res
        except AttributeError:
            print('fit your model first!')
            return None

    def get_score(self, y_real, **kwargs):
        '''
        get score of the prediction based on the scoreMethod
        ----
        y: y_real

        kwargs:
            scoreMethod: str
                'r2': r2_score (default)
                'mse': mean_squared_error
                'mae': mean_absolute_error

            X: ndarray, input X to get y_pred
            y_pred: input y_pred directly (takes precedence over X)

        raises:
            ValueError if neither ``y_pred`` nor ``X`` is given
            KeyError if ``scoreMethod`` is not one of the names above
        '''
        if 'y_pred' in kwargs:
            y_pred = kwargs['y_pred']
        elif 'X' in kwargs:
            # Go through self.predict so the constant column is added the
            # same way as at fit time.
            y_pred = self.predict(kwargs['X'])
        else:
            raise ValueError("get_score needs either 'y_pred' or 'X'")

        methodDict = {
            'r2': r2_score,
            'mse': mean_squared_error,
            'mae': mean_absolute_error,
        }
        scoreMethod = methodDict[kwargs.get('scoreMethod', 'r2')]
        return scoreMethod(y_real, y_pred)

    def get_coef(self):
        '''
        get estimated coefficients for the linear regression problem
        '''
        return self.res.params

    def get_model_summary(self):
        '''
        get summary of the model

        return
        ----
        summary of model: coef, pvalue, t-statistics, R2, R2_adj...
        '''
        return self.res.summary()
if compo_counter == 3: y_train[line_counter][0] = data[first_key][second_key] line_counter += 1 compo_counter = 0 else: x_train[line_counter][compo_counter] = data[first_key][second_key] compo_counter += 1 ## reshape datasets np.reshape(x_train, (-1, 1)) np.reshape(y_train, (-1, 1)) ## create and summary model model = OLS(y_train, x_train) y_pred = model.fit() #print(y_pred.summary()) ## predict the answer pred = y_pred.predict(x_train) ## calculate RSME line_counter = 0 while 1: data = json_data["data"] if len(data) <= line_counter: break loss[line_counter][0] = abs(y_train[line_counter][0] - int(pred[line_counter])) line_counter += 1
# Set the last ten columns aside. NOTE(review): presumably sector dummies,
# judging by the name - confirm against where X is built.
sectors = X.iloc[:, -10:]
# Standardise every remaining column within each ticker (subtract the mean,
# divide by the standard deviation), re-attach the columns set aside above and
# replace missing values with 0.
X = (X.drop(sectors.columns, axis=1)
     .groupby(level='ticker')
     .transform(lambda x: (x - x.mean()) / x.std())
     .join(sectors)
     .fillna(0))

# ### 1-Day Returns

# In[14]:

# Regress the 'target_1d' column of y on X plus a constant and print the summary.
target = 'target_1d'
model = OLS(endog=y[target], exog=add_constant(X))
trained_model = model.fit()
print(trained_model.summary())

# ### 5-Day Returns

# In[21]:

# Same regression for the 'target_5d' column; ``model`` and ``trained_model``
# are rebound to this fit.
target = 'target_5d'
model = OLS(endog=y[target], exog=add_constant(X))
trained_model = model.fit()
print(trained_model.summary())

# #### Obtain the residuals
def fit_ols(y, x, idx=-1):
    """Fit OLS of ``y`` on ``x`` plus a constant.

    Returns the tuple ``(coefficient, variance)`` for the parameter at
    position ``idx`` (default: the last one), where the variance is the
    matching diagonal entry of the parameter covariance matrix.
    """
    results = OLS(y, add_constant(x)).fit()
    coefficient = results.params.values[idx]
    variance = results.cov_params().values[idx, idx]
    return coefficient, variance
def linregress_loose(X, y, *args, **kwargs):
    """Return the OLS summary of ``y`` regressed on the series in ``X``.

    Every element of ``X`` and ``y`` itself goes through ``_series``; the
    converted regressors are zipped into rows of observations. Extra
    positional and keyword arguments are forwarded to ``fit``.
    """
    columns = [_series(x) for x in X]
    rows = list(zip(*columns))
    response = _series(y)
    fitted = OLS(response, rows).fit(*args, **kwargs)
    return fitted.summary()
import statsmodels.api as sm # In[65]: ols=OLS(timevncats,sm.add_constant(X)) # In[66]: ols=ols.fit() nclients=Clientes.shape[0] predtime=(ols.predict([1,nclients,nclients**2])/60/60)[0] print('Full data set should take %i hours' % int(predtime))