def _fit(self):
    """
    Fit the regression of ``self.y`` on ``self.x``.

    The outcome of the Levene test decides how observations are weighted.
    A p-value above 0.05 is treated as homogeneous variance: unit weights
    are used and the model is labelled 'OLS'. Otherwise the coefficients
    returned by ``self._get_weights()`` define the weights
    ``(c + d * x) ** 2`` and the model is labelled 'WLS'. In both cases
    the weights are passed to GLS as ``sigma``.

    Side effects: sets ``self.weights``, ``self.c``, ``self.d`` and
    ``self.model_type`` and prints which variant was chosen.

    Returns the fitted statsmodels GLS results object. For further
    manipulation of that object see the statsmodels documentation.
    """
    p_value = self._levene_test()
    if not p_value > 0.05:
        print('Variance is not homogenious, assuming WLS')
        self.model_type = 'WLS'
        self.c, self.d = self._get_weights()
        self.weights = (self.c + self.d * self.data[self.x]) ** 2
    else:
        self.weights = np.repeat(1, len(self.data[self.x]))
        print('Variance is homogenious, assuming OLS')
        self.c, self.d = 1, 1
        self.model_type = 'OLS'

    formula = f'{self.y}~{self.x}'
    unfitted = gls(formula, data=self.data, sigma=self.weights)
    return unfitted.fit()
def best_formula(dataframe, response):
    """Greedy forward selection of a GLS formula ranked by adjusted R-squared.

    Starting from an empty model, every remaining column is tried in turn as
    an additional regressor; the candidate with the highest adjusted
    R-squared is kept. This repeats until every column has been added, and
    the step with the best adjusted R-squared overall is reported.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data holding the response and all candidate regressors.
    response : str
        Name of the response column.

    Returns
    -------
    tuple
        ``(formula, summary)`` where ``formula`` is the winning formula
        string and ``summary`` is a one-row DataFrame with the columns
        ``index``, ``formula``, ``adjr2`` and ``ssr``.

    Raises
    ------
    KeyError
        If ``response`` is not a column of ``dataframe``.
    ValueError
        If ``dataframe`` has no column other than ``response``.
    """
    remaining = set(dataframe.columns)
    remaining.remove(response)
    if not remaining:
        raise ValueError(
            'dataframe must contain at least one column besides the response')

    # The design data does not change between candidates, so build it once.
    design = sm.add_constant(dataframe)

    selected, results = [], []
    while remaining:
        scores_with_candidates = []
        for candidate in remaining:
            formula = '{} ~ {}'.format(response,
                                       ' + '.join(selected + [candidate]))
            lm = smf.gls(formula, design).fit()
            scores_with_candidates.append(
                (lm.rsquared_adj, candidate,
                 [formula, lm.rsquared_adj, lm.ssr]))
        # Tuples sort by score first, so the last element is the best one.
        scores_with_candidates.sort()
        _, best_candidate, best_metrics = scores_with_candidates.pop()
        results.append(best_metrics)
        remaining.remove(best_candidate)
        selected.append(best_candidate)

    summary = pd.DataFrame(results, columns=['formula', 'adjr2', 'ssr'])
    summary = summary.sort_values('adjr2', axis=0,
                                  ascending=False)[:1].reset_index()
    # Read the string directly instead of parsing the repr of a numpy array,
    # which would mangle formulas containing quotes or brackets.
    best = summary['formula'].iloc[0]
    return best, summary
def adl_regression(dataframe,
                   target_variable,
                   lags_dict=None,
                   cov_type='nonrobust',
                   model_type='gls'):
    """Fit a regression of a target on lagged copies of selected columns.

    With ``lags_dict`` given, a regression frame is built whose ``target``
    column is ``dataframe[target_variable]`` with the first ``max_lag`` rows
    dropped, plus one column ``<name>_<lag>`` per requested lag, shifted so
    that all columns line up. Without ``lags_dict`` the dataframe is used as
    is; it must then already contain the response in a column named
    ``target`` (``target_variable`` is not consulted in that case).

    The response is regressed on every other column of the regression frame.

    :param dataframe: DataFrame containing the required variables
    :param target_variable: (str) name of the dependent variable
    :param lags_dict: (dict) keys are names (str) of the columns of
        ``dataframe`` used in the regression, values (list) are the lags to
        use for each of those variables
    :param cov_type: ['nonrobust', 'HC0', 'HC1', 'HC2', 'HC3'], only used
        when ``model_type == 'ols'``
    :param model_type: ['ols', 'gls']; any value other than 'ols' fits GLS
    :return: fitted statsmodels results object
    """
    if lags_dict is not None:
        max_lag = 0
        for varname in lags_dict:
            largest = np.max(lags_dict[varname])
            if largest > max_lag:
                max_lag = largest

        columns = {'target': dataframe[target_variable].values[max_lag:]}
        for varname in lags_dict:
            values = dataframe[varname].values
            n_obs = len(values)
            for lag in lags_dict[varname]:
                # Shift by `lag` while keeping the same length as the target.
                columns['{}_{}'.format(varname, lag)] = \
                    values[max_lag - lag:n_obs - lag]
        data_for_regression = pd.DataFrame(columns, columns=list(columns))
    else:
        data_for_regression = dataframe

    # Use every column except the response. Slicing off the last column
    # instead would put 'target' on the right-hand side and drop the final
    # lagged regressor whenever lags are used.
    regressors = [
        '{}'.format(name) for name in data_for_regression.columns
        if name != 'target'
    ]
    formula = 'target ~ ' + (' + '.join(regressors) if regressors else '1')

    if model_type == 'ols':
        return smf.ols(formula,
                       data=data_for_regression).fit(cov_type=cov_type)
    return smf.gls(formula, data=data_for_regression).fit()
def runModel(experiment,
             data,
             dependentVariable,
             independentVariables,
             regressionType='ols'):
    """Build and fit a formula-based statsmodels regression.

    The formula string comes from ``modelString(experiment,
    dependentVariable, independentVariables)``. ``regressionType`` selects
    the estimator: 'ols', 'gls' or 'rlm'. Any other value prints a message
    and terminates the process via ``sys.exit()``.

    Returns the fitted results object.
    """
    import statsmodels.formula.api as smf

    modelStr = modelString(experiment, dependentVariable,
                           independentVariables)

    if regressionType not in ('ols', 'gls', 'rlm'):
        print('Unknown regression type {}. Exiting'.format(regressionType))
        import sys
        sys.exit()

    # The supported type names match the formula-API constructor names.
    constructor = getattr(smf, regressionType)
    return constructor(modelStr, data=data).fit()
def cpgls(responseList,
          intercept,
          phyCovMatrix,
          subMatrix,
          colVector,
          subWeight=1,
          colnum=0):  # colnum is dummy for now
    """Set up a GLS model of the response on one column of residues.

    Each entry of the flattened ``colVector`` is encoded with ``aa2vec``;
    the resulting matrix is passed to ``adjustCovMatrix`` together with
    ``phyCovMatrix``, ``subMatrix`` and ``subWeight`` to obtain the
    covariance used as ``sigma``. The intercept is fixed at the passed
    value by subtracting it from every response and fitting without an
    intercept term.

    Returns the GLS model object; ``fit()`` is not called here.
    """
    # exogenous "STATE matrix": n rows x 20 cols
    encoded_rows = []
    for residue in colVector.flatten().tolist():
        encoded_rows.append(aa2vec(residue, aalphabet))
    exog = np.array(encoded_rows)

    covMatrix = adjustCovMatrix(phyCovMatrix, subMatrix, exog, subWeight)

    # using the formula API
    data = pd.DataFrame()
    # Assigning responseList directly would force the intercept through 0;
    # instead fix the intercept at the passed value.
    data["response"] = [observed - intercept for observed in responseList]
    data["aa"] = colVector.flatten()

    model = smf.gls(formula=("response ~ aa + 0"), data=data, sigma=covMatrix)
    return model
def gls_formula(data, xseq, **params):
    """
    Fit GLS using a formula

    Reads ``enviroment``, ``formula``, ``method_args``, ``se`` and, when
    ``se`` is truthy, ``level`` from ``params``. Returns a DataFrame with
    the columns ``x`` (from ``xseq``) and ``y`` (predictions), plus ``se``,
    ``ymin`` and ``ymax`` when ``se`` is truthy.
    """
    eval_env = params['enviroment']
    formula = params['formula']
    init_kwargs, fit_kwargs = separate_method_kwargs(params['method_args'],
                                                     sm.GLS, sm.GLS.fit)

    fitted = smf.gls(formula, data, eval_env=eval_env,
                     **init_kwargs).fit(**fit_kwargs)

    data = pd.DataFrame({'x': xseq})
    data['y'] = fitted.predict(data)

    if not params['se']:
        return data

    _, predictors = dmatrices(formula, data, eval_env=eval_env)
    prstd, iv_l, iv_u = wls_prediction_std(fitted,
                                           predictors,
                                           alpha=1 - params['level'])
    data['se'] = prstd
    data['ymin'] = iv_l
    data['ymax'] = iv_u
    return data
# Rows for one player, with each statistic cast to a numeric dtype.
Virat = df[df["Player"] == "V Kohli"]
Vir_runs = Virat["Runs"].astype(int)
Vir_min = Virat["Minutes"].astype(int)
Vir_balls = Virat["Balls Faced"].astype(int)
Vir_fours = Virat["Fours"].astype(int)
Vir_sixes = Virat["Sixes"].astype(int)
Vir_sr = Virat["Strike Rate"].astype(float)

# Averages of each statistic. print(...) with a single argument behaves the
# same on Python 2 and Python 3; the bare print statement is a SyntaxError
# on Python 3.
print(np.mean(Vir_min))
print(np.mean(Vir_balls))
print(np.mean(Vir_fours))
print(np.mean(Vir_sixes))
print(np.mean(Vir_sr))
print(np.mean(Vir_runs))

# Some cool visualizations: runs on the x axis, marker size scaled by a
# third statistic.
plt.scatter(Vir_runs, Vir_balls, color='red', alpha=0.5,
            s=Vir_fours * 100, facecolor="white")
plt.scatter(Vir_runs, Vir_min, color='red', alpha=0.5,
            s=Vir_balls * 10, facecolor="white")
plt.scatter(Vir_runs, Vir_balls, color='red', alpha=0.5,
            s=Vir_sr * 10, facecolor="white")

# Dependency of runs on some factors
import statsmodels.formula.api as smf

est = smf.gls(formula='Vir_runs ~ Vir_balls + Vir_fours + Vir_sixes + Vir_sr',
              data=Virat).fit()
# The summary is only displayed in an interactive session; it is not printed.
est.summary()
est = smf.gls(
    formula='Vir_runs ~ Vir_balls + Vir_fours + Vir_sixes + Opposition',
    data=Virat).fit()
est.summary()
def add_covariate_model(self,
                        label,
                        covariate,
                        model,
                        restriction=None,
                        recode=None,
                        var_type='binary',
                        print_results=True):
    """Add a regression model for a time-varying confounder.

    Unlike the exposure and outcome models, a covariate model does NOT have
    to be specified. *n* covariate models can be specified for *n*
    time-varying covariates by calling this function repeatedly with the
    corresponding covariates and regression equations.

    Parameters
    ----------
    label : int
        Integer label for the covariate model. Covariate models are fit in
        ascending order within TimeVaryGFormula
    covariate : str
        Column label of the time-varying confounder to be predicted
    model : str
        Right-hand side of the model formula, in patsy format. The variables
        must be contained in the input pandas dataframe.
        For example) 'var1 + var2 + var3 + var4'
    restriction : str, optional
        Expression used to restrict the population the model is fit to.
        Useful for Intent-to-Treat model fitting. The pandas dataframe must
        be referred to as 'g'. For example) "g['art']==1"
    recode : str, optional
        Code that recreates functional forms while the g-formula is
        estimated via fit(). For a quadratic age term, for example, set
        recode="g['age_sq'] = g['age']**2;". As with TimeFixedGFormula, 'g'
        must be used as the DataFrame name, and each executable statement
        should end with ';'. The code is stored here and executed later.
    var_type : str, optional
        Type of the covariate. Current options are 'binary' or 'continuous'
    print_results : bool, optional
        Whether to print the fitted model results to the terminal. Default
        is True

    Raises
    ------
    ValueError
        If ``label`` is not exactly of type int, or ``var_type`` is neither
        'binary' nor 'continuous'.
    """
    if type(label) is not int:
        raise ValueError('Label must be an integer')

    # Building predictive model. The local name `g` must be kept: the
    # restriction expression refers to the data under that name.
    g = self.gf.copy()
    if restriction is not None:
        # NOTE(review): eval() runs arbitrary code; `restriction` must only
        # ever come from a trusted source.
        g = g.loc[eval(restriction)].copy()

    weighted = self._weights is not None
    if var_type == 'binary':
        family = sm.families.family.Binomial()
        if weighted:
            m = smf.glm(covariate + ' ~ ' + model,
                        g,
                        freq_weights=g[self._weights],
                        family=family)
        else:
            m = smf.glm(covariate + ' ~ ' + model, g, family=family)
    elif var_type == 'continuous':
        if weighted:
            m = smf.wls(covariate + ' ~ ' + model,
                        g,
                        weights=g[self._weights])
        else:
            m = smf.gls(covariate + ' ~ ' + model, g)
    else:
        raise ValueError(
            'Only binary or continuous covariates are currently supported')

    f = m.fit()
    if print_results:
        rule = '=============================================================================='
        print(rule)
        print('Covariate (' + str(covariate) + ') Model')
        print(f.summary())
        print(rule)

    # Stored in parallel lists; used later to predict the time-varying
    # covariates.
    self._covariate_models.append(f)
    self._covariate_model_index.append(label)
    self._covariate.append(covariate)
    self._covariate_type.append(var_type)
    # Must be a string for exec() to use later
    self._covariate_recode.append('None' if recode is None else recode)
def linear_new(types, intput):
    """Fit a linear model on a CSV file and export tables and a plot.

    The CSV at ``intput`` is read into a DataFrame; its last column is the
    response and all other columns are regressors. ``types`` selects the
    estimator: "ols", "gls", "glsar" or "wls". Any other value prints a
    message and calls ``exit(0)``.

    Outputs:
      * the data, the summary and both result tables are printed;
      * the coefficient table is written to ``out/data1.csv``;
      * the model-level statistics are written to ``out/data2.csv``;
      * with one or two regressors a figure is saved to
        ``out/plot_out.png``; with more, only a message is printed.

    Returns None.
    """
    np.random.seed(9876789)
    df = pd.read_csv(intput, index_col=False)
    print(df)
    print(df.columns[:-1])
    # Every column except the last is a regressor; the last is the response.
    feature = df.columns[:-1]
    s1 = ' + '.join(feature)
    s2 = df.columns[-1]
    s = s2 + " ~ " + s1
    if types == "ols":
        results = smf.ols(s, data=df).fit(use_t=True)
    elif types == "gls":
        results = smf.gls(s, data=df).fit(use_t=True)
    elif types == "glsar":
        results = smf.glsar(s, data=df).fit(use_t=True)
    elif types == "wls":
        results = smf.wls(s, data=df).fit(use_t=True)
    else:
        print("No this type!!!")
        exit(0)
    print(
        "**********************************************************************************\n"
    )
    alpha = 0.05
    print(results.summary())
    # Coefficient table; the last two keys are the confidence bounds,
    # labelled "[0.025" and "0.975]" for alpha = 0.05.
    data_t = {
        "coef": results.params,
        "std err": results.bse,
        "t": results.tvalues,
        "P>|t|": results.pvalues,
        "[" + str(alpha / 2.0): results.conf_int(alpha)[0],
        str(1 - alpha / 2.0) + "]": results.conf_int(alpha)[1]
    }
    sdata_df = pd.DataFrame(data_t)
    print(sdata_df)
    sdata_df.to_csv("out/data1.csv")
    from statsmodels.stats.stattools import (jarque_bera, omni_normtest,
                                             durbin_watson)
    jb, jbpv, skew, kurtosis = jarque_bera(results.wresid)
    omni, omnipv = omni_normtest(results.wresid)
    # `title` and `value` are parallel lists: entry i of `value` belongs to
    # label i of `title`.
    title = [
        "Model", "R-squared", "Adj. R-squared", "F-statistic",
        "Prob (F-statistic)", "Log-Likelihood", "AIC", "BIC", "Omnibus",
        "Prob(Omnibus)", "Skew", "Kurtosis", "Durbin-Watson",
        "Jarque-Bera (JB)", "Prob(JB)", "Cond. No."
    ]
    # NOTE(review): `results.diagn` is presumably populated by the earlier
    # results.summary() call; confirm before reordering these statements.
    value = [
        results.model.__class__.__name__, results.rsquared,
        results.rsquared_adj, results.fvalue, results.f_pvalue, results.llf,
        results.aic, results.bic, omni, omnipv, skew, kurtosis,
        durbin_watson(results.wresid), jb, jbpv, results.diagn['condno']
    ]
    datadf = {"title": np.array(title), "value": np.array(value)}
    select_df = pd.DataFrame(datadf)
    print(select_df)
    select_df.to_csv("out/data2.csv")
    # Draw a 1D or 3D figure
    predicted = results.predict(df)
    import matplotlib.pyplot as plt
    if len(feature) == 1:
        x = np.array(df[feature]).reshape(-1, 1)
        y = np.array(df[s2]).reshape(-1, 1)
        plt.figure(facecolor='white', figsize=(10, 5))
        plt.scatter(x, y, marker='x')
        plt.plot(x, predicted, c='r')
        # `title` is reused here for the figure title.
        title = 'The Linear Graph of One Dimension'
        # Label the x and y axes
        plt.xlabel(feature[0])
        plt.ylabel(s2)
        plt.title(title)
        plt.grid()
        plt.savefig("out/plot_out.png", format='png')
    elif len(feature) == 2:
        from mpl_toolkits.mplot3d import Axes3D
        ax1 = plt.axes(projection='3d')
        x = np.array(df[feature[0]]).reshape(-1, 1)
        y = np.array(df[feature[1]]).reshape(-1, 1)
        z = np.array(df[s2]).reshape(-1, 1)
        ax1.scatter3D(x, y, z, cmap='Blues')  # draw the scatter plot
        ax1.plot3D(x, y, predicted, 'gray')  # draw the curve in 3D space
        ax1.set_xlabel(feature[0])
        ax1.set_ylabel(feature[1])
        ax1.set_zlabel(s2)
        plt.savefig("out/plot_out.png", format='png')
    else:
        print("The number of feature is big than 2 ,no plot!")
    return
import pandas as pd
from statsmodels.formula.api import gls
import seaborn as sns
import matplotlib.pyplot as plt

data = pd.read_csv("optimal_data_frame.csv")
print(data.head())

group = data.groupby(["block", "condition"])

# Fit one GLS model per outcome on the rows of the "random" condition and
# print each summary. `model` ends up holding the last fit.
for formula in (
        "step ~ timestep + block",
        "reaction_time ~ timestep + block",
        "normalized_reaction_time ~ timestep + block",
):
    model = gls(formula, data[data["condition"] == "random"]).fit()
    print(model.summary())

# model = gls("step ~ timestep + block", data[data["condition"] == "block"]).fit()
# print(model.summary())
#
# model = gls("reaction_time ~ timestep + block", data[data["condition"] == "block"]).fit()
# print(model.summary())
#
# model = gls("normalized_reaction_time ~ timestep + block", data[data["condition"] == "block"]).fit()
# print(model.summary())

# model = gls("optimal_p ~ timestep + block",
def fit(self, df, formula):
    """Fit a GLS model given by ``formula`` on ``df`` and return the results."""
    unfitted = smf.gls(formula=formula, data=df)
    return unfitted.fit()
def main(filename):
    """Explore race results: violin plot, GLS fit and a sklearn regression.

    Reads ``{params.ROOT}/../data/{filename}.csv``, saves a violin plot of
    finish time by sex to ``{params.ROOT}/../plots/london_violin.png``,
    prints a GLS summary of finish time on sex and category, then trains a
    one-hot-encoded linear regression on an 80/20 split and prints the head
    of the predictions and three error metrics.

    Parameters
    ----------
    filename : str
        Base name (without ``.csv``) of the results file.
    """
    results = pd.read_csv(
        f"{params.ROOT}/../data/{filename}.csv",
        dtype={
            "Place (Overall)": "Int64",
            "Place (Gender)": "Int64",
            "Place (Category)": "Int64",
            "Name": str,
            "Sex": str,
            "Club": str,
            "Running Number": object,
            "Category": "category",
            "Year": "Int64",
            "Country": str,
            "FirstName": str,
            "LastName": str,
            "DSQ": bool,
            "Finish (Total Seconds)": "float64",
        },
        parse_dates=["Finish"],
    )
    # NOTE(review): "Finish" is requested as a date column and then
    # converted to a timedelta; confirm this round trip is intended.
    results["Finish"] = pd.to_timedelta(results["Finish"])

    # Basic plotting
    sns.violinplot(data=results, x="Sex", y="Finish (Total Seconds)")
    plt.savefig(f"{params.ROOT}/../plots/london_violin.png")

    # Try explanatory linear regression with statsmodels
    mod = smf.gls(formula='Q("Finish (Total Seconds)") ~ Sex'
                  "+ Category",
                  data=results)
    res = mod.fit()
    print(res.summary())

    # Try sklearn linear regression
    # Get label and value arrays of interest, get rid of NaNs
    X = results[["Sex", "Category"]]
    # fillna() with a DataFrame aligns on the row index, so passing
    # X.mode() directly would only fill rows labelled like the mode rows.
    # Use the first mode row so each column is filled with its own mode.
    modes = X.mode()
    if not modes.empty:
        X = X.fillna(modes.iloc[0])
    y = results["Finish (Total Seconds)"]
    y = y.fillna(y.mean())

    # Change categorical variables using onehot to 1/0
    enc = preprocessing.OneHotEncoder()
    X_transform = enc.fit_transform(X)

    # Sample data
    X_train, X_test, y_train, y_test = train_test_split(X_transform,
                                                        y,
                                                        test_size=0.2)
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
    print(df.head())

    # Evaluate algorithm
    mse = metrics.mean_squared_error(y_test, y_pred)
    print("Mean Absolute Error:",
          metrics.mean_absolute_error(y_test, y_pred))
    print("Mean Squared Error:", mse)
    print("Root Mean Squared Error:", np.sqrt(mse))
from sklearn.datasets import load_boston
import pandas as pd
import statsmodels.formula.api as sm

# NOTE(review): load_boston is not available in recent scikit-learn
# releases; confirm the installed version still provides it.
boston_data = load_boston()
boston = pd.DataFrame(data=boston_data.data,
                      columns=boston_data.feature_names)
boston['target'] = boston_data.target

# 80/20 split: `test` is every row that was not sampled into `train`.
train = boston.sample(frac=0.8, random_state=200)
test = boston.drop(train.index)

result = sm.gls(
    formula=
    'target ~ CRIM + ZN +CHAS + NOX + RM + DIS + RAD + TAX + PTRATIO + B + LSTAT',
    data=train).fit()
print(result.summary())

# Find which variable values are optimal by looking at what makes the sum of
# the errors smallest.
params = result.params  # fitted coefficients do not change between rows
sum_difference = 0  # running total of the absolute errors
for i, row in test.iterrows():
    # Manual prediction: coefficient times value for each regressor, plus
    # the intercept.
    estimate = row['PTRATIO']*params['PTRATIO'] + row['NOX']*params['NOX'] + row['B']*params['B'] + \
        row['CHAS']*params['CHAS'] + row['RAD']*params['RAD'] + row['TAX']*params['TAX'] + row['ZN']*params['ZN'] + \
        row['DIS']*params['DIS'] + row['CRIM']*params['CRIM'] + row['RM']*params['RM'] + \
        row['LSTAT']*params['LSTAT'] + params['Intercept']
    difference = abs(row['target'] - estimate)
    sum_difference += difference
    print(difference)
# Some cool visualizations: three scatter plots with runs on the x axis. Each
# pair below is (y values, marker sizes).
for scatter_y, scatter_size in (
        (Vir_balls, Vir_fours * 100),
        (Vir_min, Vir_balls * 10),
        (Vir_balls, Vir_sr * 10),
):
    plt.scatter(Vir_runs,
                scatter_y,
                color='red',
                alpha=0.5,
                s=scatter_size,
                facecolor="white")

# Dependency of runs on some factors
import statsmodels.formula.api as smf

# First model: runs explained by balls faced, fours, sixes and strike rate.
est = smf.gls(formula='Vir_runs ~ Vir_balls + Vir_fours + Vir_sixes + Vir_sr',
              data=Virat).fit()
est.summary()

# Second model: strike rate replaced by the opposition.
est = smf.gls(
    formula='Vir_runs ~ Vir_balls + Vir_fours + Vir_sixes + Opposition',
    data=Virat).fit()
est.summary()
# Squared correlation between actual and predicted salary on the test set.
test7_corr = test7_df['salary'].corr(test7_df['predicted_salary'])
test_variance7 = round(np.power(test7_corr, 2), 3)
print('Test Set Variance Accounted for: ', test_variance7)

fit7 = statsform.wls(model7, data=train7_df, weights=1. / (w**2)).fit()
print(fit7.summary())

## Model 8
## Model 6 using GLS
train8_df = train_df_nooutlines.copy()
test8_df = test_df_nooutlines.copy()

model8 = 'salary ~ conference + wl_ratio + capacity'
train8_fit = statsform.gls(model8, data=train8_df).fit()
train8_df['predicted_salary'] = train8_fit.fittedvalues
test8_df['predicted_salary'] = train8_fit.predict(test8_df)

test8_corr = test8_df['salary'].corr(test8_df['predicted_salary'])
test_variance8 = round(np.power(test8_corr, 2), 3)
print('Test Set Variance Accounted for: ', test_variance8)

# The model is fitted a second time on the training data for the summary.
fit8 = statsform.gls(model8, data=train8_df).fit()
print(fit8.summary())

## Setting some base variables so I can easily change my inputs from
## prediction to prediction
year = '2017'
school = 'Syracuse'
coach = 'Dino Babers'
conference = 'ACC'
"max_depth": [25, 50, 100, None], "n_estimators": [100, 500, 1000], "criterion": ["mse"] } from sklearn.model_selection import GridSearchCV # random_search = GridSearchCV(model, param_grid =param_dist, cv=2) # print('start') # random_search.fit(X=temp[pred_id],y=temp['actual']) # random_search.cv_results_ # random_search.best_score_ # random_search.best_params_ # end hyper gls(data=temp, formula='target~consensus_std+actual_L1+quarterly_ret').fit().summary() model.fit(X=temp[pred_id], y=temp['target']) feat_imp = pd.DataFrame(data=model.feature_importances_, index=pred_id) print(feat_imp.sort_values(0, ascending=False)) test_s['over_hat'] = model.predict(X=test_s[pred_id]) test_s['brut'] = 0 t = (test_s['brut'] == test_s['target']) sum(t) / len(t) test_s['model'] = test_s['consensus_mean'] test_s.loc[test_s.over_hat == 1, 'model'] = test_s.loc[test_s.over_hat == 1, 'model'] * 1.1 # test_s.loc[test_s.over_hat==0,'model'] = test_s.loc[test_s.over_hat==0,'model'] *0.9
def add_covariate_model(self,
                        label,
                        covariate,
                        model,
                        restriction=None,
                        recode=None,
                        var_type='binary',
                        print_results=True):
    """
    Build the model for the specified covariate. This is to deal with
    time-varying confounders. Does NOT have to be specified, unlike the
    exposure and outcome models. The order in which these models are fit is
    based on the provided integer labels

    Input:

    label:
        -integer label for the covariate model. Covariate models are fit in
         ascending order within TimeVaryGFormula
    covariate:
        -variable to be predicted
    model:
        -variables to include in the model for predicting the covariate.
         Must be contained within the input pandas dataframe when
         initialized. Format is the same as the functional form,
         i.e. 'var1 + var2 + var3 + var4'
    restriction:
        -used to restrict the population the model is fit to. Useful for
         Intent-to-Treat model fitting. The pandas dataframe must be
         referred to as 'g'
            Example) "g['art']==1"
    recode:
        -code that recreates functional forms while the g-formula is fit
         via fit(). For a cubic age term, for example, one could set
         recode="g['age_cu'] = g['age']**3;". Similar to TimeFixedGFormula,
         'g' must be used as the data frame name. Lines of executable code
         should end with ';'. The code is stored here and executed later.
    var_type:
        -type of variable that the covariate is. Current options include
         'binary' or 'continuous'
    print_results:
        -whether to print the fitted model results to the terminal.
         Default is True

    Raises ValueError if label is not exactly of type int, or var_type is
    neither 'binary' nor 'continuous'.
    """
    if type(label) is not int:
        raise ValueError('Label must be an integer')

    # Building predictive model. The local name `g` must be kept: the
    # restriction expression refers to the data under that name.
    g = self.gf.copy()
    if restriction is not None:
        # NOTE(review): eval() runs arbitrary code; `restriction` must only
        # ever come from a trusted source.
        g = g.loc[eval(restriction)].copy()

    if self._weights is None:  # Unweighted g-formula
        if var_type == 'binary':
            linkdist = sm.families.family.Binomial(sm.families.links.logit)
            m = smf.glm(covariate + ' ~ ' + model, g, family=linkdist)
        elif var_type == 'continuous':
            # GLS takes no family argument, so no link/distribution object
            # is built for this branch.
            m = smf.gls(covariate + ' ~ ' + model, g)
        else:
            raise ValueError(
                'Only binary or continuous covariates are currently supported'
            )
    else:  # Weighted g-formula
        # Both variable types use GEE; only the family differs.
        if var_type == 'binary':
            linkdist = sm.families.family.Binomial(sm.families.links.logit)
        elif var_type == 'continuous':
            linkdist = sm.families.family.Gaussian(
                sm.families.links.identity)
        else:
            raise ValueError(
                'Only binary or continuous covariates are currently supported'
            )
        m = smf.gee(covariate + ' ~ ' + model,
                    self.idvar,
                    g,
                    weights=g[self._weights],
                    family=linkdist)

    f = m.fit()
    if print_results:
        print(f.summary())

    # Adding to lists, it is used to predict variables later on for the
    # time-varying covariates.
    self._covariate_models.append(f)
    self._covariate_model_index.append(label)
    self._covariate.append(covariate)
    self._covariate_type.append(var_type)
    if recode is None:
        # Must be string for exec() to use later
        self._covariate_recode.append('None')
    else:
        self._covariate_recode.append(recode)