def get_model(self, df, formula='np.log(pris) ~ Kmstand'):
    """Fit an ordinary-least-squares model for ``formula`` on ``df``.

    ``df`` is stored on the instance as ``self.df`` before fitting, and the
    model is built with ``missing='drop'``.

    Args:
        df: Data passed to ``sm.formula.ols`` as ``data``.
        formula: Model formula string handed to ``sm.formula.ols``.

    Returns:
        The object returned by ``.fit()`` on the constructed model.
    """
    # The previous version also built a Treatment contrast from
    # ``df.name.unique()`` that was never used; it only added a hidden
    # requirement for a ``name`` column, so it has been dropped.
    self.df = df
    return sm.formula.ols(formula, data=self.df, missing='drop').fit()
import pandas as pd
from statsmodels.formula.api import ols
from matplotlib import pyplot as plt
# LINEAR REGRESSION USING CATEGORICAL VARIABLES
from patsy.contrasts import Treatment

d = pd.read_csv('linearRegression.csv', sep=',')
print(d.shape)
print(d.head(10))

###########################################
# This part is only for inspection; it is not needed for modeling.
# Treatment contrasts: k categories are coded with k-1 columns.
levels = [1, 2, 3, 4]
# NOTE(review): the original comment described ``reference=0`` (first level
# as reference) while the code passes ``reference=1`` - confirm which
# baseline is intended.
contrast = Treatment(reference=1).code_without_intercept(levels)
print(contrast.matrix)
# Rows of the matrix are indexed from zero, hence the ``- 1`` on race.
print(contrast.matrix[d.race - 1, :][0:20])

############################################
# Fitting the model, with a treatment contrast for race.
model = ols("write~ + C(race, Treatment) + read + math + science", data=d)
rs = model.fit()
print(rs.summary())

print('model predictions: ')
# Observed ``write`` is on the x axis and the model prediction on the y axis,
# so the axis labels below follow that order (they were swapped before).
plt.plot(d.write, rs.predict(d), 'ro')
# Identity line for reference; the colour is given once, in the format string,
# instead of the former conflicting 'r-' plus color='blue'.
plt.plot(d.write, d.write, 'b-')
plt.xlabel("Write")
plt.ylabel('Prediction')
def code_without_intercept(self, levels):
    """Return the intercept-free treatment coding for ``levels``.

    Builds ``Treatment(reference=0)`` and forwards ``levels`` to its
    ``code_without_intercept`` method; the result is returned unchanged.
    """
    treatment_coding = Treatment(reference=0)
    return treatment_coding.code_without_intercept(levels)
# write, for each level of race ((1 = Hispanic, 2 = Asian, 3 = African
# American and 4 = Caucasian)).

hsb2.groupby('race')['write'].mean()

# #### Treatment (Dummy) Coding

# Dummy coding is likely the most well known coding scheme. It compares
# each level of the categorical variable to a base reference level. The base
# reference level is the value of the intercept. It is the default contrast
# in Patsy for unordered categorical factors. The Treatment contrast matrix
# for race would be

import pandas as pd
from patsy.contrasts import Treatment

levels = [1, 2, 3, 4]
contrast = Treatment(reference=0).code_without_intercept(levels)
print(contrast.matrix)

# Here we used `reference=0`, which implies that the first level,
# Hispanic, is the reference category against which the other level effects
# are measured. As mentioned above, the columns do not sum to zero and are
# thus not independent of the intercept. To be explicit, let's look at how
# this would encode the `race` variable.

hsb2.race.head(10)

print(contrast.matrix[hsb2.race - 1, :][:20])

# `sm.categorical` is deprecated in statsmodels in favour of
# `pandas.get_dummies`, so the full indicator matrix is built with pandas.
pd.get_dummies(hsb2.race.values, drop_first=False)

# This is a bit of a trick, as the `race` category conveniently maps to
fico[np.isnan(fico)] = 0

loansData['log_income'] = np.log1p(loansData['Monthly.Income'])

# One indicator column per ownership category; the first column is dropped
# so the remaining columns are measured against it.
ownership_dummies = pd.get_dummies(loansData['Home.Ownership'],
                                   prefix='ownership').iloc[:, 1:]

# Concatenate the dummy variable columns onto the original DataFrame.
data = pd.concat([loansData, ownership_dummies], axis=1)
# The formula parser cannot take a dotted column name as a plain identifier.
data.rename(columns={'Interest.Rate': 'Interest_Rate'}, inplace=True)

est = smf.ols(
    formula=
    "Interest_Rate ~ log_income + ownership_NONE + ownership_OTHER +ownership_OWN + ownership_RENT",
    data=data).fit()
est.summary()

#################################################################
# Same model, letting the formula build the treatment contrast itself.

# NOTE: this is an alias, not a copy, so the in-place rename below also
# renames the column on ``loansData``.
loansData_ = loansData
levels = ['NONE', 'OTHER', 'RENT', 'OWN', 'MORTGAGE']
ownership_dummies1 = Treatment(reference=0).code_without_intercept(levels)
loansData_.rename(columns={'Interest.Rate': 'Interest_Rate'}, inplace=True)

# The categorical factor is home ownership. Wrapping the continuous
# ``log_income`` in C(...) (as before) creates one dummy per distinct income
# value. Q() quotes the dotted column name so the formula can reference it.
mod = smf.ols(
    'Interest_Rate ~ log_income + C(Q("Home.Ownership"), Treatment)',
    data=loansData_).fit()
mod.summary()
def contrasting():
    """Fit ``dep_var`` under five contrast codings and report each summary.

    Does nothing when the module-level contrast specification ``c`` is falsy.
    Otherwise ``c`` is read as a single variable name or a comma-separated
    list of names. Names taken from ``c`` and from
    ``full_model_variable_list`` are reduced to the text after their last
    "/" (so URLs become bare names) and spaces are replaced by underscores.
    ``full_model_variable_list`` is updated in place, and ``c`` is rebound
    when it holds a single name.

    One OLS model is fitted on ``df_final`` per coding scheme (Treatment,
    Simple, Sum, Diff, Helmert). Each description and summary is printed
    and, when ``o`` is not ``None``, appended to the file at path ``o``.
    The Treatment section additionally writes ``full_model`` and a separator
    line to that file first.
    """
    global c
    if not c:
        return

    def _last_segment(name):
        # Keep only what follows the final "/" so URL-style names become
        # plain identifiers.
        return name.split("/")[-1]

    # To account for multiple contrast variables.
    contrastvars = []
    if "," in c:
        contrastvars = [
            _last_segment(part.strip().replace(" ", "_"))
            for part in c.split(",")
        ]
    else:
        # Normalise the single name completely *before* it is compared with
        # the (already normalised) model variables below. Previously spaces
        # were replaced only after that comparison, so a name containing a
        # space was not excluded and ended up in the formula twice.
        c = _last_segment(c).replace(" ", "_")

    for position, name in enumerate(full_model_variable_list):
        full_model_variable_list[position] = _last_segment(name).replace(
            " ", "_")

    ind_vars_no_contrast_var = " + ".join(
        var for var in full_model_variable_list
        if var != c and var not in contrastvars)

    # NOTE(review): with several contrast variables this yields
    # "C(a + b, <coding>)", i.e. one factor built from the expression
    # ``a + b``, not one C(...) term per variable - confirm this is intended.
    contraststring = " + ".join(contrastvars) if contrastvars else c

    # Shared left part of every formula; only the coding name is appended.
    formula_head = (dep_var + " ~ " + ind_vars_no_contrast_var + " + C(" +
                    contraststring + ", ")

    # The unused ``ctrst = <Coding>().code_without_intercept(levels)`` locals
    # were removed: their results were never read, and they could raise
    # (for example on an empty ``levels``) before any model was fitted.

    # NOTE(review): ``Simple`` is defined locally and referenced by name
    # inside the formula string, so the ols() calls presumably have to stay
    # in this function's frame for that name to resolve - confirm before
    # extracting them into a helper.

    # With contrast (treatment coding)
    treatment_description = (
        "Treatment (Dummy) Coding: Dummy coding compares each level of the "
        "categorical variable to a base reference level. The base reference "
        "level is the value of the intercept.")
    print("\n\n" + treatment_description)
    res = ols(formula_head + "Treatment)", data=df_final).fit()
    print("With contrast (treatment coding)")
    print(res.summary())
    if o is not None:
        # ``with`` closes the file even if a write fails.
        with open(o, "a") as out:
            out.write("\n" + full_model)
            out.write(
                "\n\n***********************************************************************************************************"
            )
            out.write("\n\n\n\n" + treatment_description)
            out.write("With contrast (treatment coding)")
            out.write(res.summary().as_text())

    # Defining the Simple class
    def _name_levels(prefix, levels):
        return ["[%s%s]" % (prefix, level) for level in levels]

    class Simple(object):
        """Contrast coding exposing the two ``code_*_intercept`` methods."""

        def _simple_contrast(self, levels):
            nlevels = len(levels)
            contr = -1. / nlevels * np.ones((nlevels, nlevels - 1))
            contr[1:][np.diag_indices(nlevels - 1)] = (nlevels - 1.) / nlevels
            return contr

        def code_with_intercept(self, levels):
            matrix = np.column_stack(
                (np.ones(len(levels)), self._simple_contrast(levels)))
            return ContrastMatrix(matrix, _name_levels("Simp.", levels))

        def code_without_intercept(self, levels):
            matrix = self._simple_contrast(levels)
            return ContrastMatrix(matrix, _name_levels("Simp.", levels[:-1]))

    # Remaining codings share one reporting shape: fit, print the
    # description and summary, then append both to the output file.
    remaining_codings = (
        ("Simple",
         "Simple Coding: Like Treatment Coding, Simple Coding compares each "
         "level to a fixed reference level. However, with simple coding, the "
         "intercept is the grand mean of all the levels of the factors."),
        # Sum / deviation coding
        ("Sum",
         "Sum (Deviation) Coding: Sum coding compares the mean of the "
         "dependent variable for a given level to the overall mean of the "
         "dependent variable over all the levels."),
        # Backward difference coding
        ("Diff",
         "Backward Difference Coding: In backward difference coding, the "
         "mean of the dependent variable for a level is compared with the "
         "mean of the dependent variable for the prior level."),
        # Helmert coding
        ("Helmert",
         "Helmert Coding: Our version of Helmert coding is sometimes referred "
         "to as Reverse Helmert Coding. The mean of the dependent variable "
         "for a level is compared to the mean of the dependent variable over "
         "all previous levels. Hence, the name ‘reverse’ being sometimes "
         "applied to differentiate from forward Helmert coding."),
    )
    for coding_name, description in remaining_codings:
        res = ols(formula_head + coding_name + ")", data=df_final).fit()
        print("\n\n" + description)
        print(res.summary())
        if o is not None:
            with open(o, "a") as out:
                out.write("\n\n\n" + description)
                out.write(res.summary().as_text())