def regression(self):
    print(self.people.head(n=1))
    # all_bios is the dataframe with the consolidated data; the fit fails
    # if the class column is named "class"
    self.people.rename(columns={'class': 'dbpedia_class'}, inplace=True)

    self.logfile.write("\n\n Sum Temp Interest NegBinom")
    m = glm("timeInterest ~ C(gender,Treatment(reference='male')) ",
            data=self.people, family=families.NegativeBinomial()).fit()
    self.logfile.write("\n AIC " + str(m.aic))
    for table in m.summary().tables:
        self.logfile.write(table.as_latex_tabular())

    self.logfile.write("\n\n Sum Temp Interest OLS")
    m = ols("timeInterest ~ C(gender,Treatment(reference='male')) ",
            data=self.people).fit()
    self.logfile.write("\n AIC " + str(m.aic))
    for table in m.summary().tables:
        self.logfile.write(table.as_latex_tabular())

    self.logfile.write("\n\n Pos Temp Interest NegBinom")
    m = glm("timePosInterest ~ C(gender,Treatment(reference='male')) ",
            data=self.people, family=families.NegativeBinomial()).fit()
    self.logfile.write("\n AIC " + str(m.aic))
    self.logfile.write("\n BIC " + str(m.bic))
    for table in m.summary().tables:
        self.logfile.write(table.as_latex_tabular())

    # lim_people = self.people[self.people.timePosInterest > 0]
    self.logfile.write("\n\n Pos Temp Interest OLS")
    m = ols("timePosInterest ~ C(gender,Treatment(reference='male')) ",
            data=self.people).fit()
    self.logfile.write("\n AIC " + str(m.aic))
    self.logfile.write("\n BIC " + str(m.bic))
    for table in m.summary().tables:
        self.logfile.write(table.as_latex_tabular())
def regression(self):
    print(self.people.head(n=1))
    # all_bios is the dataframe with the consolidated data; the fit fails
    # if the class column is named "class"
    self.people.rename(columns={'class': 'dbpedia_class'}, inplace=True)

    self.logfile.write("\n\n Num Regions NegativeBinomial")
    m = glm("numRegions ~ C(gender,Treatment(reference='male')) ",
            # + C(dbpedia_class,Treatment(reference='http://dbpedia.org/ontology/Person')) + birth_century
            data=self.people, family=families.NegativeBinomial()).fit()
    self.logfile.write("\n AIC " + str(m.aic))
    self.logfile.write("\n BIC " + str(m.bic))
    for table in m.summary().tables:
        self.logfile.write(table.as_latex_tabular())

    # lim_people = self.people[self.people.numRegions > 0]
    self.logfile.write("\n\n Num Regions OLS")
    m = ols("numRegions ~ C(gender,Treatment(reference='male')) ",
            # + C(dbpedia_class,Treatment(reference='http://dbpedia.org/ontology/Person')) + birth_century
            data=self.people).fit()
    self.logfile.write("\n AIC " + str(m.aic))
    self.logfile.write("\n BIC " + str(m.bic))
    for table in m.summary().tables:
        self.logfile.write(table.as_latex_tabular())

    # we could use beta regression for normalized entropy
    # print("\n\n Region Entropy")
    # m = ols("entropy ~ C(gender,Treatment(reference='male')) ",
    #         # + C(dbpedia_class,Treatment(reference='http://dbpedia.org/ontology/Person')) + birth_century
    #         data=self.people).fit()
    # print(m.summary())  # <-- the table of coefficients with p-values, confidence intervals, and so on

    self.logfile.write("\n\n Sum Temp Interest")
    m = ols("timeInterest ~ C(gender,Treatment(reference='male')) ",
            data=self.people).fit()
    self.logfile.write("\n AIC " + str(m.aic))
    for table in m.summary().tables:
        self.logfile.write(table.as_latex_tabular())

    self.logfile.write("\n\n Pos Temp Interest")
    m = glm("timePosInterest ~ C(gender,Treatment(reference='male')) ",
            data=self.people, family=families.NegativeBinomial()).fit()
    self.logfile.write("\n AIC " + str(m.aic))
    self.logfile.write("\n BIC " + str(m.bic))
    for table in m.summary().tables:
        self.logfile.write(table.as_latex_tabular())

    # lim_people = self.people[self.people.timePosInterest > 0]
    self.logfile.write("\n\n Pos Temp Interest OLS")
    m = ols("timePosInterest ~ C(gender,Treatment(reference='male')) ",
            data=self.people).fit()
    self.logfile.write("\n AIC " + str(m.aic))
    self.logfile.write("\n BIC " + str(m.bic))
    for table in m.summary().tables:
        self.logfile.write(table.as_latex_tabular())
def generate_regression_models(df):
    # Use the glm function from statsmodels.formula.api (smf, not sm)
    # to create the regression models
    predictors = ("Obesity + Binge_Drinking + Smoking + Primary_Care + "
                  "No_Insurance + Median_Household_Income + College_Degrees + "
                  "Long_Term_Care_Hospital_Admissions + Unemployed_Persons + "
                  "Liquor_Stores")
    heart_deaths = smf.glm(formula="Heart_Disease_Deaths ~ " + predictors, data=df).fit()
    cancer_deaths = smf.glm(formula="Cancer_Deaths ~ " + predictors, data=df).fit()
    diabetes_deaths = smf.glm(formula="Diabetes_Deaths ~ " + predictors, data=df).fit()
    resp_deaths = smf.glm(formula="Respiratory_Disease_Deaths ~ " + predictors, data=df).fit()

    # Collect the fitted models in a list
    models = [heart_deaths, cancer_deaths, resp_deaths, diabetes_deaths]
    return models
def estimate_latency(group):
    '''Use a linear regression to estimate time per unit of work
    from a pandas GroupBy group.'''
    model = smf.glm(formula='Time ~ Work', data=group).fit()
    # pack up information about parameter estimates
    # so they can be programmatically unpacked later
    decoder = {
        'Intercept': 'Overhead',
        'Work': 'Latency',
    }
    for spec, val in (
            ({'statistic': statistic, 'name': decoder[parameter]}, getter(parameter))
            for parameter in model.params.keys()
            for statistic, getter in (
                ('Estimated', lambda param: model.params[param]),
                ('Lower Bound', lambda param: model.conf_int()[0][param]),
                ('Upper Bound', lambda param: model.conf_int()[1][param]),
            )):
        group['{statistic} {name}'.format(**spec)] = val
    return group
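# Usage sketch (not part of the original source): estimate_latency expects
# each group to carry 'Time' and 'Work' columns; the grouping column name
# ('worker') and the synthetic data here are hypothetical.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
jobs = pd.DataFrame({
    'worker': np.repeat(['a', 'b'], 50),
    'Work': np.tile(np.arange(1, 51), 2),
})
jobs['Time'] = 0.5 + 0.1 * jobs['Work'] + rng.normal(scale=0.05, size=len(jobs))

# one regression per worker; each group comes back annotated with
# 'Estimated Overhead', 'Estimated Latency', and their confidence bounds
annotated = jobs.groupby('worker', group_keys=False).apply(estimate_latency)
print(annotated.filter(like='Latency').head())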
def test_formula_missing_exposure():
    # see 2083
    import statsmodels.formula.api as smf
    import pandas as pd

    d = {'Foo': [1, 2, 10, 149], 'Bar': [1, 2, 3, np.nan],
         'constant': [1] * 4, 'exposure': np.random.uniform(size=4),
         'x': [1, 3, 2, 1.5]}
    df = pd.DataFrame(d)

    family = sm.families.Gaussian(link=sm.families.links.log)
    mod = smf.glm("Foo ~ Bar", data=df, exposure=df.exposure, family=family)
    assert_(type(mod.exposure) is np.ndarray, msg='Exposure is not ndarray')

    exposure = pd.Series(np.random.uniform(size=5))
    assert_raises(ValueError, smf.glm, "Foo ~ Bar", data=df,
                  exposure=exposure, family=family)
    assert_raises(ValueError, GLM, df.Foo, df[['constant', 'Bar']],
                  exposure=exposure, family=family)
def __stats_method(n1j, ni1, n11, family):
    '''
    If the expected counts are calculated via a statistical model,
    this function will do so. Expected counts are considered a function
    of n1j and ni1.

    Arguments:
        n1j (iterable): All adverse events for a single product
        ni1 (iterable): Total count of a particular AE across all products
        n11 (iterable): Total count of a particular AE for a particular product
        family (statsmodels family): The GLM family

    Returns:
        The expected counts for n11
    '''
    data = pd.DataFrame({'events': n11, 'prod_events': n1j, 'ae_events': ni1})
    model = smf.glm(formula='events ~ prod_events+ae_events',
                    data=data, family=family)
    model = model.fit()
    if isinstance(family, sm.families.Poisson):
        dispersion = model.pearson_chi2 / model.df_resid
        if dispersion > 2:
            alpha, lb, ub = __test_dispersion(model, data)
            warnings.warn(
                """Variance does not equal the mean! Data likely overdispersed...\n
                Consider utilizing the negative-binomial family instead of poisson.\n
                Cameron-Trivedi alpha: {0:5.4f}, CI: ({1}, {2})""".format(alpha, lb, ub))
    return model.predict(data[['prod_events', 'ae_events']]).values
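# __test_dispersion is referenced above but not shown. A plausible sketch,
# assuming it runs the Cameron-Trivedi auxiliary regression for
# overdispersion and returns alpha with its confidence interval:
import statsmodels.api as sm

def __test_dispersion(model, data):
    mu = model.mu                      # fitted Poisson means
    y = data['events'].values
    # auxiliary OLS without intercept: ((y - mu)^2 - y) / mu = alpha * mu + e
    aux = sm.OLS(((y - mu) ** 2 - y) / mu, mu).fit()
    alpha = aux.params[0]
    lb, ub = aux.conf_int()[0]
    return alpha, lb, ub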
def compute_pse(df, formula='resp ~ morph + location - 1'):
    """
    Compute Point of Subjective Equality based on responses in df.
    This is done by fitting a logit model on the response, and finding
    the point of inflection.

    Parameters
    ----------
    df : pd.DataFrame
        each row is a trial
    formula : str
        formula passed to the glm for fitting

    Returns
    -------
    pse : array (n_locations, )
        the pse estimates for each location
    """
    model = smf.glm(formula, df, family=sm.families.Binomial())
    try:
        res = model.fit()
        # now return the estimates (.values instead of the deprecated .as_matrix())
        p = res.params.values
        pse = -p[:-1] / p[-1]
    except PerfectSeparationError:
        print("WARNING: got PerfectSeparationError, filling pses with 0.5")
        pse = np.ones(len(df.location.unique())) * 0.5
    return pse
def fit_with_logistic(self, threshold=0.5):
    formula = "%s~%s" % (self.y_col, "+".join(self.x_cols))
    model = smf.glm(formula, data=self.train_set, family=sm.families.Binomial())
    result = model.fit()
    predict_probs = result.predict(exog=self.test_set)
    real_values = self.test_set[self.y_col].map(lambda x: 1 if x == 'No' else 0)
    tp.output_table_with_prob(predict_probs, real_values,
                              threshold=threshold,
                              zero_one_col_texts=["Yes", "No"])
def outcome_model(self, model, print_model_results=True):
    """Used to specify the outcome model. The outcome is predicted via a
    logistic regression model.

    model:
        -Independent variables to predict the outcome.
         Example) 'var1 + var2 + var3 + var4'
    print_model_results:
        -Whether to print the fitted model results. Default is True (prints results)
    """
    self._out_model = self._outcome + ' ~ ' + model
    # instantiate the link (passing the bare class is deprecated in statsmodels)
    f = sm.families.family.Binomial(sm.families.links.logit())
    log = smf.glm(self._out_model, self.df, family=f).fit()
    if print_model_results:
        print('\n----------------------------------------------------------------')
        print('MODEL: ' + self._out_model)
        print('-----------------------------------------------------------------')
        print(log.summary())
    dfx = self.df.copy()
    dfx[self._exposure] = 1
    self.df['pY1'] = log.predict(dfx)
    dfx = self.df.copy()
    dfx[self._exposure] = 0
    self.df['pY0'] = log.predict(dfx)
    self._fit_outcome_model = True
def test_all_methods(self):
    x_cols = ["Lag2"]
    formula = "Direction~Lag2"
    # print(self.df.shape[0])
    train_data = self.df.loc[(self.df["Year"] >= 1990) & (self.df["Year"] <= 2008), :]
    # print(train_data.shape[0])

    """ (d) logistic regression """
    model = smf.glm(formula, data=train_data, family=sm.families.Binomial())
    result = model.fit()
    test_data = self.df.loc[self.df["Year"] > 2008, :]
    probs = Series(result.predict(sm.add_constant(test_data[["Lag2"]])))
    pred_values = probs.map(lambda x: "Down" if x > 0.5 else "Up")
    tp.output_table(pred_values.values, test_data[self.y_col].values)

    train_X = train_data[x_cols].values
    train_y = train_data[self.y_col].values
    test_X = test_data[x_cols].values
    test_y = test_data[self.y_col].values

    """ (e) LDA """
    lda_res = LDA().fit(train_X, train_y)
    pred_y = lda_res.predict(test_X)
    tp.output_table(pred_y, test_y)

    """ (f) QDA """
    qda_res = QDA().fit(train_X, train_y)
    pred_y = qda_res.predict(test_X)
    tp.output_table(pred_y, test_y)

    """ (g) KNN """
    clf = neighbors.KNeighborsClassifier(1, weights="uniform")
    clf.fit(train_X, train_y)
    pred_y = clf.predict(test_X)
    tp.output_table(pred_y, test_y)

    """ (h) logistic and LDA """
    """ (i) Is the purpose of the last question going through all methods with no direction? """
def fit_model(self):
    """
    Fits Poisson model

    Returns
    -------
    p_val
        p-values for differential abundance test of all cell types
    """
    p_val = []
    K = self.y.shape[1]
    if self.y.shape[0] == 2:
        p_val = [0 for _ in range(K)]
    else:
        for k in range(K):
            data_ct = pd.DataFrame({"x": self.x[:, 0], "y": self.y[:, k]})
            model_ct = glm('y ~ x', data=data_ct,
                           family=sm.genmod.families.Poisson(),
                           offset=np.log(self.n_total)).fit()
            p_val.append(model_ct.pvalues[1])
    self.p_val = p_val
def logistic_regression(self, use_glm=True):
    """
    (b) It seems the only statistically significant predictor variable is Lag2.
        How disappointing...
    """
    formula = "Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume"
    model = (smf.glm(formula, data=self.df, family=sm.families.Binomial())
             if use_glm
             else smf.logit(formula, data=self.transformedDF))
    result = model.fit()
    if use_glm:
        probs = result.fittedvalues
        # Beware: the prob here is the probability of the index-0 level,
        # so we should use the lambda function below
        pred_values = probs.map(lambda x: 0 if x > 0.5 else 1)
    else:
        # The probability of being 1
        probs = Series(result.predict(sm.add_constant(
            self.df[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume"]])))
        pred_values = probs.map(lambda x: 1 if x > 0.5 else 0)
    """
    (c) Percentage of correct predictions: (54+557)/(54+557+48+430) = 56.1%.
        In weeks when the market goes up, the logistic regression is right
        most of the time: 557/(557+48) = 92.1%. In weeks when the market goes
        down, it is right only 54/(430+54) = 11.2% of the time.
    """
    tp.output_table(pred_values.values, self.transformedDF[self.y_col].values)
def multiple_linear_regression():
    '''Multiple linear regression
    chapter 6.3, p. 98'''

    # get the data from the web
    inFile = r'GLM_data/Table 6.3 Carbohydrate diet.xls'
    df = get_data(inFile)

    # do the fit, for the original model ...
    model = ols('carbohydrate ~ age + weight + protein', data=df).fit()
    print(model.summary())
    print(anova_lm(model))

    # ... and as a GLM (don't shadow the imported glm function)
    glm_model = glm('carbohydrate ~ age + weight + protein',
                    family=Gaussian(), data=df).fit()
    print('Same model, calculated with GLM')
    '''The confidence intervals are different than those from OLS.
    The reason (from Nathaniel Smith): OLS uses a method that gives exact
    results, but only works in the special case where all the usual OLS
    criteria apply - iid Gaussian noise etc. GLM instead uses an approximate
    method which is correct asymptotically but may be off for small samples;
    the tradeoff you get in return is that this method works the same way for
    all GLM models, including those with non-Gaussian error terms and
    non-trivial link functions. So that's why they're different.'''
    print(glm_model.summary())

    # ... and for model 1
    model1 = ols('carbohydrate ~ weight + protein', data=df).fit()
    print(model1.summary())
    print(anova_lm(model1))
def fit_model(self, df, filters, model_expression):
    """
    Use statsmodels GLM to construct a model relation.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to use for fit. Should contain all the columns
        referenced in the `model_expression`.
    filters : list of str
        Any filters to apply before doing the model fit.
    model_expression : str
        A patsy model expression that can be used with statsmodels.
        Should contain both the left- and right-hand sides.

    Returns
    -------
    fit : statsmodels.genmod.generalized_linear_model.GLMResults
    """
    df = util.apply_filter_query(df, filters)
    model = smf.glm(formula=model_expression, data=df,
                    family=sm.families.Poisson())

    if len(model.exog) != len(df):
        raise ModelEvaluationError(
            'Estimated data does not have the same length as input. '
            'This suggests there are null values in one or more of '
            'the input columns.')

    with log_start_finish('statsmodels GLM fit', logger):
        return model.fit()
def test3(self):
    results = pd.read_csv("../Data/results.csv")
    results = results[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']]
    results = results.rename(columns={'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals'})
    results_data = pd.concat([
        results[['HomeTeam', 'AwayTeam', 'HomeGoals']].assign(home=1).rename(
            columns={'HomeTeam': 'team', 'AwayTeam': 'opponent', 'HomeGoals': 'goals'}),
        results[['AwayTeam', 'HomeTeam', 'AwayGoals']].assign(home=0).rename(
            columns={'AwayTeam': 'team', 'HomeTeam': 'opponent', 'AwayGoals': 'goals'})
    ])
    poisson_model = smf.glm(formula="goals ~ home + team + opponent",
                            data=results_data,
                            family=sm.families.Poisson()).fit()
    lfc_lei = simulate_match(poisson_model, "Liverpool", "Leicester")
    homewin = np.sum(np.tril(lfc_lei, -1))
    homeloss = np.sum(np.triu(lfc_lei, 1))
    homedraw = np.sum(np.diag(lfc_lei))
    total = np.ceil(homeloss + homewin + homedraw)
    self.assertNotEqual(0.5, total)
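# simulate_match is used above but not defined in this snippet. A minimal
# sketch of the usual implementation for this kind of model: predict the
# expected goals for each side, then take the outer product of the two
# independent Poisson pmfs to get the scoreline probability matrix.
import numpy as np
import pandas as pd
from scipy.stats import poisson

def simulate_match(model, home_team, away_team, max_goals=10):
    home_xg = model.predict(pd.DataFrame({'team': [home_team],
                                          'opponent': [away_team],
                                          'home': [1]})).values[0]
    away_xg = model.predict(pd.DataFrame({'team': [away_team],
                                          'opponent': [home_team],
                                          'home': [0]})).values[0]
    goals = np.arange(max_goals + 1)
    # rows: home goals, columns: away goals
    return np.outer(poisson.pmf(goals, home_xg), poisson.pmf(goals, away_xg))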
def outcome_model(self, model, print_results=True):
    """Build the model for the outcome. This is also referred to as the
    Q-model. This must be specified before the fit function. If it is not,
    an error will be raised.

    model:
        -variables to include in the model for predicting the outcome. Must be
         contained within the input pandas dataframe when initialized. Model
         form should contain the exposure. Format is the same as the
         functional form, i.e. 'var1 + var2 + var3 + var4'
    print_results:
        -whether to print the logistic regression results to the terminal.
         Default is True
    """
    if self.outcome_type == 'binary':
        linkdist = sm.families.family.Binomial(sm.families.links.logit())
    else:
        linkdist = sm.families.family.Gaussian(sm.families.links.identity())

    # Modeling the outcome
    if self._weights is None:
        m = smf.glm(self.outcome + ' ~ ' + model, self.gf, family=linkdist)
        self.outcome_model = m.fit()
    else:
        m = smf.gee(self.outcome + ' ~ ' + model, self.gf.index, self.gf,
                    family=linkdist, weights=self.gf[self._weights])
        self.outcome_model = m.fit()

    # Printing results of the model and if any observations were dropped
    if print_results is True:
        print(self.outcome_model.summary())
    self.model_fit = True
def fit_model(self):
    """
    Fits CLR model with linear model

    Returns
    -------
    p_val
        p-values for differential abundance test of all cell types
    """
    p_val = []
    K = self.y.shape[1]
    if self.y.shape[0] == 2:
        p_val = [0 for _ in range(K)]
    else:
        # computes clr-transformed data matrix as a pandas DataFrame
        geom_mean = np.prod(self.y, axis=1, keepdims=True) ** (1 / K)
        y_clr = np.log(self.y / geom_mean)
        for k in range(K):
            data_ct = pd.DataFrame({"x": self.x[:, 0], "y": y_clr[:, k]})
            model_ct = glm('y ~ x', data=data_ct).fit()
            p_val.append(model_ct.pvalues[1])
    self.p_val = p_val
def mylogistic(_x, _y):
    x = _x.copy()
    y = _y.copy()
    r, c = x.shape
    beta = np.zeros((c, 1))
    epsilon = 1e-6
    # iteratively reweighted least squares (IRLS) for logistic regression
    while True:
        eta = np.dot(x, beta)
        pr = exp_it(eta)
        w = pr * (1 - pr)
        z = eta + (y - pr) / w
        sw = np.sqrt(w)
        mw = np.repeat(sw, c, axis=1)
        x_work = mw * x
        y_work = sw * z
        beta_new, _, _, _ = np.linalg.lstsq(x_work, y_work, rcond=None)
        err = np.sum(np.abs(beta_new - beta))
        beta = beta_new
        if err < epsilon:
            break
    # fit the same model with statsmodels as a cross-check
    model = smf.glm('admit ~ gre + gpa + rank', df,
                    family=sm.families.Binomial()).fit()
    print(model.summary())
    return model
def get_best_model(train, test, city):
    # Step 1: specify the form of the model
    if city == 'sj':
        model_formula = "total_cases ~ 1 + " \
                        "reanalysis_specific_humidity_g_per_kg + " \
                        "reanalysis_dew_point_temp_k + " \
                        "station_min_temp_c + " \
                        "station_avg_temp_c + " \
                        "reanalysis_relative_humidity_percent "
        # "reanalysis_min_air_temp_k + " \
    elif city == 'iq':
        model_formula = "total_cases ~ 1 + " \
                        "reanalysis_specific_humidity_g_per_kg + " \
                        "reanalysis_dew_point_temp_k + " \
                        "station_min_temp_c + " \
                        "station_avg_temp_c + " \
                        "reanalysis_min_air_temp_k "

    grid = 10**np.arange(-10, -3, dtype=np.float64)

    best_alpha = None
    best_score = 1000

    # Step 2: find the best hyperparameter, alpha
    for alpha in grid:
        model = smf.glm(formula=model_formula, data=train,
                        family=sm.families.NegativeBinomial(alpha=alpha))
        results = model.fit()
        predictions = results.predict(test).astype(int)
        score = eval_measures.meanabs(predictions, test.total_cases)
        if score < best_score:
            best_alpha = alpha
            best_score = score
    # print('best alpha = ', best_alpha)
    # print('best score = ', best_score)

    # Step 3: refit on the entire dataset
    full_dataset = pd.concat([train, test])
    model = smf.glm(formula=model_formula, data=full_dataset,
                    family=sm.families.NegativeBinomial(alpha=best_alpha))
    fitted_model = model.fit()
    return fitted_model
def computeEstabilityTest(df, yv):
    g = smf.glm(formula=df.columns[yv] + "~1", data=df,
                family=sm.families.Poisson()).fit()
    process = numpy.asarray(g.resid_response)
    k = 1
    n = len(process)
    process = process / numpy.sqrt(n)
    meat = numpy.inner(process, process)
    J12 = numpy.sqrt(1 / meat)
    process = J12 * process
    # print(sum(abs(process)))
    from_ = numpy.ceil(n * 0.1)
    from_ = int(max(from_, 10))
    to = int(n - from_)
    lambda_ = ((n - from_) * to) / (from_ * (n - to))
    tt = (numpy.arange(from_, to + 1)) / n
    ttt = (tt * (1.0 - tt))
    pvals = numpy.zeros((df.shape[1]))
    pvals[yv] = numpy.NaN
    if from_ >= to:
        return pvals
    for zv in range(df.shape[1]):
        if zv == yv:
            continue
        zi = df[df.columns[zv]]
        oi = numpy.argsort(zi, kind="mergesort")
        proci = process[oi]
        proci = numpy.cumsum(proci)
        xx = proci**2
        xx = xx[from_ - 1:to]
        stati = numpy.max(xx / ttt)
        # print(stati, k, lambda_)
        pvals[zv] = supLM(stati, k, lambda_)
        # print(pvals[zv])
    # print(pvals)
    return numpy.exp(pvals)
def RegressionModel(self):
    # Poisson regression
    model = smf.glm(formula="num_pickups ~ year + month + lat + long + dayofweek + day + quarter",
                    data=self.dftaxi, family=sm.families.Poisson()).fit()
    print("Poisson Model Summary")
    print(model.summary())
    print("\n")
    # RMSE
    print("RMSE for Poisson Regression Model : ",
          sm.tools.eval_measures.rmse(self.dftaxi.num_pickups, model.fittedvalues, axis=0))
    print("-------------------------------------------------")

    # Negative binomial regression
    model = smf.glm(formula="num_pickups ~ year + month + lat + long + dayofweek + day + quarter",
                    data=self.dftaxi, family=sm.families.NegativeBinomial()).fit()
    print("Negative Binomial Model Summary")
    print(model.summary())
    print("\n")
    # RMSE
    print("RMSE for Negative Binomial Regression Model : ",
          sm.tools.eval_measures.rmse(self.dftaxi.num_pickups, model.fittedvalues, axis=0))
    print("-------------------------------------------------")
def EstimacionMVPromGolesLV(df_cal, ids_torneo):
    df_reg = ReshapeDataFramePromGolesLV(df_cal)
    formula, constraints = FormulaPromGolesLV(df_reg.columns.tolist())
    # note: GLM takes no 'groups' argument (that belongs to GEE), so it is
    # not passed here
    model = glm(formula, data=df_reg,
                family=Poisson()).fit_constrained(constraints)
    dictparams = OutputPoissReg(model, ['pgfl', 'pgfv', 'pgal', 'pgav'], ids_torneo)
    return dictparams
def logistic_regression():
    '''Logistic regression example
    chapter 7.3, p 130
    [tbd]: the cloglog values are inconsistent with those mentioned in the
    book. This is probably due to the specific definitions of "loglog" and
    "cloglog" in the respective languages.
    '''
    inFile = r'GLM_data/Table 7.2 Beetle mortality.xls'
    df = get_data(inFile)

    # adjust the unusual column names in the Excel file
    colNames = [name.split(',')[1].lstrip() for name in df.columns.values]
    df.columns = colNames

    # fit the model
    df['tested'] = df['n']
    df['killed'] = df['y']
    df['survived'] = df['tested'] - df['killed']
    model = glm('survived + killed ~ x', data=df, family=Binomial()).fit()
    print(model.summary())

    print('-' * 65)
    print('Equivalent solution:')
    model = glm('I(n - y) + y ~ x', data=df, family=Binomial()).fit()
    print(model.summary())

    # The fitted number of survivors can be obtained by
    fits = df['n'] * (1 - model.fittedvalues)
    print('Fits Logit:')
    print(fits)

    # The fits for other link functions are:
    model_probit = glm('I(n - y) + y ~ x', data=df,
                       family=Binomial(links.probit)).fit()
    print(model_probit.summary())
    fits_probit = df['n'] * (1 - model_probit.fittedvalues)
    print('Fits Probit:')
    print(fits_probit)

    model_cll = glm('I(n - y) + y ~ x', data=df,
                    family=Binomial(links.cloglog)).fit()
    print(model_cll.summary())
    fits_cll = df['n'] * (1 - model_cll.fittedvalues)
    print('Fits Extreme Value:')
    print(fits_cll)
def Backward(self, odject_inputBackward, data_inputBackward, metricBackward="aic"):
    import statsmodels.api as sm
    import statsmodels.formula.api as smf

    model_formula_full = odject_inputBackward.formula
    y = model_formula_full.split("~")[0]
    Xs_full = model_formula_full.split("~")[1].split("+")
    Xs_optimum = Xs_full

    def fit_metric(Xs):
        # fit a logistic model on the given predictors and return AIC or BIC
        fitted = smf.glm(y + "~" + "+".join(Xs),
                         data=data_inputBackward,
                         family=sm.families.Binomial()).fit()
        return fitted.aic if metricBackward == "aic" else fitted.bic

    metric_optimum = fit_metric(Xs_optimum)

    # drop each predictor in turn and keep the removal that improves the metric
    for Xs_full_i in Xs_full:
        Xs_temp = Xs_optimum[:]
        Xs_temp.remove(Xs_full_i)
        metric_temp = fit_metric(Xs_temp)
        if metric_temp < metric_optimum:
            metric_optimum = metric_temp
            Xs_optimum = Xs_temp

    return y + "~" + "+".join(Xs_optimum)
def test_umap_one():
    print('started')
    df = pd.read_csv(sys.argv[1], dtype={'location': str, 'Result': str})
    df = df.drop(df[df.BIRTH_DATETIME == '0'].index)
    phecodes = pd.read_csv(sys.argv[2], dtype=str)
    out = sys.argv[3]
    phe_list = [phe for phe in list(phecodes.PHECODE.unique()) if phe in df]
    phedf = df.loc[:, phe_list]
    phedf[phedf > 0] = 1
    df[phe_list] = phedf
    print('loaded')

    # Create embeddings
    pca = PCA(n_components=50, random_state=42)
    pc_emb = pca.fit_transform(phedf)
    ump = umap.UMAP(metric='euclidean', n_components=10, random_state=42)
    ump_emb = ump.fit_transform(pc_emb)
    print('embedded')

    # Create df
    reduced_df = pd.DataFrame(ump_emb, columns=['UMP-' + str(i + 1) for i in range(10)])
    reduced_df['CC_STATUS'] = df['CC_STATUS']

    # Create visualization
    sns.set()
    sns.pairplot(reduced_df, hue="CC_STATUS",
                 vars=['UMP-' + str(i + 1) for i in range(10)],
                 height=4, markers=['o', 's'], plot_kws=dict(alpha=0.1))
    plt.savefig(out)
    print('graphed')

    # Test components
    reduced_df['newcc'] = 0
    reduced_df.loc[reduced_df['UMP-2'] < -12, 'newcc'] = 1
    df['newcc'] = reduced_df['newcc']

    print('opening file')
    out_file = open('files/umap_new_cases_chi_phecode_test_2.csv', 'w')
    out_file.write('phecode,chi2,p,dof,control_neg,case_neg,control_pos,case_pos\n')

    # Run univariate tests using this newcc col
    for phecode in phe_list:
        # Get count of people positive for this phecode in case
        case_pos = df.loc[(df.newcc == 1) & (df[phecode] == 1)].shape[0]
        # Get negative count in case
        case_neg = df.loc[(df.newcc == 1) & (df[phecode] == 0)].shape[0]
        # Get positive control
        control_pos = df.loc[(df.newcc == 0) & (df[phecode] == 1)].shape[0]
        # Get negative control
        control_neg = df.loc[(df.newcc == 0) & (df[phecode] == 0)].shape[0]
        # Run contingency test
        if case_pos > 0 and case_neg > 0 and control_pos > 0 and control_neg > 0:
            res = chi2_c([[control_neg, case_neg], [control_pos, case_pos]])
            # Write results
            out_file.write(','.join([phecode, str(res[0]), str(res[1]), str(res[2]),
                                     str(control_neg), str(case_neg),
                                     str(control_pos), str(case_pos)]))
            out_file.write('\n')
    out_file.close()
    print('ran phecode tests')

    # Get age
    df['AGE'] = pd.to_datetime(df['BIRTH_DATETIME'].str[:10], format='%Y-%m-%d')
    df['AGE'] = (datetime.datetime.now() - df['AGE']).astype('timedelta64[Y]')

    # Run same test procedure for covariates, but do regression (?)
    print('running regression')
    mod = smf.glm(formula='newcc ~ AGE + UNIQUE_PHECODES + RACE + GENDER + RECORD_LENGTH_DAYS',
                  data=df, family=fam.Binomial())
    res = mod.fit()
    print(res.summary())
def pred(working, rating):
    data = working[working['prosper_rating'] == rating]
    # https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
    # 60%, 20%, 20% for training, test and validation
    train, validation, test = np.split(data.sample(frac=1),
                                       [int(.6 * len(data)), int(.8 * len(data))])
    print("total:{} train:{} test:{} validation:{}".format(
        len(data), len(train), len(validation), len(test)))
    mod = smf.glm('status ~ borrower_rate', data=train,
                  family=sm.families.Binomial()).fit()
    print(test_model(mod, test))
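# test_model is called above but not defined here. A plausible sketch,
# assuming it reports hold-out accuracy of the fitted logistic model and
# that 'status' is coded 0/1:
def test_model(model, test, threshold=0.5):
    probs = model.predict(test)              # P(status == 1)
    preds = (probs > threshold).astype(int)
    return (preds == test['status']).mean()  # fraction classified correctly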
def test_sse(self):
    """Check sum of squared error vs statsmodels
    (doesn't apply to logistic regression)"""
    g = glm('ycts~x1 + x2 + x3 + x4 + x5', self.df, 'normal')
    mod = smf.glm(formula='ycts~x1 + x2 + x3 + x4 + x5', data=self.df)
    modfitted = mod.fit()
    pred = modfitted.predict()
    diff = pred - np.asarray(self.df.ycts)
    chk1 = abs(np.multiply(diff, diff).sum() - g.sse)
    self.assertAlmostEqual(chk1, 0, 5, 'glm: sse calculation')
def covariate_analysis():
    cc_df = pd.read_csv(sys.argv[1])
    cc_df = cc_df.drop(cc_df[cc_df.BIRTH_DATETIME == '0'].index)

    # Compare sex, age, ethnicity, record_length, and most recent event
    # Get age
    cc_df['age'] = datetime.datetime.now() - cc_df["BIRTH_DATETIME"].str[:10].apply(dconvert)
    cc_df['age'] = cc_df['age'].apply(ddays)

    # Between case and control status
    all_res = smf.glm(
        formula="CC_STATUS ~ weight_sum + RACE + GENDER + age + RECORD_LEN + GENDER*age + age*RECORD_LEN",
        data=cc_df, family=fam.Binomial()).fit()
    print("Results for Case/control data:")
    print(all_res.summary())

    # .copy() avoids pandas' SettingWithCopyWarning on the assignment below
    norm_df = cc_df.loc[cc_df.CC_STATUS == 1].copy()
    print(cc_df.shape)
    print(norm_df.shape)
    norm_df['normality_status'] = norm_df["Result"].apply(binarize_normal)
    normality_res = smf.glm(
        formula="normality_status ~ weight_sum + RACE + GENDER + age + RECORD_LEN + GENDER*age + age*RECORD_LEN",
        data=norm_df, family=fam.Binomial()).fit()
    print("Results for normal/abnormal data:")
    print(normality_res.summary())
def __init__(self, ind_data, index, use_rank=True, formula=None,
             method='pearson', ind_rank_method=rankdata, s_transform=None,
             t_transform=None, e_transform=None, s_bounds=None, t_bounds=None,
             verbose=False, return_term_array=0, zdist=False,
             ndmetric='correlation'):
    """For right now, it's up to you to make sure that the index matches your
    ind_data. For a glm that means you need a separate index for each subject;
    for an LMER it'll be one big block of independent data and one big index.
    ind_rank_method lets you define a function for ranking the independent
    data; this lets us deal with ranking and sorting and interaction terms and
    all of that."""
    Measure.__init__(self)
    self._ind_data = ind_data[index]
    self._index = index
    self._use_rank = use_rank
    self._formula = formula
    self._method = method
    self._ndmetric = ndmetric
    self._subj = np.unique(self._ind_data['subject'])
    self._is1 = True
    self._s_transform = s_transform
    self._t_transform = t_transform
    self._e_transform = e_transform
    self.verbose = verbose
    self._return_term_array = return_term_array
    self._zdist = zdist
    if s_bounds is not None:
        self._s_bounds = s_bounds
    if t_bounds is not None:
        self._t_bounds = t_bounds

    # apply each transform to its own field (the original assigned all
    # three results to 'space', which was a copy-paste bug)
    if s_transform is not None:
        self._ind_data['space'] = s_transform(self._ind_data['space'])
    if t_transform is not None:
        self._ind_data['time'] = t_transform(self._ind_data['time'])
    if e_transform is not None:
        self._ind_data['event'] = e_transform(self._ind_data['event'])

    if self._use_rank == True:
        # rank the ind data
        self._ind_data = ind_rank_method(self._ind_data)
    else:
        idat_df = pd.DataFrame(self._ind_data)
        idat_df['val'] = np.zeros(idat_df.shape[0])
        self._ind_data = idat_df.to_records()

    # figure out how long the results array will be by calling the glm
    # on some dummy data
    if self._method == 'glm':
        self._ind_data['val'] = np.random.randn(self._ind_data.shape[0])
        self._res_len = (smf.glm(formula=self._formula,
                                 data=self._ind_data).fit().params.shape[0] * 2) + 4
        # set val back to zeros just in case
        self._ind_data['val'] = np.zeros(self._ind_data.shape[0])
def scale(self, vars_to_regress=None, model_type="none", do_trim=False,
          do_scale=True, do_center=True, scale_max=10):
    """Regress out reads per cell and identity"""
    scaled = np.zeros((self._ncell, self._ngene))
    reads_per_cell = self._meta["reads_per_cell"]
    genes_per_cell = self._meta["genes_per_cell"]
    ident = self._meta["orig_ident"]
    group = self._meta["group"]
    # compare strings with == ('is' checks identity, not equality)
    if model_type == "none":
        scaled = self._data.values.copy()
    else:
        for i in range(self._ngene):
            expr = self._data.iloc[:, i]
            d = pd.DataFrame(np.array((expr.astype(np.float), reads_per_cell,
                                       genes_per_cell, ident, group)).T,
                             columns=["expr", "reads_per_cell",
                                      "genes_per_cell", "orig_ident", "group"])
            if model_type == "linear":
                results = smf.ols('expr ~ orig_ident + reads_per_cell + group',
                                  data=d).fit()
                scaled[:, i] = results.resid
            elif model_type == "poisson":
                results = smf.glm('expr ~ reads_per_cell + orig_ident + group',
                                  data=d, family=sm.families.Poisson()).fit()
                # results = smf.glm('expr ~ orig_ident', data=d,
                #                   family=sm.families.NegativeBinomial()).fit()
                scaled[:, i] = results.resid_pearson
    self._scaled = pd.DataFrame(scaled, columns=self._data.columns,
                                index=self._data.index)
    if do_trim:
        x = self._scaled.mean()
        y = self._scaled.var() / x
        plt.plot(x, y, '.')
        good_genes = np.array(np.logical_and(y.values > 1, x.values > 0.1))
        self._scaled = self._scaled.iloc[:, good_genes]
    if do_center or do_scale:
        for i in range(self._scaled.shape[1]):
            temp = self._scaled.iloc[:, i].values
            temp = scale(temp, with_mean=do_center, with_std=do_scale)
            temp[temp > scale_max] = scale_max
            self._scaled.iloc[:, i] = temp
def report_glm(formula, data, verbose=True, **kwargs):
    """Fit GLM, print a report, and return the fit object."""
    results = smf.glm(formula, data=data, **kwargs).fit(disp=False, **kwargs)
    summary = results.summary()
    if verbose:
        report = """\n{summary}\n""".format(summary=summary)
        print(report)
    return results
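# Hypothetical usage with toy data; note that report_glm forwards **kwargs
# (e.g. family) both to smf.glm and to fit(), which appears to tolerate
# extra keyword arguments in recent statsmodels versions:
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(1)
toy = pd.DataFrame({'x': rng.normal(size=100)})
toy['y'] = rng.poisson(np.exp(0.3 + 0.5 * toy['x']))

res = report_glm('y ~ x', toy, family=sm.families.Poisson())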
def fit(self, df):
    goal_model_data = pd.concat([
        df[['home_team_name', 'away_team_name', 'home_team_goal_count']]
            .assign(home=1).rename(columns=self.home_dict),
        df[['home_team_name', 'away_team_name', 'away_team_goal_count']]
            .assign(home=0).rename(columns=self.away_dict)
    ])
    self.model = smf.glm(formula="goals ~ home + team + opponent",
                         data=goal_model_data,
                         family=sm.families.Poisson()).fit()
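# self.home_dict and self.away_dict are not shown in this snippet; given the
# concat pattern above, they are presumably rename maps along these lines:
home_dict = {'home_team_name': 'team',
             'away_team_name': 'opponent',
             'home_team_goal_count': 'goals'}
away_dict = {'away_team_name': 'team',
             'home_team_name': 'opponent',
             'away_team_goal_count': 'goals'}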
def _update_model(self):
    """
    Creates/updates time-independent Poisson regression model based on
    actual goal data.

    :return: Returns fitted time-independent Poisson regression model.
    """
    self.model = smf.glm(formula="goals ~ home + C(team) + C(opponent)",
                         data=self.goal_data,
                         family=sm.families.Poisson()).fit_regularized(
                             L1_wt=0, alpha=0.01)
def TrainModel(self, data, args={}):
    self, options = UpdateOptions(self, args)
    self.NegativeBinomial = smf.glm(
        formula=self.extractFormula(data), data=data,
        family=sm.families.NegativeBinomial(alpha=self.alpha))
    self.Results = self.NegativeBinomial.fit()
    return -1
def test_logit(self):
    from statsmodels.formula.api import glm
    from statsmodels.genmod.families import Binomial

    inData = logit.getData()
    dfFit = logit.prepareForFit(inData)
    model = glm('ok + failed ~ temp', data=dfFit, family=Binomial()).fit()
    logit.showResults(inData, model)

    self.assertAlmostEqual(model.params.Intercept, -15.042902, places=5)
def outcome_model(self, model, print_results=True):
    """Build the model for the outcome. This is also referred to as the
    Q-model. This must be specified before the fit function. If it is not,
    an error will be raised.

    Parameters
    ----------
    model : str
        Variables to include in the model for predicting the outcome. Must be
        contained within the input pandas dataframe when initialized. Model
        form should contain the exposure, i.e. 'art + age + male'
    print_results : bool, optional
        Whether to print the logistic regression results to the terminal.
        Default is True
    """
    if self.exposure not in model:
        warnings.warn("It looks like '" + self.exposure +
                      "' is not included in the outcome model.")

    if self.outcome_type == 'binary':
        linkdist = sm.families.family.Binomial()
    elif self.outcome_type == 'normal':
        linkdist = sm.families.family.Gaussian()
    elif self.outcome_type == 'poisson':
        linkdist = sm.families.family.Poisson()
    else:
        raise ValueError("Only 'binary', 'normal', and 'poisson' distributed "
                         "outcomes are available")

    # Modeling the outcome
    if self.weight is None:
        m = smf.glm(self.outcome + ' ~ ' + model, self.sample, family=linkdist)
        self._outcome_model = m.fit()
    else:
        m = smf.glm(self.outcome + ' ~ ' + model, self.sample, family=linkdist,
                    freq_weights=self.sample[self.weight])
        self._outcome_model = m.fit()

    # Printing results of the model and if any observations were dropped
    if print_results:
        print(self._outcome_model.summary())
def _run_categorical(self, data, formula, formula_restricted) -> Dict:
    result = dict()
    # Regress both models (use_t kept consistent across the two fits)
    est = smf.glm(formula, data=data, family=self.family).fit(use_t=self.use_t)
    est_restricted = smf.glm(formula_restricted, data=data,
                             family=self.family).fit(use_t=self.use_t)
    # Check convergence
    if est.converged & est_restricted.converged:
        result["Converged"] = True
        # Calculate results: likelihood ratio test of full vs restricted model
        lrdf = est_restricted.df_resid - est.df_resid
        lrstat = -2 * (est_restricted.llf - est.llf)
        lr_pvalue = scipy.stats.chi2.sf(lrstat, lrdf)
        result["LRT_pvalue"] = lr_pvalue
        result["pvalue"] = result["LRT_pvalue"]
        result["Diff_AIC"] = est.aic - est_restricted.aic
    return result
def test_logit(self):
    from statsmodels.formula.api import glm
    from statsmodels.genmod.families import Binomial

    inData = C13_2_logit.getData()
    dfFit = C13_2_logit.prepareForFit(inData)
    model = glm('ok + failed ~ temp', data=dfFit, family=Binomial()).fit()
    C13_2_logit.showResults(inData, model)

    self.assertAlmostEqual(model.params.Intercept, -15.042902, places=5)
def Poisson_model(dataset, home, away):
    # note: the home/away arguments are not used in building the model itself
    goal_model_data = pd.concat([
        dataset[['HomeTeam', 'AwayTeam', 'HomeGoals']].assign(home=1).rename(
            columns={'HomeTeam': 'team', 'AwayTeam': 'opponent', 'HomeGoals': 'goals'}),
        dataset[['AwayTeam', 'HomeTeam', 'AwayGoals']].assign(home=0).rename(
            columns={'AwayTeam': 'team', 'HomeTeam': 'opponent', 'AwayGoals': 'goals'})
    ])
    poisson_model = smf.glm(formula="goals ~ home + team + opponent",
                            data=goal_model_data,
                            family=sm.families.Poisson()).fit()
    poisson_model.summary()
    return poisson_model
def outcome_model(self, model, restriction=None, print_results=True):
    """Add a specified regression model for the outcome. Must be specified
    before the fit function.

    Parameters
    ----------
    model:
        Variables to include in the model for predicting the outcome. Must be
        contained within the input pandas dataframe when initialized. Format
        follows patsy standards. For example) 'var1 + var2 + var3 + var4'
    restriction : str, optional
        Used to restrict the population that the regression model is fit to.
        Useful for Intent-to-Treat model fitting. The pandas dataframe must be
        referred to as 'g'. For example) "g['art']==1"
    print_results : bool, optional
        Whether to print the logistic regression model results to the
        terminal. Default is True
    """
    g = self.gf.copy()
    if restriction is not None:
        g = g.loc[eval(restriction)].copy()
    linkdist = sm.families.family.Binomial()

    if self._weights is None:  # Unweighted g-formula
        if self._competing_event:
            self.out_model = sm.MNLogit.from_formula(self.outcome + ' ~ ' + model, g).fit()
        else:
            self.out_model = smf.glm(self.outcome + ' ~ ' + model, g,
                                     family=linkdist).fit()
    else:  # Weighted g-formula
        if self._competing_event:
            raise ValueError("The weighted MonteCarloGFormula is not "
                             "supported for competing events")
        self.out_model = smf.glm(self.outcome + ' ~ ' + model, g,
                                 freq_weights=g[self._weights],
                                 family=linkdist).fit()

    if print_results:
        print(self.out_model.summary())
    self._outcome_model_fit = True
def outcome_model(self, model, print_results=True):
    """Build the outcome regression model. This is also referred to as the
    Q-model in various parts of the literature. This must be specified before
    the fit function. It is encouraged to make this model as flexible as
    possible.

    Parameters
    ----------
    model : str
        Variables to include in the model for predicting the outcome. Must be
        contained within the input pandas dataframe when initialized. Model
        form should contain the exposure, i.e. 'art + age + male'
    print_results : bool, optional
        Whether to print the logistic regression results to the terminal.
        Default is True
    """
    if type(self.exposure) is not list:
        if self.exposure not in model:
            warnings.warn("It looks like '" + self.exposure +
                          "' is not included in the outcome model.")

    if self.outcome_type == 'binary':
        linkdist = sm.families.family.Binomial()
    elif self.outcome_type == 'normal':
        linkdist = sm.families.family.Gaussian()
    else:
        linkdist = sm.families.family.Poisson()

    # Modeling the outcome
    if self._weights is None:
        m = smf.glm(self.outcome + ' ~ ' + model, self.gf, family=linkdist)
        self._outcome_model = m.fit()
    else:
        m = smf.glm(self.outcome + ' ~ ' + model, self.gf, family=linkdist,
                    freq_weights=self.gf[self._weights])
        self._outcome_model = m.fit()

    # Creating predicted Y variable
    self._predicted_y_ = self._outcome_model.predict(self.gf)

    # Printing results of the model and if any observations were dropped
    if print_results:
        print(self._outcome_model.summary())
def regression():
    '''Poisson regression example
    chapter 4.4, p.69'''

    # get the data from the web
    inFile = r'GLM_data/Table 4.3 Poisson regression.xls'
    df = get_data(inFile)

    # do the fit
    p = glm('y~x', family=Poisson(links.identity), data=df)
    print(p.fit().summary())
def senility_and_WAIS():
    '''Another example of logistic regression.
    chapter 7.8, p 143
    [tbd]: I don't understand how the "Binomial model" (grouped response)
    is supposed to work, in either language'''

    inFile = r'GLM_data/Table 7.8 Senility and WAIS.xls'
    df = get_data(inFile)

    # ungrouped
    model = glm('s ~ x', data=df, family=Binomial()).fit()
    print(model.summary())
def calculate_odds_ratio(genotypes, phen_vector1, phen_vector2, reg_type,
                         covariates, response='', phen_vector3=''):  # diff - done
    """
    Runs the regression for a specific phenotype vector relative to the
    genotype data and covariates.

    :param genotypes: a DataFrame containing the genotype information
    :param phen_vector1: an array containing the phenotype vector
    :param covariates: a string containing all desired covariates
    :type genotypes: pandas DataFrame
    :type phen_vector1: numpy array
    :type covariates: string

    .. note::
        The covariates must be a string that is delimited by '+', not a list.
        If you are using a list of covariates and would like to convert it to
        the pyPhewas format, use the following::

            l = ['genotype', 'age']  # a list of your covariates
            covariates = '+'.join(l)  # pyPhewas format

        The covariates that are listed here *must* be headers to your
        genotype CSV file.
    """
    data = genotypes
    data['y'] = phen_vector1
    data['MaxAgeAtICD'] = phen_vector2
    # f = 'y~' + covariates
    if response:
        f = response + '~ y + ' + covariates
        if phen_vector3.any():
            data['phe'] = phen_vector3
            f = response + '~ y + phe +' + covariates
    else:
        f = 'y ~' + covariates
        if phen_vector3.any():
            data['phe'] = phen_vector3
            f = 'y ~ phe +' + covariates
    try:
        if reg_type == 0:
            logreg = smf.logit(f, data).fit(method='bfgs', disp=False)
            p = logreg.pvalues.genotype
            odds = logreg.deviance
            conf = logreg.conf_int()
            od = [-math.log10(p), logreg.params.genotype,
                  '[%s,%s]' % (conf[0]['genotype'], conf[1]['genotype'])]
        else:
            linreg = smf.glm(f, data).fit(method='bfgs', disp=False)
            p = linreg.pvalues.genotype
            odds = 0
            conf = linreg.conf_int()
            od = [-math.log10(p), linreg.params.genotype,
                  '[%s,%s]' % (conf[0]['genotype'], conf[1]['genotype'])]
    except Exception:
        odds = 0
        p = np.nan
        od = [np.nan, np.nan, np.nan]
    return (odds, p, od)
def log_linear_models():
    '''Log-linear models
    chapter 9.7, p 180 & 182'''

    # Malignant melanoma, p 180 --------------------------------
    inFile = r'GLM_data/Table 9.4 Malignant melanoma.xls'
    df = get_data(inFile)

    # Minimal model
    model_min = glm('frequency~1', family=Poisson(), data=df).fit()
    print('Malignant melanoma')
    print(model_min.fittedvalues[0])

    # Additive model
    model_add = glm('frequency~site+type', family=Poisson(), data=df).fit()
    print(model_add.fittedvalues[0])

    # Saturated model
    # model_sat = glm('frequency~site*type', family=Poisson(), data=df).fit()
    #
    # The saturated model gives a perfect fit, and the fitted data are equal
    # to the original data. Statsmodels raises a "PerfectSeparationError"

    # Ulcer and aspirin, p. 182 -------------------------------------
    inFile = r'GLM_data/Table 9.7 Ulcer and aspirin use.xls'
    df = get_data(inFile)
    df.columns = ['GD', 'CC', 'AP', 'freq']

    model1 = glm('freq~GD+CC+GD*CC', family=Poisson(), data=df).fit()
    model2 = glm('freq~GD+CC+GD*CC + AP', family=Poisson(), data=df).fit()
    model3 = glm('freq~GD+CC+GD*CC + AP + AP*CC', family=Poisson(), data=df).fit()
    model4 = glm('freq~GD+CC+GD*CC + AP + AP*CC + AP*GD', family=Poisson(), data=df).fit()

    print('Ulcer and aspirin')
    print(model4.fittedvalues)
def divide_train_set_and_fit(self, full_entities=True):
    train_data = self.df.loc[self.df['Year'] < 2005, :]
    test_data = self.df.loc[self.df.Year >= 2005, :]
    formula = "Direction~Lag1+Lag2"
    if full_entities is True:
        formula += "+Lag3+Lag4+Lag5+Volume"
    model = smf.glm(formula, data=train_data, family=sm.families.Binomial())
    result = model.fit()
    print(result.summary())
    predict_result = result.predict(exog=test_data)
    real_val = test_data['Direction'].map(lambda x: 1 if x == 'Down' else 0)
    self.output_binary_table(result, predict_result, real_val)
    return result
def general_logistic_regression():
    '''Example of general logistic regression, Example 7.4.1, p. 135'''

    # Get the data
    inFile = r'GLM_data/Table 7.5 Embryogenic anthers.xls'
    df = get_data(inFile)

    # Define the variables so that they match Dobson
    df['n_y'] = df['n'] - df['y']
    df['newstor'] = df['storage'] - 1
    df['x'] = np.log(df['centrifuge'])

    # Model 1
    model1 = glm('n_y + y ~ newstor*x', data=df, family=Binomial()).fit()
    print(model1.summary())

    # Model 2
    model2 = glm('n_y + y ~ newstor+x', data=df, family=Binomial()).fit()
    print(model2.summary())

    # Model 3
    model3 = glm('n_y + y ~ x', data=df, family=Binomial()).fit()
    print(model3.summary())
def logistic_fit(self, glm_fit=True):
    '''
    The logit function reports an error when y (Direction) is not
    transformed to 0/1, so glm is easier to use here.
    '''
    formula = "Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume"
    if glm_fit is True:
        model = smf.glm(formula, data=self.df, family=sm.families.Binomial())
    else:
        # In fact, this function has wrong fittedvalues, but its predicted
        # values are still right.
        model = smf.logit(formula, data=self.df)
    result = model.fit()
    print(result.summary())
    # In the logit fit there are errors here. Not sure why...
    if glm_fit:
        self.output_binary_table(result, result.fittedvalues,
                                 model.endog.astype(int), glm_fit)
def predict_class_glm(input_file, Output):
    lvltrace.lvltrace("LVLEntree dans predict_class_glm dans feature_selection")
    csv = input_file
    df = pd.read_csv(csv)
    # print(df)
    # build the 43-feature column list programmatically instead of spelling
    # out feature_1 ... feature_43 by hand
    features = ['feature_%d' % i for i in range(1, 44)]
    df = df[['Class'] + features].dropna()
    df.head()
    logit = glm(formula='Class ~ ' + '+'.join(features), data=df).fit()
    print(logit.summary())

    save = Output + "glm.txt"
    old_stdout = sys.stdout
    log_file = open(save, "w")
    sys.stdout = log_file
    print(logit.summary())
    sys.stdout = old_stdout
    log_file.close()
    lvltrace.lvltrace("LVLSortie dans predict_class_glm dans feature_selection")
def lognorm_glm(self):
    """
    Fit the lognormal distribution to the observed vector of integer values
    using a generalized linear model.

    Note: This is a fitted curve; not an actual form of the lognormal
    distribution.

    This method was inspired by the vegan package's open source code on
    vegan's public GitHub repository:
    https://github.com/vegandevs/vegan/R/rad.lognormal.R
    on Thursday, 5 April 2016
    """
    ranks = np.log(range(1, len(self.obs) + 1))
    ranks = -norm.ppf(self.ppoints(len(ranks)))

    d = pd.DataFrame({'rnks': ranks, 'x': self.obs})
    lm = smf.glm(formula='x ~ rnks', data=d,
                 family=sm.genmod.families.family.Poisson(
                     link=sm.genmod.families.links.log)).fit()
    pred = lm.predict()
    return pred
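# self.ppoints is referenced above but not shown. The vegan-inspired code
# suggests it mirrors R's ppoints(); a sketch under that assumption:
import numpy as np

def ppoints(n):
    # probability plotting positions: (i - a) / (n + 1 - 2a),
    # with a = 3/8 for n <= 10 and a = 1/2 otherwise (R's convention)
    a = 3.0 / 8 if n <= 10 else 0.5
    return (np.arange(1, n + 1) - a) / (n + 1 - 2 * a)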
def setup_class(cls):
    import statsmodels.formula.api as smf

    data = sm.datasets.cpunish.load_pandas()
    endog = data.endog
    data = data.exog
    data['EXECUTIONS'] = endog
    data['INCOME'] /= 1000
    aweights = np.array([1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2, 1])
    model = smf.glm(
        'EXECUTIONS ~ INCOME + SOUTH - 1',
        data=data,
        family=sm.families.Gaussian(link=sm.families.links.log()),
        var_weights=aweights
    )
    cls.res1 = model.fit(rtol=1e-25, atol=0)
    cls.res2 = res_r.results_gaussian_aweights_nonrobust
def from_glm(self):
    """
    Fit the Zipf distribution to the observed vector of integer values
    using a generalized linear model.

    Note: This is a fitted curve; not an actual form of the Zipf distribution.

    This method was inspired by the vegan package's open source code on
    vegan's public GitHub repository:
    https://github.com/vegandevs/vegan/blob/master/R/rad.zipf.R
    on Thursday, 19 March 2015
    """
    ranks = np.log(range(1, len(self.obs) + 1))
    off = [np.log(sum(self.obs))] * len(self.obs)

    d = pd.DataFrame({'ranks': ranks, 'off': off, 'x': self.obs})
    lm = smf.glm(formula='x ~ ranks', data=d,
                 family=sm.families.Poisson()).fit()
    pred = lm.predict()
    return pred
def regression(self):
    from statsmodels.formula.api import glm
    from statsmodels.api import families

    # all_bios is the dataframe with the consolidated data; the fit fails
    # if the class column is named "class"
    self.people.rename(columns={"class": "dbpedia_class"}, inplace=True)
    people = self.people[(self.people.birth_century >= 0) &
                         (self.people.birth_century <= 2000)]
    m = glm(
        "edition_count ~ C(gender,Treatment(reference='male')) "
        "+ C(available_english) "
        "+ C(dbpedia_class,Treatment(reference='http://dbpedia.org/ontology/Person')) "
        "+ C(birth_century)",
        data=people,
        family=families.NegativeBinomial(),
    ).fit()
    # <-- this gives you the table of coefficients with p-values,
    # confidence intervals, and so on
    print(m.summary(), file=self.logfile)
def setup_class(cls):
    import statsmodels.formula.api as smf

    data = sm.datasets.fair.load_pandas()
    endog = data.endog
    data = data.exog
    data['fair'] = endog
    aweights = np.repeat(1, len(data.index))
    aweights[::5] = 5
    aweights[::13] = 3
    model = smf.glm(
        'fair ~ age + yrs_married',
        data=data,
        family=sm.families.Tweedie(
            var_power=1.55,
            link=sm.families.links.log()
        ),
        var_weights=aweights
    )
    cls.res1 = model.fit(rtol=1e-25, atol=0)
    cls.res2 = res_r.results_tweedie_aweights_nonrobust
def poisson_regression():
    '''Poisson regression
    chapter 9.2, p.170 & 171'''

    inFile = r"GLM_data/Table 9.1 British doctors' smoking and coronary death.xls"
    df = get_data(inFile)
    print(df)

    # Generate the required variables (use .loc to avoid chained assignment)
    df['smoke'] = np.zeros(len(df))
    df.loc[df['smoking'] == 'smoker', 'smoke'] = 1
    df['agecat'] = np.array([1, 2, 3, 4, 5, 1, 2, 3, 4, 5])
    df['agesq'] = df['agecat']**2
    df['smkage'] = df['agecat']
    df.loc[df['smoking'] == 'non-smoker', 'smkage'] = 0

    model = glm('deaths~agecat+agesq+smoke+smkage',
                family=Poisson(), data=df,
                exposure=df["person-years"]).fit()
    print(model.summary())
def rr_cluster(cluster, covs, formula):
    """Set cluster values to reduced residuals."""
    cluster = deepcopy(cluster)
    from statsmodels.formula.api import ols, glm
    if isinstance(cluster[0], CountFeature):
        for f in cluster:
            covs['methylation'] = f.methylated
            f.methylated[:] = np.round(glm(formula, covs,
                                           exposure=f.counts,
                                           family=Poisson()).fit().resid
                                       ).astype(int)
            f.values[:] = f.methylated.astype(float) / f.counts
    else:
        for f in cluster:
            covs['methylation'] = f.values
            fit = ols(formula, covs).fit()
            f.values[:] = fit.resid
            f.ovalues = fit.fittedvalues
    return cluster
def anova():
    '''ANOVA
    chapter 6.4, p. 108, and p. 113
    GLM does not work with anova_lm.'''

    # get the data from the web
    inFile = r'GLM_data/Table 6.6 Plant experiment.xls'
    df = get_data(inFile)

    # fit the model (p 109); don't shadow the imported glm function
    glm_model = glm('weight~group', family=Gaussian(), data=df)
    print(glm_model.fit().summary())

    print('-' * 65)
    print('OLS')
    model = ols('weight~group', data=df)
    print(model.fit().summary())
    print(anova_lm(model.fit()))

    # The model corresponding to the null hypothesis of no treatment effect is
    model0 = ols('weight~1', data=df)

    # Get the data for the two-factor ANOVA (p 113)
    inFile = r'GLM_data/Table 6.9 Two-factor data.xls'
    df = get_data(inFile)

    # adjust the header names from the Excel file
    df.columns = ['A', 'B', 'data']

    # two-factor anova, with interactions
    ols_int = ols('data~A*B', data=df)
    anova_lm(ols_int.fit())

    # The python commands for the other four models are
    ols_add = ols('data~A+B', data=df)
    ols_A = ols('data~A', data=df)
    ols_B = ols('data~B', data=df)
    ols_mean = ols('data~1', data=df)
def setup_class(cls):
    self = cls
    import statsmodels.formula.api as smf

    data = sm.datasets.cpunish.load_pandas()
    endog = data.endog
    data = data.exog
    data['EXECUTIONS'] = endog
    data['INCOME'] /= 1000
    aweights = np.array([1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2, 1])
    model = smf.glm(
        'EXECUTIONS ~ INCOME + SOUTH - 1',
        data=data,
        family=sm.families.Gaussian(link=sm.families.links.identity()),
        var_weights=aweights
    )
    wlsmodel = smf.wls('EXECUTIONS ~ INCOME + SOUTH - 1',
                       data=data, weights=aweights)
    self.res1 = model.fit(rtol=1e-25, atol=1e-25)
    self.res2 = wlsmodel.fit()