def fit_poisson(station_id, include_rebalance=False,
                initial_time=datetime(2001, 1, 1),
                final_time=datetime(2020, 1, 1),
                time_interval='1H'):
    # Use the correct delta data
    station_updates = get_station_data(station_id)
    arrivals_departures = rebalance_station_poisson_data(
        station_updates, station_id, time_interval,
        include_rebalance=include_rebalance)

    # Create design matrix for months, hours, and weekday vs. weekend.
    # We can't just create a "month" column to toss into our model, because it
    # doesn't understand what "June" is. Instead, we need to create a column
    # for each month and code each row according to what month it's in.
    # Ditto for hours and weekday (=1).
    y_arr, X_arr = patsy.dmatrices(
        "arrivals ~ C(months, Treatment) + C(hours, Treatment) + C(weekday_dummy, Treatment)",
        arrivals_departures, return_type='dataframe')
    y_dep, X_dep = patsy.dmatrices(
        "departures ~ C(months, Treatment) + C(hours, Treatment) + C(weekday_dummy, Treatment)",
        arrivals_departures, return_type='dataframe')

    y_dep[pd.isnull(y_dep)] = 0

    # Fit Poisson distributions for arrivals and departures
    arr_poisson_model = sm.Poisson(y_arr, X_arr)
    arr_poisson_results = arr_poisson_model.fit(disp=0)

    dep_poisson_model = sm.Poisson(y_dep, X_dep)
    dep_poisson_results = dep_poisson_model.fit(disp=0)

    # Calculate the squared error of the above models
    error = sum((y_arr - arr_poisson_results.fittedvalues)**2) + \
            sum((y_dep - dep_poisson_results.fittedvalues)**2)

    # print(arr_poisson_results.summary(), dep_poisson_results.summary())

    poisson_results = [arr_poisson_results, dep_poisson_results, error]

    return poisson_results
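# A minimal standalone illustration (hypothetical toy data, not part of the
# function above) of the Treatment coding the comment describes:
# C(col, Treatment) expands a categorical column into one indicator column per
# level, dropping the first level as the reference.
import pandas as pd
import patsy

toy = pd.DataFrame({'arrivals': [3, 1, 4], 'months': [1, 6, 6], 'hours': [0, 12, 23]})
y_toy, X_toy = patsy.dmatrices("arrivals ~ C(months, Treatment) + C(hours, Treatment)",
                               toy, return_type='dataframe')
print(X_toy.columns.tolist())
# ['Intercept', 'C(months, Treatment)[T.6]',
#  'C(hours, Treatment)[T.12]', 'C(hours, Treatment)[T.23]']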
def pandashandler(formula_like, data):
    """
    Process a pysal model signature and convert an equation/formula pair
    into a pysal-specific object.
    """
    if '||' in formula_like:
        mu, inst = formula_like.split('||')
        y, X = p.dmatrices(mu + '-1', data=data)
        yend, q = p.dmatrices(inst + '-1', data=data)
        rargs = [y, X, yend, q]
        rargs = [asarray(i) for i in rargs]
        name_y, name_x = mu.strip(' ').split('~')
        name_x = name_x.split('+')
        name_yend, name_q = inst.strip(' ').split('~')
        name_yend = [name_yend]
        name_q = name_q.split('+')
        names = {"name_y": name_y, "name_x": name_x,
                 "name_yend": name_yend, "name_q": name_q}
    else:
        y, X = p.dmatrices(formula_like + '-1', data=data)
        rargs = [asarray(y), asarray(X)]
        name_y, name_x = formula_like.strip(' ').split('~')
        name_x = name_x.split('+')
        names = {"name_y": name_y, "name_x": name_x}

    return rargs, names
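# Hypothetical usage sketch for pandashandler (assumes its module-level
# imports, i.e. `import patsy as p` and `from numpy import asarray`). The part
# of the formula before '||' is the main equation; the part after it maps the
# endogenous variable to its instruments.
import numpy as np
import pandas as pd

demo = pd.DataFrame(np.random.rand(20, 5),
                    columns=['y', 'x1', 'x2', 'yend', 'q1'])
rargs, names = pandashandler('y ~ x1 + x2 || yend ~ q1', demo)
# rargs -> [y, X, yend, q] as plain ndarrays
# names -> {'name_y': ..., 'name_x': [...], 'name_yend': [...], 'name_q': [...]}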
def stepwiseInit(upperScope, dataFrame, lowerScope=None, startScope=None,
                 trace=False, traceFile=stdout, groupVars=False,
                 penaltyFn=stepwise_penalties.AICc()):
    # The first set of operations sets up the lower and upper scopes and
    # infers defaults if they are not given
    env = patsy.EvalEnvironment.capture()
    upperScopeDesc = patsy.ModelDesc.from_formula(upperScope, env)
    startScopeDesc = None if startScope is None else patsy.ModelDesc.from_formula(startScope, env)
    lowerScopeDesc = None if lowerScope is None else patsy.ModelDesc.from_formula(lowerScope, env)

    if not lowerScope and patsy.Term([]) not in upperScopeDesc.rhs_termlist:
        raise StepwiseError("A lower scope of the model search must be specified when " +
                            "the upperScope does not contain an intercept")
    if not lowerScope:
        # build a formula with only an intercept
        lowerScopeDesc = patsy.ModelDesc(upperScopeDesc.lhs_termlist, [patsy.Term([])])
        lowerScope = lowerScopeDesc.describe()
    if not startScope:
        startScopeDesc = lowerScopeDesc
        startScope = lowerScopeDesc.describe()

    # TODO: check that lower scope is consistent with upper scope
    # TODO: check that starting scope is consistent with lower and upper scopes
    rhs_set = set(upperScopeDesc.rhs_termlist)
    for item in lowerScopeDesc.rhs_termlist:
        if item not in rhs_set:
            raise StepwiseError("term " + str(item) + " from formula:\n" +
                                lowerScope + "\nnot found in:\n" +
                                upperScope)
    for item in startScopeDesc.rhs_termlist:
        if item not in rhs_set:
            raise StepwiseError("term " + str(item) + " from formula:\n" +
                                startScope + "\nnot found in:\n" +
                                upperScope)

    y, X = patsy.dmatrices(upperScope, data=dataFrame)
    y, Xprime = patsy.dmatrices(startScope, data=dataFrame)
    y, Xlower = patsy.dmatrices(lowerScope, data=dataFrame)

    active = np.zeros(X.shape[1], dtype=bool)
    lower_active = active.copy()
    lowerMsk = active.copy()
    assert y.shape[1] == 1, "Multiple responses not yet supported."
    y = y.flatten()

    featMap = dict([(name, index) for index, name in enumerate(X.design_info.column_names)])
    for feat in Xprime.design_info.column_names:
        active[featMap[feat]] = True
    for feat in Xlower.design_info.column_names:
        lower_active[featMap[feat]] = True

    # next step: fit model using only the active set of features
    beta, betaSigmaSq, SSE, df, Q = qr_based_solver.solve(X[:, active], y.flatten())
    residWithMean = y - np.mean(y)
    SSTO = np.dot(residWithMean, residWithMean)
    summary = computeSummary(beta, betaSigmaSq, SSE, SSTO, X.shape[0], df,
                             Xprime.design_info.column_names, startScope)
    return StepwiseFitter(LinearModelFit(summary.beta, summary), X, y,
                          X.design_info.column_names,
                          upperScopeDesc.lhs_termlist, active, lower_active,
                          penaltyFn, trace=trace, traceFile=traceFile,
                          groupVars=groupVars)
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Returns endog, exog, and the model specification from arrays and formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by importing formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, formula object.

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y, X.
    exog : array-like
        Should preserve the input type of Y, X. Could be None.
    """
    # half-hearted attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    # the pandas and non-pandas branches are identical here; dmatrices
    # handles both input types
    if X is not None:
        result = dmatrices(formula, (Y, X), depth,
                           return_type='dataframe', NA_action=na_action)
    else:
        result = dmatrices(formula, Y, depth,
                           return_type='dataframe', NA_action=na_action)

    # if missing == 'raise' there's no missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    if len(result) > 1:  # have RHS design
        design_info = result[1].design_info  # detach it from DataFrame
    else:
        design_info = None
    # NOTE: is there ever a case where we'd need LHS design_info?
    return result, missing_mask, design_info
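# A hedged sketch of the handler registration the docstring above describes:
# the key is the formula object's class, the value is a callable returning
# endog, exog and the formula object. All names below are hypothetical; it
# assumes `formula_handler` (the dict referenced in the function) is in scope.
class MyFormulaObject(object):
    pass

def my_formula_handler(Y, X, formula):
    # ...build and return endog, exog, formula object...
    raise NotImplementedError

formula_handler[MyFormulaObject] = my_formula_handler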
def gen_predictors(self):
    """Generates the predictors data frame."""
    model = read_csv(self.model_file)
    _, predictors = dmatrices(self.formula, model)
    self.di = predictors.design_info
    # design_info.column_names holds the expanded design-matrix column labels
    self.predictors = DataFrame(predictors, columns=self.di.column_names)
    return self.predictors
def __init__(self, model, *args, lazy=False):
    """
    Initialize a linear model

    Parameters
    ==========
    model: str
        model string
    args: argument list

    Returns
    =======
    No return

    Raises
    ======
    LogicalError
        If model is wrong
    """
    self.modelstr = model
    self.model = dmatrices(model, *args)
    if len(self.model) != 2:
        raise LogicalError("Invalid model specification, should have variables on either side of '~'")
    if self.model[0].shape[1] != 1:
        # TODO: add support for multiple responses later
        raise LogicalError("Multiple-response regression is not supported")
    self._regress()
def vcfassoc(formula, covariate_df, groups=None):
    y, X = patsy.dmatrices(str(formula), covariate_df, return_type='dataframe')
    # get the column containing genotype
    ix = get_genotype_ix(X)

    Binomial = sm.families.Binomial
    logit = sm.families.links.Logit()
    if groups is not None:
        # covariate_df['grps'] = map(str, range(len(covariate_df) / 8)) * 8
        if not isinstance(groups, (pd.DataFrame, np.ndarray)):
            cov = Exchangeable()
            model = sm.GEE(y, X, groups=covariate_df[groups],
                           cov_struct=cov, family=Binomial())
        else:
            model = sm.GLS(logit(y), X, sigma=groups.loc[X.index, X.index])
    else:
        model = sm.GLM(y, X, missing='drop', family=Binomial())

    result = model.fit(maxiter=1000)
    res = {'OR': np.exp(result.params[ix]),
           'pvalue': result.pvalues[ix],
           'z': result.tvalues[ix],
           'OR_CI': tuple(np.exp(result.conf_int().iloc[ix, :])),
           }
    try:
        res['df_resid'] = result.df_resid
    except AttributeError:
        pass
    return res
def xtab(formula, covariate_df):
    y, X = patsy.dmatrices(str(formula), covariate_df)
    X = patsy.dmatrix('genotype', covariate_df)
    ix = get_genotype_ix(X)

    tbl = pd.crosstab(X[:, ix], y.ravel())
    try:
        tbl.columns = ['%s_%i' % (y.design_info.column_names[-1], j) for j in range(2)]
    except Exception:
        return None  # too few samples
    tbl.index = ['%i_alts' % i for i in tbl.index]
    alts = set(tbl.index)

    if len(alts) < 2 or '0_alts' not in alts:
        tbl_dom = None
    else:
        tbl_dom = pd.DataFrame({'0_alts': tbl.loc['0_alts', :],
                                'n_alts': tbl.loc[list(alts - set(['0_alts'])), :].sum()}).T
    # can't test recessive without any homozygous alts.
    if '2_alts' not in alts or len(alts) < 2:
        tbl_rec = None
    else:
        tbl_rec = pd.DataFrame({'lt2_alts': tbl.loc[['0_alts', '1_alts'], :].sum(),
                                '2_alts': tbl.loc['2_alts', :]})

    d = {}
    for name, xtbl in (('additive', tbl), ('dominant', tbl_dom), ('recessive', tbl_rec)):
        if xtbl is None:
            d['p.chi.%s' % name] = 'nan'
            continue
        chi, p, ddof, e = chi2_contingency(xtbl)
        if name == 'additive':
            d = xtbl.to_dict()
        d['p.chi.%s' % name] = "%.3g" % p
    return d
def randforpat():
    df = pd.read_csv("train.csv")
    cleanpatsy(df)
    y, X = dmatrices('Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Cabin + Embarked',
                     df, return_type="dataframe")
    y = np.ravel(y)
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(X, y)
    print(forest.score(X, y))

    # evaluate the model by splitting into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    model2 = RandomForestClassifier(n_estimators=100)
    model2.fit(X_train, y_train)
    predicted = model2.predict(X_test)
    print(metrics.accuracy_score(y_test, predicted))

    dftest = pd.read_csv("test.csv")
    cleanpatsy(dftest)
    X = dmatrix('Pclass + Sex + Age + SibSp + Parch + Fare + Cabin + Embarked',
                dftest, return_type="dataframe")
    output = forest.predict(X).astype(int)
    result = {'PassengerId': dftest.PassengerId, 'Survived': output}
    dfresult = pd.DataFrame(result)
    dfresult.to_csv("result.csv", index=False)
def from_formula(cls, formula, data, priors=None, vars=None, family='normal',
                 name='', model=None):
    import patsy
    y, x = patsy.dmatrices(formula, data)
    labels = x.design_info.column_names
    return cls(np.asarray(x), np.asarray(y)[:, 0], intercept=False,
               labels=labels, priors=priors, vars=vars, family=family,
               name=name, model=model)
def predict(self, h=5, oos_data=None):
    """ Makes forecast with the estimated model

    Parameters
    ----------
    h : int (default : 5)
        How many steps ahead would you like to forecast?

    oos_data : pd.DataFrame
        Data to use for the predictors in the forecast

    Returns
    ----------
    - pd.DataFrame with predicted values
    """

    if self.latent_variables.estimated is False:
        raise Exception("No latent variables estimated!")
    else:
        _, X_oos = dmatrices(self.formula, oos_data)
        X_oos = np.array([X_oos])[0]
        X_pred = X_oos[:h]
        sigma2, Y, scores, _ = self._model(self.latent_variables.get_z_values())
        date_index = self.shift_dates(h)
        t_params = self.transform_z()

        mean_values = self._mean_prediction(sigma2, Y, scores, h, t_params, X_pred)
        forecasted_values = mean_values[-h:]
        result = pd.DataFrame(np.exp(forecasted_values/2.0))
        result.rename(columns={0: self.data_name}, inplace=True)
        result.index = date_index[-h:]

        return result
def main():
    train_df_filled = fill_null_vals(train_df, 'Fare')
    train_df_filled = fill_null_vals(train_df_filled, 'Age')
    assert len(train_df_filled) == len(train_df)

    test_df_filled = fill_null_vals(test_df, 'Fare')
    test_df_filled = fill_null_vals(test_df_filled, 'Age')
    assert len(test_df_filled) == len(test_df)

    for formula_name, formula in formula_map.items():
        print("name=%s formula=%s" % (formula_name, formula))
        y_train, X_train = dmatrices('Survived ~ ' + formula,
                                     train_df_filled, return_type='dataframe')
        print("Running logistic regression with formula : %s" % formula)
        print("X_train cols=%s " % X_train.columns)
        y_train = np.ravel(y_train)
        model = LogisticRegression()
        lr_model = model.fit(X_train, y_train)
        print("Training score:%s" % lr_model.score(X_train, y_train))

        X_test = dmatrix(formula, test_df_filled)
        predicted = lr_model.predict(X_test)
        print("predicted:%s\n" % predicted[:5])
        assert len(predicted) == len(test_df)
        pred_results = pd.Series(predicted, name='Survived')
        lr_results = pd.concat([test_df['PassengerId'], pred_results], axis=1)
        lr_results.Survived = lr_results.Survived.astype(int)
        results_file = 'csv/logisticregr_%s.csv' % formula_name
        # results_file = re.sub('[+ ()C]','',results_file)
        lr_results.to_csv(results_file, index=False)
def __new__(cls, formula, data, priors=None, intercept_prior=None,
            regressor_prior=None, init_vals=None, family='normal',
            model=None, name=''):
    _families = dict(
        normal=families.Normal,
        student=families.StudentT,
        binomial=families.Binomial,
        poisson=families.Poisson
    )
    if isinstance(family, str):
        family = _families[family]()

    y_data = np.asarray(patsy.dmatrices(formula, data)[0]).T

    y_est, coeffs = linear_component(
        formula, data,
        priors=priors,
        intercept_prior=intercept_prior,
        regressor_prior=regressor_prior,
        init_vals=init_vals,
        model=model,
        name=name
    )
    family.create_likelihood(name, y_est, y_data, model=model)

    return super(glm, cls).__new__(cls, y_est, coeffs)
def __setstate__(self, d):
    if "restore_design_info" in d:
        # NOTE: there may be a more performant way to do this
        from patsy import dmatrices, PatsyError
        exc = []

        try:
            data = d['frame']
        except KeyError:
            data = d['orig_endog'].join(d['orig_exog'])

        for depth in [2, 3, 1, 0, 4]:  # sequence is a guess where to likely find it
            try:
                _, design = dmatrices(d['formula'], data, eval_env=depth,
                                      return_type='dataframe')
                break
            except (NameError, PatsyError) as e:
                print('not in depth %d' % depth)
                exc.append(e)  # why do I need a reference from outside except block
                pass
        else:
            raise exc[-1]

        self.design_info = design.design_info
        del d["restore_design_info"]
    self.__dict__.update(d)
def pse_perSs_perCond(data, combos):
    '''
    Returns dict of PSE per conditions per subject.

    data: recarray of data with trials as level of analysis
    combos: dict of combinations of keys from other columns of data you want PSE per
        keys: label you want per PSE (probably of combinations of keys/conditions)
        values: dict of column names (keys) and values (values)
    '''
    ssdat = []
    for s in np.unique(data['subjid']):
        for c, combo in combos.items():
            # slice data
            slicer = data['subjid'] == s
            for col in combo:
                slicer *= data[col] == combo[col]
            dsliced = data[slicer]

            # Prepare the data ('dat' avoids shadowing the builtin 'file')
            dat = pd.DataFrame({'non2targ': list(dsliced['non2targ']),
                                'morph': list(dsliced['morph'] - 6)})
            y, X = dmatrices('non2targ ~ morph', dat)
            y = np.ravel(y)

            # Fit the data to a logistic regression model
            model = LogisticRegression()
            model = model.fit(X, y)
            pse = -1 * (model.coef_[0][0] / model.coef_[0][1])
            if not np.isfinite(pse):
                raise ValueError('NaN or nonfinite return')
            # ssdat[s][c] = pse
            ssdat.append([s, c, float(pse)])
    return ssdat
def __init__(self, data, p, q, formula):
    # Initialize TSM object
    super(EGARCHMReg, self).__init__('EGARCHMReg')

    # Latent variables
    self.p = p
    self.q = q
    self.max_lag = max(self.p, self.q)
    self.z_no = self.p + self.q + 2
    self._z_hide = 0  # Whether to cutoff variance latent variables from results
    self.supported_methods = ["MLE", "PML", "Laplace", "M-H", "BBVI"]
    self.default_method = "MLE"
    self.multivariate_model = False
    self.leverage = False
    self.model_name = "EGARCHMReg(" + str(self.p) + "," + str(self.q) + ")"

    # Format the data
    self.is_pandas = True  # This is compulsory for this model type
    self.data_original = data
    self.formula = formula
    self.y, self.X = dmatrices(formula, data)
    self.z_no += self.X.shape[1]*2
    self.y_name = self.y.design_info.describe()
    self.data_name = self.y_name
    self.X_names = self.X.design_info.describe().split(" + ")
    self.y = np.array([self.y]).ravel()
    self.data = self.y
    self.X = np.array([self.X])[0]
    self.index = data.index

    self.initial_values = np.zeros(self.z_no)
    self._create_latent_variables()
def main():
    train_df_filled = fill_null_vals(train_df, 'Fare')
    train_df_filled = fill_null_vals(train_df_filled, 'Age')
    assert len(train_df_filled) == len(train_df)

    test_df_filled = fill_null_vals(test_df, 'Fare')
    test_df_filled = fill_null_vals(test_df_filled, 'Age')
    assert len(test_df_filled) == len(test_df)

    num_estimators = 10000
    for formula_name, formula in formula_map.items():
        print("name=%s formula=%s" % (formula_name, formula))
        y_train, X_train = dmatrices('Survived ~ ' + formula,
                                     train_df_filled, return_type='dataframe')
        print("Running RandomForestClassifier with formula : %s" % formula)
        print("X_train cols=%s " % X_train.columns)
        y_train = np.ravel(y_train)
        model = RandomForestClassifier(n_estimators=num_estimators, random_state=0)
        print("About to fit...")
        rf_model = model.fit(X_train, y_train)
        print("Training score:%s" % rf_model.score(X_train, y_train))

        X_test = dmatrix(formula, test_df_filled)
        predicted = rf_model.predict(X_test)
        print("predicted:%s" % predicted[:5])
        assert len(predicted) == len(test_df)
        pred_results = pd.Series(predicted, name='Survived')
        rf_results = pd.concat([test_df['PassengerId'], pred_results], axis=1)
        rf_results.Survived = rf_results.Survived.astype(int)
        results_file = 'csv/rf_%s_n_est_%s.csv' % (formula_name, num_estimators)
        print("output file: %s\n" % results_file)
        # results_file = re.sub('[+ ()C]','',results_file)
        rf_results.to_csv(results_file, index=False)
def predict(self, h=5, oos_data=None):
    """ Makes forecast with the estimated model

    Parameters
    ----------
    h : int (default : 5)
        How many steps ahead would you like to forecast?

    oos_data : pd.DataFrame
        Data for the variables to be used out of sample (ys can be NaNs)

    Returns
    ----------
    - pd.DataFrame with predicted values
    """

    if self.parameters.estimated is False:
        raise Exception("No parameters estimated!")
    else:
        # Sort/manipulate the out-of-sample data
        _, X_oos = dmatrices(self.formula, oos_data)
        X_oos = np.array([X_oos])[0]
        X_pred = X_oos[:h]
        mu, Y = self._model(self.parameters.get_parameter_values())
        date_index = self.shift_dates(h)
        t_params = self.transform_parameters()

        mean_values = self._mean_prediction(mu, Y, h, t_params, X_pred)
        forecasted_values = mean_values[-h:]
        result = pd.DataFrame(forecasted_values)
        result.rename(columns={0: self.data_name}, inplace=True)
        result.index = date_index[-h:]

        return result
def __init__(self, formula, data):
    # Initialize TSM object
    super(DynLin, self).__init__('DynLin')

    # Parameters
    self.max_lag = 0
    self._param_hide = 0  # Whether to cutoff variance parameters from results
    self.supported_methods = ["MLE", "PML", "Laplace", "M-H", "BBVI"]
    self.default_method = "MLE"
    self.model_name = "Dynamic Linear Regression"
    self.multivariate_model = False

    # Format the data
    self.is_pandas = True  # This is compulsory for this model type
    self.data_original = data
    self.formula = formula
    self.y, self.X = dmatrices(formula, data)
    self.param_no = self.X.shape[1] + 1
    self.y_name = self.y.design_info.describe()
    self.data_name = self.y_name
    self.X_names = self.X.design_info.describe().split(" + ")
    self.y = np.array([self.y]).ravel()
    self.data = self.y
    self.X = np.array([self.X])[0]
    self.index = data.index

    self._create_parameters()
def main():
    train_df_filled = fill_null_vals(train_df, 'Fare')
    train_df_filled = fill_null_vals(train_df_filled, 'Age')
    assert len(train_df_filled) == len(train_df)

    test_df_filled = fill_null_vals(test_df, 'Fare')
    test_df_filled = fill_null_vals(test_df_filled, 'Age')
    assert len(test_df_filled) == len(test_df)

    for formula_name, formula in formula_map.items():
        print("name=%s formula=%s" % (formula_name, formula))
        y_train, X_train = dmatrices('Survived ~ ' + formula,
                                     train_df_filled, return_type='dataframe')
        print("Running DecisionTreeClassifier with formula : %s" % formula)
        print("X_train cols=%s " % X_train.columns)
        y_train = np.ravel(y_train)
        model = tree.DecisionTreeClassifier(criterion='entropy',
                                            max_depth=3, min_samples_leaf=5)
        print("About to fit...")
        dt_model = model.fit(X_train, y_train)
        print("Training score:%s" % dt_model.score(X_train, y_train))

        X_test = dmatrix(formula, test_df_filled)
        predicted = dt_model.predict(X_test)
        print("predicted:%s" % predicted[:5])
        assert len(predicted) == len(test_df)
        pred_results = pd.Series(predicted, name='Survived')
        dt_results = pd.concat([test_df['PassengerId'], pred_results], axis=1)
        dt_results.Survived = dt_results.Survived.astype(int)
        results_file = 'csv/dt_%s.csv' % (formula_name)
        print("output file: %s\n" % results_file)
        # results_file = re.sub('[+ ()C]','',results_file)
        dt_results.to_csv(results_file, index=False)
def pandashandler(formula_like, data):
    """
    Process a pysal model signature and convert an equation/formula pair
    into a pysal-specific object.
    """
    if '||' in formula_like:
        mu, inst = formula_like.split('||')
        y, X = p.dmatrices(mu + '-1', data=data)
        yend, q = p.dmatrices(inst + '-1', data=data)
        rargs = [y, X, yend, q]
        rargs = [asarray(i) for i in rargs]
    else:
        y, X = p.dmatrices(formula_like + '-1', data=data)
        rargs = [asarray(y), asarray(X)]
    return rargs
def logistic_regression(data):
    y, X = dmatrices(LR_FORMULA, data)
    y = np.ravel(y)
    model = LogisticRegression(penalty='l1', C=0.1, fit_intercept=True)
    model = model.fit(X, y)
    print(model.score(X, y))
    return model
def ready_for_model(df):
    cols = list(df.columns.values)

    # keep columns
    cols_keep = []
    cols_giveup = []
    for c in df:
        if df[c].dtype in [int, float]:
            cols_keep.append(c)
        elif df[c].dtype == object:
            if df[c].nunique() < 25:
                cols_keep.append(c)
            else:
                cols_giveup.append(c)

    # remove the labels
    for to_remove in ['id', 'status', 'status_group']:
        cols_keep.remove(to_remove)

    # convert df to X, y by patsy
    r_formula = 'status ~ ' + ' + '.join(cols_keep)
    df_y, df_X = patsy.dmatrices(r_formula, df, return_type='dataframe')
    cols_X = df_X.columns
    X = df_X.values
    y = df_y.values
    return (X, y, cols_X, r_formula, cols_keep, cols_giveup)
def logisticpatsy():
    df = pd.read_csv("train.csv")
    cleanpatsy(df)
    y, X = dmatrices('Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Cabin + Embarked',
                     df, return_type="dataframe")
    y = np.ravel(y)
    model = LogisticRegression()
    model = model.fit(X, y)

    # check the accuracy on the training set
    print(model.score(X, y))

    # evaluate the model by splitting into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    model2 = LogisticRegression()
    model2.fit(X_train, y_train)
    predicted = model2.predict(X_test)
    print(metrics.accuracy_score(y_test, predicted))

    dftest = pd.read_csv("test.csv")
    cleanpatsy(dftest)
    X = dmatrix('Pclass + Sex + Age + SibSp + Parch + Fare + Cabin + Embarked',
                dftest, return_type="dataframe")
    predict_survive = model.predict(X)
    result = {'PassengerId': dftest.PassengerId, 'Survived': predict_survive}
    dfresult = pd.DataFrame(result)
    dfresult.to_csv("result.csv", index=False)
    print(pd.DataFrame(list(zip(X.columns, np.transpose(model.coef_)))))
def nominal_logistic_regression():
    '''Nominal Logistic Regression
    chapter 8.3, p. 155

    At this point, nominal logistic regression cannot be done with the
    formula approach. Regarding the output, note that R produces
    log(pi2/pi1) and log(pi3/pi1), while statsmodels produces log(pi2/pi1)
    and log(pi3/pi2).
    '''

    # Get the data
    inFile = r'GLM_data/Table 8.1 Car preferences.xls'
    df = get_data(inFile)

    # to make sure that "women" and "no/little" are the reference,
    # adjust them such that they come first alphabetically
    df.loc[df['response'] == 'no/little', 'response'] = '_no/little'
    df.loc[df['sex'] == 'women', 'sex'] = '_women'
    print(df)

    # Generate the design matrices using patsy
    pm = patsy.dmatrices('response~sex+age', data=df)

    # Generate the endog and exog matrices
    endog = np.repeat(np.array(df['response']),
                      df['frequency'].values.astype(int), axis=0)
    exog = np.array(np.repeat(pm[1], df['frequency'].values.astype(int), axis=0))
    exog = pd.DataFrame(exog, columns=pm[1].design_info.column_names)

    # Fit the model, and print the summary
    model = sm.MNLogit(endog, exog).fit(method='nm')
    print(model.summary())
def predict(self, h=5, oos_data=None):
    """ Makes forecast with the estimated model

    Parameters
    ----------
    h : int (default : 5)
        How many steps ahead would you like to forecast?

    oos_data : pd.DataFrame
        Data for the variables to be used out of sample (ys can be NaNs)

    Returns
    ----------
    - pd.DataFrame with predicted values
    """

    if self.parameters.estimated is False:
        raise Exception("No parameters estimated!")
    else:
        # Sort/manipulate the out-of-sample data
        _, X_oos = dmatrices(self.formula, oos_data)
        X_oos = np.array([X_oos])[0]
        X_pred = X_oos[:h]
        date_index = self.shift_dates(h)

        _, _, _, coefficients = self._model(self.parameters.get_parameter_values())
        coefficients_star = coefficients.T[-1]
        theta_pred = np.dot(np.array([coefficients_star]), X_pred.T)[0]

        result = pd.DataFrame(self.link(theta_pred))
        result.rename(columns={0: self.y_name}, inplace=True)
        result.index = date_index[-h:]

        return result
def test_from_formula_vs_no_formula():
    mod = _MultivariateOLS.from_formula(
        "Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted",
        data)
    r = mod.fit(method="svd")
    r0 = r.mv_test()
    endog, exog = patsy.dmatrices(
        "Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted",
        data, return_type="dataframe")

    L = np.array([[1, 0, 0, 0, 0, 0]])
    # DataFrame input
    r = _MultivariateOLS(endog, exog).fit(method="svd")
    r1 = r.mv_test(hypotheses=[["Intercept", L, None]])
    assert_array_almost_equal(r1["Intercept"]["stat"].values,
                              r0["Intercept"]["stat"].values, decimal=6)
    # Numpy array input
    r = _MultivariateOLS(endog.values, exog.values).fit(method="svd")
    r1 = r.mv_test(hypotheses=[["Intercept", L, None]])
    assert_array_almost_equal(r1["Intercept"]["stat"].values,
                              r0["Intercept"]["stat"].values, decimal=6)

    L = np.array([[0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0]])
    # DataFrame input
    r = _MultivariateOLS(endog, exog).fit(method="svd")
    r1 = r.mv_test(hypotheses=[["Drug", L, None]])
    assert_array_almost_equal(r1["Drug"]["stat"].values,
                              r0["Drug"]["stat"].values, decimal=6)
    # Numpy array input
    r = _MultivariateOLS(endog.values, exog.values).fit(method="svd")
    r1 = r.mv_test(hypotheses=[["Drug", L, None]])
    assert_array_almost_equal(r1["Drug"]["stat"].values,
                              r0["Drug"]["stat"].values, decimal=6)
def generate_data(n, loss, wt_param=2, return_rate=False):
    """
    Generate random data for testing

    :param n: number of observations
    :param loss: 'squared', 'logistic', or 'poisson'
    :param wt_param: upper bound for the random integer weights
    :param return_rate: if True, return rates instead of raw counts/labels
    :return: weights, response, and patsy design matrix
    """
    w = np.random.randint(1, wt_param, n)
    if loss == 'squared':
        y = np.random.normal(50, 100, size=n)
    if loss == 'logistic':
        # Binomial - n number of trials, p probability [p_or_label trials] - response
        if return_rate:
            y = [sum(np.random.binomial(1, 0.1, 100) == 1) / 100.0 for _ in range(n)]
        else:
            y = np.random.binomial(1, 0.1, n)
    if loss == 'poisson':
        # Poisson - lambda expected events in an interval [avg_no_of_events_or_rate trials] - response
        if return_rate:
            y = [count * 1.0 / trials
                 for count, trials in zip(np.random.poisson(10, size=n), w)]
        else:
            y = np.random.poisson(10, size=n)

    d = {'value': y,
         'feature1': [np.random.choice(['a', 'b', 'c']) for _ in range(n)],
         'feature2': [np.random.choice(['pp', 'qq']) for _ in range(n)]}
    df = pd.DataFrame(d)
    out = ptsy.dmatrices('value ~ feature1 + feature2', data=df, return_type='dataframe')
    y, X = out
    return w, y, X
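# Example call (values are random; with loss='poisson' and the default
# return_rate=False the response column holds raw counts). X comes back as a
# patsy design matrix: an intercept plus dummy-coded feature1/feature2
# columns, typically:
w, y, X = generate_data(n=100, loss='poisson')
print(X.columns.tolist())
# ['Intercept', 'feature1[T.b]', 'feature1[T.c]', 'feature2[T.qq]']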
def __init__(self, data, eqn, **kwargs):
    self.dmatrices = patsy.dmatrices(eqn, data)
    self.eqn = eqn
    self.y = np.array(self.dmatrices[0])
    self.X = np.array(self.dmatrices[1])
    self.column_names = self.dmatrices[1].design_info.column_names

    MCMCModel_Meta.__init__(self, **kwargs)

    self.index = {}
    self.keys = []
    self.params = {}
    count = 0
    for paramname in ['beta_%d' % _ for _ in range(len(self.column_names))]:
        if paramname in kwargs:
            self.params[paramname] = kwargs[paramname]
        else:
            self.params[paramname] = Normal(0, 10)
        self.index[paramname] = count
        self.keys.append(paramname)
        count += 1

    if 'sigma' in kwargs:
        self.params['_sigma'] = kwargs['sigma']
    else:
        self.params['_sigma'] = Jeffries()
    self.keys.append('_sigma')
    self.index['_sigma'] = len(self.keys) - 1
def main():
    fname = "loans_imputed.csv"
    df = pd.read_csv(fname)
    # print(df.describe())
    df.hist()
    plt.show()

    # clean up the dataframe
    df.rename(columns={'not.fully.paid': 'not_fully_paid',
                       'credit.policy': 'credit_policy',
                       'int.rate': 'int_rate',
                       'log.annual.inc': 'log_annual_inc',
                       'days.with.cr.line': 'days_with_cr_line',
                       'revol.bal': 'revol_bal',
                       'inq.last.6mths': 'inq_last_6mths',
                       'delinq.2yrs': 'delinq_2yrs',
                       'pub.rec': 'pub_rec'}, inplace=True)

    y, X = dmatrices('not_fully_paid ~ credit_policy + int_rate + \
                     installment + log_annual_inc + dti + \
                     days_with_cr_line + revol_bal + inq_last_6mths + \
                     delinq_2yrs + pub_rec', df, return_type='dataframe')

    model = LogisticRegression()
    model.fit(X, np.ravel(y))
    predict = model.predict(X)
    print()
    print()
    print('Model accuracy: %f' % (model.score(X, np.ravel(y)) * 100.0))
    print(pd.DataFrame(list(zip(X.columns, np.transpose(model.coef_)))))
v = np.random.normal(0, var_v, n)**3

# create a pandas dataframe (easily parseable object for manipulation)
A = pd.DataFrame({'x': x, 'z': z, 'v': v})

# map the linear combination of our 3 independent variables (plus noise)
# through the sigmoid function to get success probabilities
A['log_odds'] = sigmoid(A[['x', 'z', 'v']].dot([beta_x, beta_z, beta_v]) +
                        sigma * np.random.normal(0, 1, n))

# sample the outcome from a binomial distribution.
# A binomial random variable is the number of successes in n repeated trials
# of a binomial experiment; the probability distribution of a binomial random
# variable is called a binomial distribution.
A['y'] = [np.random.binomial(1, p) for p in A.log_odds]

# create a dataframe that encompasses our input data, model formula, and outputs
y, X = dmatrices(formula, A, return_type='dataframe')

# print it
X.head(100)

# like dividing by zero (the universe collapses)
def catch_singularity(f):
    '''Silences LinAlg Errors and throws a warning instead.'''

    def silencer(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except np.linalg.LinAlgError:
            warnings.warn('Algorithm terminated - singular Hessian!')
            return args[0]

    return silencer
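# Note: `sigmoid` above comes from earlier in the original notebook and is not
# shown here; a minimal standard definition (an assumption, not the author's
# code) would be:
def sigmoid(x):
    """Logistic function mapping a linear predictor to a probability."""
    return 1.0 / (1.0 + np.exp(-x))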
def ModelLogisticReg(self):
    print('+++++++++++++++++++++++++ LOGISTIC REGRESSION 1 +++++++++++++++++++++++++')

    # Read csv file.. First get a handle
    comLRHandle = ReadCSV.Read_CSV()
    data = comLRHandle.Read(self.path)

    # Let's count the data and check for any missing info/values
    # print(data.count(0))
    # PassengerId    891
    # Survived       891
    # Pclass         891
    # Name           891
    # Sex            891
    # Age            714
    # SibSp          891
    # Parch          891
    # Ticket         891
    # Fare           891
    # Cabin          204
    # Embarked       889

    # We need to remove Name, Cabin and Ticket because these are not useful
    data = data.drop(['Ticket', 'Cabin', 'Name'], axis=1)

    # Drop NAs also.. We could fill them with the average or some other
    # method, but let's drop them for now
    data = data.dropna()

    # We'll use a Python package called Patsy, which helps in describing
    # statistical models. It helps in defining a dependent and independent
    # variable formula that is similar to R. The variable that is defined
    # left of '~' is the dependent variable, and the variables that are
    # defined to the right of it are the independent variables. The variables
    # enclosed within C() are treated as categorical variables.
    formula = 'Survived ~ C(Pclass) + C(Sex) + Age + SibSp + C(Embarked) + Parch'

    # Split the samples into training and testing sets for easy analysis
    DFTdataX = data.iloc[0:600, :]   # take the first 600 samples (for training)
    DFVdataX = data.iloc[600:, :]    # take the remaining samples (for testing)

    # Splitting the data into dependent and independent variables
    TdataY, TdataX = patsy.dmatrices(formula, data=DFTdataX, return_type='dataframe')
    VdataY, VdataX = patsy.dmatrices(formula, data=DFVdataX, return_type='dataframe')

    # Let's instantiate our model using the stats package
    LogistModel = sm.Logit(TdataY, TdataX)
    # Execute model to let it fit
    ResLogModel = LogistModel.fit()
    # print(ResLogModel.summary())
    # Logit Regression Results
    # ==============================================================================
    # Dep. Variable:     Survived            No. Observations:    600
    # Model:             Logit               Df Residuals:        591
    # Method:            MLE                 Df Model:            8
    # Date:              Fri, 13 Nov 2015    Pseudo R-squ.:       0.3333
    # Time:              22:39:44            Log-Likelihood:      -270.02
    # converged:         True                LL-Null:             -404.99
    #                                        LLR p-value:         1.009e-53
    # ====================================================================================
    #                       coef   std err        z     P>|z|   [95.0% Conf. Int.]
    # ------------------------------------------------------------------------------------
    # -Intercept          4.3332     0.510    8.490    0.000     3.333     5.334
    # -C(Pclass)[T.2]    -1.2030     0.325   -3.703    0.000    -1.840    -0.566
    # -C(Pclass)[T.3]    -2.4673     0.320   -7.705    0.000    -3.095    -1.840
    # -C(Sex)[T.male]    -2.6312     0.244  -10.797    0.000    -3.109    -2.154
    # +C(Embarked)[T.Q]  -0.4359     0.647   -0.674    0.501    -1.704     0.832
    # +C(Embarked)[T.S]  -0.2910     0.297   -0.980    0.327    -0.873     0.291
    # -Age               -0.0397     0.009   -4.464    0.000    -0.057    -0.022
    # -SibSp             -0.3202     0.136   -2.354    0.019    -0.587    -0.054
    # +Parch             -0.1420     0.136   -1.041    0.298    -0.409     0.125
    # ====================================================================================

    # As we can see, Pseudo R-squ. = 0.333, which is good: any value between
    # 0.2 and 0.4 is OK. We can also see that Embarked and Parch have
    # P > 0.05, so they don't have much significance for the prediction.
    # Since we of course want fewer predictors, let's redesign our formula
    # and see what happens.

    # Let's update the formula again, removing Embarked and Parch
    formula = 'Survived ~ C(Pclass) + C(Sex) + Age + SibSp '

    # Splitting the data into dependent and independent variables
    TdataY, TdataX = patsy.dmatrices(formula, data=DFTdataX, return_type='dataframe')
    VdataY, VdataX = patsy.dmatrices(formula, data=DFVdataX, return_type='dataframe')

    # Let's instantiate our model using the stats package
    LogistModel = sm.Logit(TdataY, TdataX)
    # Execute model to let it fit
    ResLogModel = LogistModel.fit()
    # print(ResLogModel.summary())
    # Logit Regression Results
    # ==============================================================================
    # Dep. Variable:     Survived            No. Observations:    600
    # Model:             Logit               Df Residuals:        594
    # Method:            MLE                 Df Model:            5
    # Date:              Sun, 15 Nov 2015    Pseudo R-squ.:       0.3307
    # Time:              12:26:13            Log-Likelihood:      -271.08
    # converged:         True                LL-Null:             -404.99
    #                                        LLR p-value:         8.172e-56
    # ==================================================================================
    #                     coef   std err        z     P>|z|   [95.0% Conf. Int.]
    # ----------------------------------------------------------------------------------
    # -Intercept        4.1050     0.479    8.575    0.000     3.167     5.043
    # -C(Pclass)[T.2]  -1.2971     0.306   -4.242    0.000    -1.896    -0.698
    # -C(Pclass)[T.3]  -2.5739     0.305   -8.433    0.000    -3.172    -1.976
    # -C(Sex)[T.male]  -2.5808     0.235  -10.996    0.000    -3.041    -2.121
    # -Age             -0.0401     0.009   -4.549    0.000    -0.057    -0.023
    # -SibSp           -0.3691     0.130   -2.840    0.005    -0.624    -0.114
    # ==================================================================================

    # We can see that all the predictors are significant in the preceding
    # model. Let's evaluate the model and see how well it works with the
    # validation/testing data. We will use Kernel Density Estimation.
    kde_res = sm.nonparametric.KDEUnivariate(ResLogModel.predict())
    kde_res.fit()
    # plt.plot(kde_res.support, kde_res.density)
    # plt.fill_between(kde_res.support, kde_res.density, alpha=0.2)
    # plt.title("Distribution of our Predictions")
    # plt.show()

    # From the plot we can see that most of the distribution (highest
    # density) is near 0; that means most of the people died, which is true
    # for the Titanic dataset.

    # Let's see the prediction distribution based on the male gender:
    # plt.scatter(ResLogModel.predict(), TdataX['C(Sex)[T.male]'], alpha=0.2)
    # plt.grid(b=True, which='major', axis='x')
    # plt.xlabel("Predicted chance of survival")
    # plt.ylabel("Male Gender")
    # plt.title("The Change of Survival Probability by Gender being Male")
    # plt.show()

    # As we can see from the plot, the probability of survival is higher for
    # females compared to males.

    # Now, let's see the distribution of the prediction based on the lower
    # class of the passengers:
    # plt.scatter(ResLogModel.predict(), TdataX['C(Pclass)[T.3]'], alpha=0.2)
    # plt.xlabel("Predicted chance of survival")
    # plt.ylabel("Class Bool")  # Boolean class to show if it's 3rd class
    # plt.grid(b=True, which='major', axis='x')
    # plt.title("The Change of Survival Probability by Lower Class which is 3rd class")
    # plt.show()

    # We can see from the plot that lower-class people had a lower chance of
    # survival compared to the upper class. More money could save you...

    # Let's see the distribution of the probability with respect to the age
    # of the passengers:
    # plt.scatter(ResLogModel.predict(), TdataX.Age, alpha=0.2)
    # plt.grid(True, linewidth=0.15)
    # plt.title("The Change of Survival Probability by Age")
    # plt.xlabel("Predicted chance of survival")
    # plt.ylabel("Age")
    # plt.show()

    # If we look at the graph, there are two outcomes...
    # 1. Small children around 0-1 years of age have a predicted chance of
    #    survival spread over the full range.
    # 2. As age increases, the chance of survival moves to the left of the
    #    graph, i.e. a lower chance of survival.
    # Unlike the two binary plots above, this graph is a distribution over a
    # wide range of ages.

    # Let's see the distribution of the probability with respect to the
    # number of siblings/spouses:
    # plt.scatter(ResLogModel.predict(), TdataX.SibSp, alpha=0.2)
    # plt.grid(True, linewidth=0.15)
    # plt.title("The Change of Survival Probability by Number of siblings/spouses")
    # plt.xlabel("Predicted chance of survival")
    # plt.ylabel("No. of Siblings/Spouses")
    # plt.show()

    # The fewer family members on board, the better the chances of survival.

    ## Evaluating the model based on test data ##
    y_pred = ResLogModel.predict(VdataX)
    y_pred_flag = y_pred > 0.7
    print('------------------------------------------------------------------------------------------')
    print(pd.crosstab(VdataY.Survived, y_pred_flag, rownames=['Actual'], colnames=['Predicted']))
    print('------------------------------------------------------------------------------------------')
    print(classification_report(VdataY, y_pred_flag))
    print('------------------------------------------------------------------------------------------')
def predict(self, h=5, intervals=False, oos_data=None, **kwargs):
    """ Makes forecast with the estimated model

    Parameters
    ----------
    h : int (default : 5)
        How many steps ahead would you like to forecast?

    intervals : boolean (default: False)
        Whether to return prediction intervals

    oos_data : pd.DataFrame
        Data for the variables to be used out of sample (ys can be NaNs)

    Returns
    ----------
    - pd.DataFrame with predictions
    """
    nsims = kwargs.get('nsims', 200)

    if self.latent_variables.estimated is False:
        raise Exception("No latent variables estimated!")
    else:
        _, X_oos = dmatrices(self.formula, oos_data)
        X_oos = np.array([X_oos])[0]
        full_X = self.X.copy()
        full_X = np.append(full_X, X_oos, axis=0)
        Z = full_X
        date_index = self.shift_dates(h)

        # Retrieve data, dates and (transformed) latent variables
        if self.latent_variables.estimation_method in ['M-H']:
            lower_1_final = 0
            upper_99_final = 0
            lower_5_final = 0
            upper_95_final = 0
            forecasted_values_final = 0

            for i in range(nsims):
                t_params = self.draw_latent_variables(nsims=1).T[0]
                a, P = self._forecast_model(t_params, Z, h)
                smoothed_series = np.zeros(h)
                series_variance = np.zeros(h)
                for t in range(h):
                    smoothed_series[t] = np.dot(Z[self.y.shape[0]+t], a[:, self.y.shape[0]+t])
                    series_variance[t] = np.dot(
                        np.dot(Z[self.y.shape[0]+t], P[:, :, self.y.shape[0]+t]),
                        Z[self.y.shape[0]+t].T)

                forecasted_values = smoothed_series
                lower_5 = smoothed_series - 1.96*np.power(
                    P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(t_params[0]), 0.5)
                upper_95 = smoothed_series + 1.96*np.power(
                    P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(t_params[0]), 0.5)
                lower_5_final += lower_5
                upper_95_final += upper_95
                lower_1 = smoothed_series - 2.575*np.power(
                    P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(t_params[0]), 0.5)
                upper_99 = smoothed_series + 2.575*np.power(
                    P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(t_params[0]), 0.5)
                lower_1_final += lower_1
                upper_99_final += upper_99
                forecasted_values_final += forecasted_values

            forecasted_values_final = forecasted_values_final / nsims
            lower_1_final = lower_1_final / nsims
            lower_5_final = lower_5_final / nsims
            upper_95_final = upper_95_final / nsims
            upper_99_final = upper_99_final / nsims

            if intervals is False:
                result = pd.DataFrame(forecasted_values_final)
                result.rename(columns={0: self.data_name}, inplace=True)
            else:
                prediction_05 = lower_5_final
                prediction_95 = upper_95_final
                prediction_01 = lower_1_final
                prediction_99 = upper_99_final

                result = pd.DataFrame([forecasted_values_final, prediction_01,
                                       prediction_05, prediction_95, prediction_99]).T
                result.rename(columns={0: self.data_name,
                                       1: "1% Prediction Interval",
                                       2: "5% Prediction Interval",
                                       3: "95% Prediction Interval",
                                       4: "99% Prediction Interval"}, inplace=True)

            result.index = date_index[-h:]
            return result

        else:
            t_params = self.latent_variables.get_z_values()
            a, P = self._forecast_model(t_params, Z, h)
            smoothed_series = np.zeros(h)
            for t in range(h):
                smoothed_series[t] = np.dot(Z[self.y.shape[0]+t], a[:, self.y.shape[0]+t])

            # Retrieve data, dates and (transformed) latent variables
            forecasted_values = smoothed_series

            if intervals is False:
                result = pd.DataFrame(forecasted_values)
                result.rename(columns={0: self.data_name}, inplace=True)
            else:
                series_variance = np.zeros(h)
                for t in range(h):
                    series_variance[t] = np.dot(
                        np.dot(Z[self.y.shape[0]+t], P[:, :, self.y.shape[0]+t]),
                        Z[self.y.shape[0]+t].T)

                prediction_05 = forecasted_values - 1.96*np.power(
                    P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(
                        self.latent_variables.get_z_values()[0]), 0.5)
                prediction_95 = forecasted_values + 1.96*np.power(
                    P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(
                        self.latent_variables.get_z_values()[0]), 0.5)
                prediction_01 = forecasted_values - 2.575*np.power(
                    P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(
                        self.latent_variables.get_z_values()[0]), 0.5)
                prediction_99 = forecasted_values + 2.575*np.power(
                    P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(
                        self.latent_variables.get_z_values()[0]), 0.5)

                result = pd.DataFrame([forecasted_values, prediction_01,
                                       prediction_05, prediction_95, prediction_99]).T
                result.rename(columns={0: self.data_name,
                                       1: "1% Prediction Interval",
                                       2: "5% Prediction Interval",
                                       3: "95% Prediction Interval",
                                       4: "99% Prediction Interval"}, inplace=True)

            result.index = date_index[-h:]
            return result
def plot_predict(self, h=5, past_values=20, intervals=True, oos_data=None, **kwargs):
    """ Makes forecast with the estimated model

    Parameters
    ----------
    h : int (default : 5)
        How many steps ahead would you like to forecast?

    past_values : int (default : 20)
        How many past observations to show on the forecast graph?

    intervals : Boolean
        Would you like to show 95% prediction intervals for the forecast?

    oos_data : pd.DataFrame
        Data for the variables to be used out of sample (ys can be NaNs)

    Returns
    ----------
    - Plot of the forecast
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    figsize = kwargs.get('figsize', (10, 7))
    nsims = kwargs.get('nsims', 200)

    if self.latent_variables.estimated is False:
        raise Exception("No latent variables estimated!")
    else:
        _, X_oos = dmatrices(self.formula, oos_data)
        X_oos = np.array([X_oos])[0]
        full_X = self.X.copy()
        full_X = np.append(full_X, X_oos, axis=0)
        Z = full_X
        date_index = self.shift_dates(h)

        # Retrieve data, dates and (transformed) latent variables
        if self.latent_variables.estimation_method in ['M-H']:
            lower_final = 0
            upper_final = 0
            plot_values_final = 0
            plot_index = date_index[-h-past_values:]

            for i in range(nsims):
                t_params = self.draw_latent_variables(nsims=1).T[0]
                a, P = self._forecast_model(t_params, Z, h)
                smoothed_series = np.zeros(self.y.shape[0]+h)
                series_variance = np.zeros(self.y.shape[0]+h)
                for t in range(self.y.shape[0]+h):
                    smoothed_series[t] = np.dot(Z[t], a[:, t])
                    series_variance[t] = np.dot(np.dot(Z[t], P[:, :, t]), Z[t].T)

                plot_values = smoothed_series[-h-past_values:]
                lower = smoothed_series[-h:] - 1.96*np.power(
                    P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(t_params[0]), 0.5)
                upper = smoothed_series[-h:] + 1.96*np.power(
                    P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(t_params[0]), 0.5)
                lower_final += np.append(plot_values[-h-1], lower)
                upper_final += np.append(plot_values[-h-1], upper)
                plot_values_final += plot_values

            plot_values_final = plot_values_final / nsims
            lower_final = lower_final / nsims
            upper_final = upper_final / nsims

            plt.figure(figsize=figsize)
            if intervals == True:
                plt.fill_between(date_index[-h-1:], lower_final, upper_final, alpha=0.2)

            plt.plot(plot_index, plot_values_final)
            plt.title("Forecast for " + self.data_name)
            plt.xlabel("Time")
            plt.ylabel(self.data_name)
            plt.show()
        else:
            # note: Z is needed here as well, matching the call in the M-H branch
            a, P = self._forecast_model(self.latent_variables.get_z_values(), Z, h)
            plot_values = a[0][-h-past_values:]
            forecasted_values = a[0][-h:]
            smoothed_series = np.zeros(self.y.shape[0]+h)
            series_variance = np.zeros(self.y.shape[0]+h)
            for t in range(self.y.shape[0]+h):
                smoothed_series[t] = np.dot(Z[t], a[:, t])
                series_variance[t] = np.dot(np.dot(Z[t], P[:, :, t]), Z[t].T)

            lower = forecasted_values - 1.96*np.power(
                P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(
                    self.latent_variables.get_z_values()[0]), 0.5)
            upper = forecasted_values + 1.96*np.power(
                P[0][0][-h:] + self.latent_variables.z_list[0].prior.transform(
                    self.latent_variables.get_z_values()[0]), 0.5)
            lower = np.append(plot_values[-h-1], lower)
            upper = np.append(plot_values[-h-1], upper)
            plot_index = date_index[-h-past_values:]

            plt.figure(figsize=figsize)
            if intervals == True:
                plt.fill_between(date_index[-h-1:], lower, upper, alpha=0.2)

            plt.plot(plot_index, plot_values)
            plt.title("Forecast for " + self.data_name)
            plt.xlabel("Time")
            plt.ylabel(self.data_name)
            plt.show()
foo = pd.read_csv("./data_vectorised/all_predictors_improved.csv")
foo['RESULT'] = Series(data['type'], index=foo.index)
foo['ID'] = Series(data['id'], index=foo.index)
foo.to_csv('./data_vectorised/reducedVectorised.csv', sep=',', index=False)

# # Logistic Regression

# In[55]:

data2 = pd.read_csv("./data_vectorised/reducedVectorised.csv")

# In[57]:

y, X = dmatrices("RESULT ~ flu + gett + im + shot + think + have + sick + feel + am + you + got + bett + worried + hope + today + vaccine + scared + week + has + back + home + might + worse + year + fev + she + already + try + they + bed + bug + symptom + dr + bit + care + weekend + hand + stomach + rest + old + hell + health + suck + us",
                 data2, return_type="dataframe")

# flatten y into a 1-D array
y = np.ravel(y)

# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(X, y)

# check the accuracy on the training set
model.score(X, y)

# In[ ]:
                 alpha=0.7)

# Label the silhouette plots with their cluster numbers at the middle
ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

# Compute the new y_lower for next plot
y_lower = y_upper + 10  # 10 for the 0 samples

plt.show()

################ Linear Regression ################

# target Y = rating
# remove calories as it is a linear combination of everything else
# y, x = patsy.dmatrices("rating~protein+fat+sodium+fiber+carbo+sugars+potass+C(vitamins)+C(shelf)+cups+C(mfr)+C(cluster)", cereal)
y, x = patsy.dmatrices("rating~protein+fat+sodium+fiber+carbo+sugars+potass",
                       cereal)

pca_regression = pca_cereals.copy()
pca_regression.rename({0: "PC1", 1: "PC2", 2: "PC3", 3: "PC4"},
                      axis=1, inplace=True)
pca_regression["rating"] = cereal["rating"]
y, x = patsy.dmatrices("rating~protein+fat+sodium+fiber+carbo+sugars+potass",
def plot_predict(self, h=5, past_values=20, intervals=True, oos_data=None, **kwargs):
    """ Plots forecast with the estimated model

    Parameters
    ----------
    h : int (default : 5)
        How many steps ahead would you like to forecast?

    past_values : int (default : 20)
        How many past observations to show on the forecast graph?

    intervals : Boolean
        Would you like to show prediction intervals for the forecast?

    oos_data : pd.DataFrame
        Data for the variables to be used out of sample (ys can be NaNs)

    Returns
    ----------
    - Plot of the forecast
    """
    import matplotlib.pyplot as plt

    figsize = kwargs.get('figsize', (10, 7))

    if self.latent_variables.estimated is False:
        raise Exception("No latent variables estimated!")
    else:
        # Retrieve data, dates and (transformed) latent variables
        _, X_oos = dmatrices(self.formula, oos_data)
        X_oos = np.array([X_oos])[0]
        X_pred = X_oos[:h]
        lmda, Y, scores, theta = self._model(self.latent_variables.get_z_values())
        date_index = self.shift_dates(h)
        t_params = self.transform_z()

        # Get mean prediction and simulations (for errors)
        mean_values = self._mean_prediction(lmda, Y, scores, h, t_params, X_pred)
        sim_values = self._sim_prediction(lmda, Y, scores, h, t_params, 15000, X_pred)
        error_bars, forecasted_values, plot_values, plot_index = self._summarize_simulations(
            mean_values, sim_values, date_index, h, past_values)

        plt.figure(figsize=figsize)
        if intervals == True:
            alpha = [0.15 * i / float(100) for i in range(50, 12, -2)]
            for count, pre in enumerate(error_bars):
                plt.fill_between(date_index[-h - 1:],
                                 np.exp((forecasted_values - pre) / 2),
                                 np.exp((forecasted_values + pre) / 2),
                                 alpha=alpha[count])

        plt.plot(plot_index, np.exp(plot_values / 2.0))
        plt.title("Forecast for " + self.data_name + " Conditional Volatility")
        plt.xlabel("Time")
        plt.ylabel(self.data_name)
        plt.show()
def rformula(df, formula):
    """
    Split a data frame into X and y based on an R formula.

    Based on patsy formulas. See
    https://patsy.readthedocs.io/en/latest/formulas.html for valid formulas.

    Returns
    -------
    A tuple where the first element is a pandas DataFrame containing the
    independent variables, and the second is a pandas DataFrame containing
    the dependent variable.

    Example
    -------
    >>> df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9]))
    >>> X, y = df.rformula('a ~ b')
    >>> X
         b
    0  4.0
    1  5.0
    2  6.0
    >>> y
         a
    0  1.0
    1  2.0
    2  3.0
    >>> X, y = df.rformula('c ~ a + b')
    >>> X
         a    b
    0  1.0  4.0
    1  2.0  5.0
    2  3.0  6.0
    >>> y
         c
    0  7.0
    1  8.0
    2  9.0
    >>> X, y = df.rformula('b ~ a + a:c')
    >>> X
         a   a:c
    0  1.0   7.0
    1  2.0  16.0
    2  3.0  27.0
    >>> y
         b
    0  4.0
    1  5.0
    2  6.0
    >>> X, y = df.rformula('b ~ a*c')
    >>> X
         a    c   a:c
    0  1.0  7.0   7.0
    1  2.0  8.0  16.0
    2  3.0  9.0  27.0
    >>> y
         b
    0  4.0
    1  5.0
    2  6.0
    """
    y, X = patsy.dmatrices(formula, df, return_type="dataframe")
    return X.drop(columns="Intercept"), y
plt.show()

# Look at survival in the lowest class

# In[ ]:

lowclass = data.Survived[data.Pclass == 3].value_counts().sort_index()
lowclass.plot(kind='bar', label='Lowclass', color='Blue', alpha=0.6)
plt.show()

# dmatrices turns the categorical variables in the data into dummy variables,
# and specifies that Pclass, Sex and Embarked are used to predict Survived

# In[ ]:

y, X = dmatrices('Survived~ C(Pclass) + C(Sex) + Age + C(Embarked)',
                 data=data, return_type='dataframe')
y = np.ravel(y)

# In[ ]:

model = LogisticRegression()

# In[ ]:

model.fit(X, y)

# Print the training accuracy

# In[ ]:
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score

# loading data
dta = sm.datasets.fair.load_pandas().data

# add "affair" column: 1 represents having affairs, 0 represents not
dta['affair'] = (dta.affairs > 0).astype(int)

# Prepare Data for Logistic Regression
# To prepare the data, I want to add an intercept column as well as dummy
# variables for occupation and occupation_husb, since I'm treating them as
# categorical variables. The dmatrices function from the patsy module can do
# that using formula language.
y, X = dmatrices('affair ~ rate_marriage + age + yrs_married + children + \
                 religious + educ + C(occupation) + C(occupation_husb)',
                 dta, return_type="dataframe")

# rename the columns
X = X.rename(columns={'C(occupation)[T.2.0]': 'occ_2',
                      'C(occupation)[T.3.0]': 'occ_3',
                      'C(occupation)[T.4.0]': 'occ_4',
                      'C(occupation)[T.5.0]': 'occ_5',
                      'C(occupation)[T.6.0]': 'occ_6',
                      'C(occupation_husb)[T.2.0]': 'occ_husb_2',
                      'C(occupation_husb)[T.3.0]': 'occ_husb_3',
                      'C(occupation_husb)[T.4.0]': 'occ_husb_4',
                      'C(occupation_husb)[T.5.0]': 'occ_husb_5',
                      'C(occupation_husb)[T.6.0]': 'occ_husb_6'})
def tabulate_march_inequality(year): """ # For years 1964-2009 (year is March year, not earnings year), tabulate: These inequality metrics: - 90/50, 50/10, 90/10, Vln - 60/50, 70/50, 80/50, 95/50, 97/50 - 50/3, 50/5, 50/20, 50/30, 50/40 For these samples - Males - Females - Both For these wage measures - All hourly For these conditioning variables - raw wage inequality - residual wage inequality Also note: - Always dropping allocators where possible D. Autor, 2/24/2004 D. Autor, 6/15/2004 - Updated for consistency of controls for quantime simulation methods M. Anderson, 12/13/2005 - Updated for new quantiles and years D. Autor, 9/5/2006. Updated for 2005 March M. Wasserman, 10/14/2009 Updated for 2007/8 March # """ df = tabulate_march_basic(year) df = df.eval(""" lnwinc = log(winc_ws) + log(gdp) lnhinc = log(hinc_ws) + log(gdp) """) # Full-time and hourly samples df = df.eval("ftfy = fulltime*fullyear") df.ftfy.describe().to_frame().T df = df.eval(""" ftsamp = (lnwinc == lnwinc) * ftfy * abs(bcwkwgkm-1) hrsamp = (lnhinc == lnhinc) * abs(bchrwgkm-1) """) # @ ftsamp: weekly real wage not none + ftfy + above weekly real wage limit # @ hrsamp: hourly real wage not none + above hourly real wage limit df.loc[df.ftsamp == 0, "lnwinc"] = np.nan df.loc[df.hrsamp == 0, "lnhinc"] = np.nan df.query("ftsamp == 1")["lnwinc"].describe().to_frame().T df.query("hrsamp == 1")["lnhinc"].describe().to_frame().T df = df.query("ftsamp == 1 | hrsamp == 1") # Generate experience categories df = df.assign(expcat=(df.exp/3).astype(int) + 1) df.loc[df.expcat == 17, "expcat"] = 16 assert df.eval("1<= expcat <= 16").all() df.groupby("expcat")["exp"].agg(["mean", "min", "max"]) # interaction terms - 80 of these # @ move to residual wage part # Drop reference group's interaction term: HSG with 0-2 years of experience # @ simiarly skip now df = df.filter(["year", "wgt", "wgt_hrs", "female", "lnwinc", "lnhinc", "hrsamp", "ftsamp", "edcat", "expcat"]) ###################################################################### # Summarize raw inequality ###################################################################### pctiles = pd.Series([3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 97]) pctiles_ = pctiles / 100 tot_pct = pd.DataFrame(index=pctiles) tot_stat = pd.DataFrame(index=["mn", "vln"]) dt = df.query("ftsamp==1") wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt) tot_pct["tot_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False) tot_stat["tot_ft_mf"] = [wq.mean, wq.var] dt = df.query("ftsamp==1 & female==0") wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt) tot_pct["tot_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False) tot_stat["tot_ft_m"] = [wq.mean, wq.var] dt = df.query("ftsamp==1 & female==1") wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt) tot_pct["tot_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False) tot_stat["tot_ft_f"] = [wq.mean, wq.var] dt = df.query("hrsamp==1") wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs) tot_pct["tot_hr_mf"] = wq.quantile(probs=pctiles_, return_pandas=False) tot_stat["tot_hr_mf"] = [wq.mean, wq.var] dt = df.query("hrsamp==1 & female==0") wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs) tot_pct["tot_hr_m"] = wq.quantile(probs=pctiles_, return_pandas=False) tot_stat["tot_hr_m"] = [wq.mean, wq.var] dt = df.query("hrsamp==1 & female==1") wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs) tot_pct["tot_hr_f"] = wq.quantile(probs=pctiles_, return_pandas=False) tot_stat["tot_hr_f"] = [wq.mean, wq.var] df_stat = pd.concat([tot_stat, tot_pct], axis=0, sort=False) 
    ######################################################################
    # Summarize residual inequality - Weekly & Hourly
    ######################################################################
    res_pct = pd.DataFrame(index=pctiles)
    res_stat = pd.DataFrame(index=["mn", "vln"])

    # (column label, sample filter, wage measure, weight, pooled?) per cut;
    # pooled cuts add a female dummy to the education x experience interactions.
    res_cuts = [
        ("res_ft_mf", "ftsamp==1",             "lnwinc", "wgt",     True),
        ("res_ft_m",  "ftsamp==1 & female==0", "lnwinc", "wgt",     False),
        ("res_ft_f",  "ftsamp==1 & female==1", "lnwinc", "wgt",     False),
        ("res_hr_mf", "hrsamp==1",             "lnhinc", "wgt_hrs", True),
        ("res_hr_m",  "hrsamp==1 & female==0", "lnhinc", "wgt_hrs", False),
        ("res_hr_f",  "hrsamp==1 & female==1", "lnhinc", "wgt_hrs", False),
    ]
    for col, flt, wage, wgt, pooled in res_cuts:
        dt = df.query(flt)
        rhs = "female + C(edcat) : C(expcat) - 1" if pooled else "C(edcat) : C(expcat) - 1"
        y, X = dmatrices(f"{wage} ~ {rhs}", dt, return_type="dataframe")
        # Drop the reference cell (HSG with 0-2 years of experience), then add a constant
        X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
        resid = sm.WLS(y, X, weights=dt[wgt]).fit().resid
        wq = DescrStatsW(data=resid, weights=dt[wgt])
        res_stat[col] = [wq.mean, wq.var]  # the mean is ~0 by construction; kept for consistency
        res_pct[col] = wq.quantile(probs=pctiles_, return_pandas=False)

    df_stat_ = pd.concat([res_stat, res_pct], axis=0)
    df_stat = pd.concat([df_stat, df_stat_], axis=1)
    # march-ineq-data-`1'
    df_stat = df_stat.T.rename_axis('sample').reset_index().assign(year=year)  # tidy data

    ######################################################################
    # Percentiles of weekly earnings
    ######################################################################
    # Generate a finer percentile grid for the full-time samples. Note that
    # `year` is the March census year, so subtract one to get the earnings year.
    pctiles = pd.Series(range(3, 98))
    pctiles_ = pctiles / 100
    tot_pct = pd.DataFrame(index=pctiles)
    for col, flt in [("tot_ft_mf", "ftsamp==1"),
                     ("tot_ft_m",  "ftsamp==1 & female==0"),
                     ("tot_ft_f",  "ftsamp==1 & female==1")]:
        dt = df.query(flt)
        wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
        tot_pct[col] = wq.quantile(probs=pctiles_, return_pandas=False)
    # march-pctile-`yr'
    tot_pct = tot_pct.T.rename_axis('sample').reset_index().assign(year=year - 1)  # tidy data
    # The original code then combines the generated 1963-2008 files; that step
    # is dropped here and could instead be folded into the raw-inequality
    # summary above.
    return df_stat, tot_pct
def __init__(self, formula, data): family = Gaussian() smooth_info = parse_smooths(formula, data) formula = get_parametric_formula(formula) y, Xp = patsy.dmatrices(formula, data, return_type='dataframe', eval_env=1) varnames = Xp.columns.tolist() smooths = {} start = p = Xp.shape[1] ns = 0 for key, val in smooth_info.items(): slist = get_smooth(**val) if len(slist) == 1: smooths[key], = slist p_i = smooths[key]['X'].shape[1] varnames += [f"{key}{j}" for j in range(1, p_i + 1)] p += p_i ns += 1 else: for i, x in enumerate(slist): by_key = f"{key}_{x['by_cat']}" smooths[by_key] = x p_i = x['X'].shape[1] varnames += [f"{by_key}_{j}" for j in range(1, p_i + 1)] p += p_i ns += 1 X, S, Sj, ranks, ldS = [Xp], np.zeros((ns, p, p)), [], [], [] for i, (var, s) in enumerate(smooths.items()): p_i = s['X'].shape[1] Si, ix = np.zeros((p, p)), np.arange(start, start + p_i) start += p_i Si[ix, ix.reshape(-1, 1)] = s['S'] smooths[var]['ix'], smooths[var]['Si'] = ix, Si X.append(smooths[var]['X']) S[i] = Si Sj.append(s['S']) ranks.append(np.linalg.matrix_rank(Si)) u = np.linalg.eigvals(s['S']) ldS.append(np.log(u[u > np.finfo(float).eps]).sum()) self.X, self.Xp, self.y = np.concatenate( X, axis=1), Xp.values, y.values[:, 0] self.S, self.Sj, self.ranks, self.ldS = S, Sj, ranks, ldS self.f, self.smooths = family, smooths self.ns, self.n_obs, self.nx = ns, self.X.shape[0], self.X.shape[1] self.mp = self.nx - np.sum(self.ranks) self.data = data theta = np.zeros(self.ns + 1) for i, (var, s) in enumerate(smooths.items()): ix = smooths[var]['ix'] a = self.S[i][ix, ix[:, None].T] d = np.diag(self.X[:, ix].T.dot(self.X[:, ix])) lam = (1.5 * (d / a)[a > 0]).mean() theta[i] = np.log(lam) varnames += [f"log_smooth_{var}"] theta[-1] = 1.0 varnames += ["log_scale"] self.theta = theta self.varnames = varnames self.smooth_info = smooth_info
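# A tiny numpy-only sketch (not part of the class above) of the indexing trick
# used there to embed each smoother's penalty into the full p x p matrix:
# Si[ix, ix.reshape(-1, 1)] = S fills the block at the smoother's columns. The
# implicit transpose is harmless because penalty matrices are symmetric.
import numpy as np

p, start = 6, 2                      # total columns and block offset (toy values)
S_small = np.array([[2.0, -1.0],
                    [-1.0, 2.0]])    # a 2x2 penalty block
ix = np.arange(start, start + S_small.shape[0])

Si = np.zeros((p, p))
Si[ix, ix.reshape(-1, 1)] = S_small
print(Si)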
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from patsy import dmatrices
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import scale

df = pd.read_csv('Hitters.csv').dropna()  # remove unusable data
# (no-op as written: the result below is never assigned; the formula selects
# its columns explicitly anyway)
df.iloc[:, 1:19].drop(columns=['League', 'Division'])

# split data
train, test = np.split(df.sample(frac=1), [int(0.5 * len(df))])
formula = ('Salary~AtBat+Hits+HmRun+Runs+RBI+Walks+Years+CAtBat+CHits+'
           'CHmRun+CRuns+CRBI+CWalks+PutOuts+Assists+Errors')
y_train, x_train = dmatrices(formula, train, return_type='dataframe')
y_test, x_test = dmatrices(formula, test, return_type='dataframe')

# our two methods
ridge = Ridge()
lasso = Lasso(max_iter=10000)
alphas_selected = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
for a in alphas_selected:
    ridge.set_params(alpha=a)
    lasso.set_params(alpha=a)
    x_tr = scale(x_train)
    y_tr = scale(y_train).ravel()
    ridge.fit(x_tr, y_tr)
    lasso.fit(x_tr, y_tr)
    preds_ridge = ridge.predict(x_tr)  # predict on the same scaled features
    preds_lasso = lasso.predict(x_tr)
    print('MSE RIDGE alpha=', a, ":", np.mean((preds_ridge - y_tr) ** 2))
    print('MSE LASSO alpha=', a, ":", np.mean((preds_lasso - y_tr) ** 2))
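# A leakage-safe variant of the loop above: wrapping the scaler and the model
# in a sklearn Pipeline ties the scaling statistics to the training data, so
# the held-out set never informs them. (Sketch reusing x_train/x_test from
# above; y is left unscaled here, so coefficients differ from the scaled fit.)
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

for a in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    pipe = make_pipeline(StandardScaler(), Ridge(alpha=a))
    pipe.fit(x_train, y_train.values.ravel())
    mse = mean_squared_error(y_test.values.ravel(), pipe.predict(x_test))
    print('test MSE, alpha =', a, ':', mse)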
def vif(dt, y, x=None, merge_coef=False, positive="bad|1"):
    '''
    Variance Inflation Factors
    ------
    vif calculates variance-inflation factors for logistic regression.

    Params
    ------
    dt: A data frame with both x (predictor/feature) and y (response/label) variables.
    y: Name of y variable.
    x: Name of x variables. Default is None. If x is None, then all variables
      except y are counted as x variables.
    merge_coef: Logical, whether to merge with coefficients of the model
      summary matrix. Defaults to False.
    positive: Value of positive class, default "bad|1".

    Returns
    ------
    data frame
        A data frame with columns for variable and gvif.

    Examples
    ------
    import scorecardpy as sc

    # load data
    dat = sc.germancredit()

    # Example I
    sc.vif(dat, y = 'creditability', x=['age_in_years', 'credit_amount',
      'present_residence_since'], merge_coef=True)
    '''
    dt = dt.copy(deep=True)
    if isinstance(y, str):
        y = [y]
    if isinstance(x, str):
        x = [x]
    if x is not None:
        dt = dt[y + x]
    # check y
    dt = check_y(dt, y, positive)
    # x variables
    x = x_variable(dt, y, x)

    # dty, dtx
    ytrain = dt.loc[:, y]
    Xtrain = sm.add_constant(dt.loc[:, x])

    # logistic regression
    lrfit = sm.GLM(ytrain.astype(float), Xtrain.astype(float),
                   family=sm.families.Binomial()).fit()

    # vif: rebuild the design matrix with patsy and take the VIF of each column.
    # Patsy names the constant 'Intercept'; rename it 'const' so it matches the
    # statsmodels coefficient table when merging below.
    dty, dtX = dmatrices(' ~ '.join([y[0], '+'.join(x)]), data=dt,
                         return_type="dataframe")
    dfvif = pd.DataFrame({
        'variables': ['const' if c == 'Intercept' else c for c in dtX.columns],
        'vif': [variance_inflation_factor(dtX.values, i)
                for i in range(dtX.shape[1])]
    })

    # merge with coef
    if merge_coef:
        dfvif = pd.merge(
            lrfit.summary2().tables[1].reset_index().rename(
                columns={'index': 'variables'}),
            dfvif, on='variables', how='outer')
    return dfvif
PCS = [1, 2, 3]
pdf = PdfPages(os.path.join(OUTPUT, "pc_clinic_associations.pdf"))
for target in targets:
    #target = 'MMSE'
    #target = 'TMTB'
    dt = data[data[target].notnull()]
    y = dt[target]
    fig, axarr = plt.subplots(1, 3)  #, sharey=True)
    fig.set_figwidth(15)
    print(fig.get_figwidth())
    for j, pc in enumerate(PCS):
        #j, pc = 2, 3
        # --------------------------------
        model = '%s~PC%s+AGE_AT_INCLUSION+SEX+EDUCATION+BPF+LLV' % (target, pc)
        # --------------------------------
        y, X = dmatrices(model, data=dt, return_type='dataframe')
        mod = sm.OLS(y, X).fit()
        # t-test on the PC coefficient (second design column)
        test = mod.t_test([0, 1] + [0] * (X.shape[1] - 2))
        tval, pval = test.tvalue[0, 0], test.pvalue[0, 0]
        x = dt["PC%i" % pc]
        axarr[j].scatter(x, y)
        if False:  # optionally label points with subject IDs
            for i in range(len(dt['Subject ID'])):
                axarr[j].text(dt.loc[i, "PC%i" % pc], y.iloc[i, 0],
                              dt['Subject ID'][i])
        x_ext = np.array([x.min(), x.max()])
        y_ext = x_ext * mod.params.iloc[1] + y.mean().values  #mod.params[0]
        axarr[j].plot(x_ext, y_ext, "red")
        if j == 0:
            axarr[j].set_ylabel(target)
        axarr[j].set_xlabel('PC%i (T=%.3f, P=%.4g)' % (pc, tval, pval))
def plot_predict(self, h=5, past_values=20, intervals=True, oos_data=None, **kwargs):
    """ Makes a forecast with the estimated model and plots it

    Parameters
    ----------
    h : int (default : 5)
        How many steps ahead would you like to forecast?

    past_values : int (default : 20)
        How many past observations to show on the forecast graph?

    intervals : boolean
        Would you like to show 95% prediction intervals for the forecast?

    oos_data : pd.DataFrame
        Data for the variables to be used out of sample (ys can be NaNs)

    Returns
    ----------
    - Plot of the forecast
    """
    figsize = kwargs.get('figsize', (10, 7))

    if self.parameters.estimated is False:
        raise Exception("No parameters estimated!")
    else:
        # Sort/manipulate the out-of-sample data
        _, X_oos = dmatrices(self.formula, oos_data)
        X_oos = np.array([X_oos])[0]
        full_X = self.X.copy()
        full_X = np.append(full_X, X_oos, axis=0)
        Z = full_X

        # Retrieve data, dates and (transformed) parameters
        a, P = self._forecast_model(self.parameters.get_parameter_values(), Z, h)
        smoothed_series = np.zeros(self.y.shape[0] + h)
        series_variance = np.zeros(self.y.shape[0] + h)
        for t in range(self.y.shape[0] + h):
            smoothed_series[t] = np.dot(Z[t], a[:, t])
            series_variance[t] = np.dot(
                np.dot(Z[t], P[:, :, t]), Z[t].T
            ) + self.parameters.parameter_list[0].prior.transform(
                self.parameters.get_parameter_values()[0])

        date_index = self.shift_dates(h)
        plot_values = smoothed_series[-h - past_values:]
        forecasted_values = smoothed_series[-h:]
        # 1.96 is the normal critical value for a 95% interval
        lower = forecasted_values - 1.96 * np.power(series_variance[-h:], 0.5)
        upper = forecasted_values + 1.96 * np.power(series_variance[-h:], 0.5)
        lower = np.append(plot_values[-h - 1], lower)
        upper = np.append(plot_values[-h - 1], upper)
        plot_index = date_index[-h - past_values:]

        plt.figure(figsize=figsize)
        if intervals:
            plt.fill_between(date_index[-h - 1:], lower, upper, alpha=0.2)
        plt.plot(plot_index, plot_values)
        plt.title("Forecast for " + self.y_name)
        plt.xlabel("Time")
        plt.ylabel(self.y_name)
        plt.show()
def __init__(self, data, formula, ar, ma, integ=0, family=fam.Normal()):

    # Initialize TSM object
    super(ARIMAX, self).__init__('ARIMAX')

    # Latent Variables
    self.ar = ar
    self.ma = ma
    self.integ = integ
    self.z_no = self.ar + self.ma + 2  # provisional; reset below once X is known
    self.max_lag = max(self.ar, self.ma)
    self._z_hide = 0  # Whether to cutoff latent variables from results table
    self.supported_methods = ["MLE", "PML", "Laplace", "M-H", "BBVI"]
    self.default_method = "MLE"
    self.multivariate_model = False

    # Format the data
    self.is_pandas = True  # This is compulsory for this model type
    self.data_original = data.copy()
    self.formula = formula
    self.y, self.X = dmatrices(formula, data)
    self.y_name = self.y.design_info.describe()
    self.X_names = self.X.design_info.describe().split(" + ")
    self.y = self.y.astype(float)
    self.X = self.X.astype(float)
    self.z_no = self.X.shape[1]
    self.data_name = self.y_name
    self.y = np.asarray(self.y).ravel()
    self.data = self.y.copy()
    self.X = np.asarray(self.X)
    self.index = data.index

    # Difference data
    for order in range(0, self.integ):
        self.y = np.diff(self.y)
        self.data = np.diff(self.data)
        self.data_name = "Differenced " + self.data_name
    self.data_length = self.data.shape[0]

    self.ar_matrix = self._ar_matrix()
    self._create_latent_variables()

    self.family = family
    self.model_name2, self.link, self.scale, self.shape, self.skewness, \
        self.mean_transform, self.cythonized = self.family.setup()
    self.model_name = self.model_name2 + " ARIMAX(" + str(self.ar) + "," + \
        str(self.integ) + "," + str(self.ma) + ")"

    # Build any remaining latent variables that are specific to the family chosen
    for no, i in enumerate(self.family.build_latent_variables()):
        self.latent_variables.add_z(i[0], i[1], i[2])
        self.latent_variables.z_list[no + self.ar + self.ma + self.X.shape[1]].start = i[3]

    self.family_z_no = len(self.family.build_latent_variables())
    self.z_no = len(self.latent_variables.z_list)

    # If the Normal family is selected, we use faster likelihood functions
    if isinstance(self.family, fam.Normal):
        self._model = self._normal_model
        self._mb_model = self._mb_normal_model
        self.neg_loglik = self.normal_neg_loglik
        self.mb_neg_loglik = self.normal_mb_neg_loglik
    else:
        self._model = self._non_normal_model
        self._mb_model = self._mb_non_normal_model
        self.neg_loglik = self.non_normal_neg_loglik
        self.mb_neg_loglik = self.non_normal_mb_neg_loglik
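# Hypothetical usage of the constructor above, assuming the pyflux-style API
# it appears to implement (toy DataFrame; column names are illustrative).
import numpy as np
import pandas as pd

toy = pd.DataFrame({"sales": np.random.randn(100).cumsum(),
                    "promo": np.random.randint(0, 2, 100)})
model = ARIMAX(data=toy, formula="sales ~ promo", ar=1, ma=1, integ=0)
# x = model.fit()   # e.g. MLE, the default_method set in __init__
# x.summary()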
    # ['scale(np.log(active_methods + 0.5))'],
    # ['scale(np.log(query_commits + 0.5))'],
]

params = []
pvalues = []
prsquared = []
factors = []
vifmax = 0.0
corrmax = 0.0
mdls = []
for add in models:
    factors = factors + add
    # Build the formula by joining the accumulated factors
    y, X = patsy.dmatrices('is_sparql ~ ' + ' + '.join(factors),
                           data=repositories, return_type='dataframe')
    res = sm.OLS(y, X).fit()
    mdls.append(res)
    print(res.summary())
    params.append(res.params)
    pvalues.append(res.pvalues)
    prsquared.append(res.rsquared)
    vif = pd.DataFrame()
    vif["VIF Factor"] = [variance_inflation_factor(X.values, i)
                         for i in range(X.shape[1])]
    vif["features"] = X.columns
    corr = X.corr()
test_df = pd.read_csv(test_path)
mean_age = np.mean(train_df.Age)
train_df.Age = train_df.Age.fillna(mean_age)
test_df.Age = test_df.Age.fillna(mean_age)
test_df["Survived"] = 0  # placeholder so dmatrices can build a test design matrix
test_passengers = test_df.PassengerId.values

formula_ml = "Survived ~ C(Pclass) + C(Sex) + Age + SibSp + Parch + C(Embarked)"
#formula_ml = "Survived ~ C(Sex)"
results = {}

train_y, train_x = dmatrices(formula_ml, data=train_df, return_type="dataframe")
test_y, test_x = dmatrices(formula_ml, data=test_df, return_type="dataframe")
#train_y = np.asarray(train_y).ravel()

# Logistic Regression
model = sm.Logit(train_y, train_x)
res = model.fit()
output = res.predict(test_x)
output = np.round(np.asarray(output).ravel()).astype(int)

with open("myLogit2.csv", "w", newline="") as output_file:
    writer = csv.writer(output_file)
    writer.writerow(["PassengerId", "Survived"])
    writer.writerows(zip(test_passengers, output))
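# Safer column alignment for the test design matrix: calling dmatrices
# separately on train and test only works if both frames contain the same
# factor levels. Reusing the training design_info guarantees identical columns.
from patsy import build_design_matrices

(test_x_aligned,) = build_design_matrices([train_x.design_info], test_df,
                                          return_type="dataframe")
# test_x_aligned now has exactly the training columns, even if test_df is
# missing a Pclass or Embarked level.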
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from patsy import dmatrices
import statsmodels.api as sm
from sessionData import session

matplotlib.rcParams['pdf.fonttype'] = 42

d = import_data()
df = create_df(d)
maxResp = d['maxResponseWaitFrames'][()]

y, X = dmatrices('resp ~ rewDir + mask', data=df, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
res.summary()

# One-hot encode the categorical variables (resp, rewDir)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# List of (name, transformer, column(s)) tuples; column lists keep the
# selected data 2-D, as OneHotEncoder expects.
ct = ColumnTransformer([('encode1', OneHotEncoder(), ['rewDir']),
                        ('encode2', OneHotEncoder(), ['resp'])],
                       remainder='passthrough')
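# Applying the transformer above: fit_transform returns the encoded dummy
# columns first, followed by the passthrough remainder of df's columns.
encoded = ct.fit_transform(df)
print(encoded.shape)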
    axis=1)

# Take differences between the following month and the current month
monthly_final_df['difflanefrac'] = monthly_final_df['nextlanefrac'].subtract(
    monthly_final_df['lanefrac'])
monthly_final_df['diffcount'] = monthly_final_df['nextcount'].subtract(
    monthly_final_df['count'])
monthly_final_df['difftotaltrips'] = monthly_final_df[
    'nexttotaltrips'].subtract(monthly_final_df['totaltrips'])
pickle.dump(
    monthly_final_df,
    open(
        "C:/Users/fhp7/Desktop/Cornell/CEE 4620/Final Project/Model/monthly_final_df.p",
        'wb'))

#%% Perform regression on difference data
print("Performing Regression")
# Remove all records that have no infrastructure change
regress_df = monthly_final_df.loc[(monthly_final_df.difflanefrac != 0) &
                                  (pd.notnull(monthly_final_df.difflanefrac))].copy()
# Note: np.log returns NaN for negative differences, so logdifflanefrac is
# only defined for months where the lane fraction increased.
regress_df['logdifflanefrac'] = regress_df['difflanefrac'].apply(np.log)

# Use patsy to generate the design matrix and target vector
y, X = dmatrices('diffcount ~ difflanefrac + difftotaltrips',
                 data=regress_df, return_type='dataframe')
# Fit the model using statsmodels and print the results
result = sm.OLS(y, X).fit()
print(result.summary())
########################################loading################################
# read data
traindf = pd.read_csv(train_file)

## clean data
df = clean_and_munge_data(traindf)

## Part 3: Creating a Random Forest Classifier with Cross Validation

########################################formula################################
formula_ml = 'Survived~Pclass+C(Title)+Sex+C(AgeCat)+Fare_Per_Person+Fare+Family_Size'

y_train, x_train = dmatrices(formula_ml, data=df, return_type='dataframe')
y_train = np.asarray(y_train).ravel()
print(y_train.shape, x_train.shape)

## select a train and test set
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train,
                                                    test_size=0.2,
                                                    random_state=seed)
# instantiate and fit our model
clf = RandomForestClassifier(n_estimators=500, criterion='entropy',
                             max_depth=5, min_samples_split=2,
def Problem_7(): df = pd.DataFrame(data = {'y': # Miles/gal [18.90, 17.00, 20.00, 18.25, 20.07, 11.20, 22.12, 21.47, 34.70, 30.40, 16.50, 36.50, 21.50, 19.70, 20.30, 17.80, 14.39, 14.89, 17.80, 16.41, 23.54, 21.47, 16.59, 31.90, 29.40, 13.27, 23.90, 19.73, 13.90, 13.27, 13.77, 16.50], 'X1': # Displacement (in^3) [350, 350, 250, 351, 225, 440, 231, 262, 89.7, 96.9, 350, 85.3, 171, 258, 140, 302, 500, 440, 350, 318, 231, 360, 400, 96.9, 140, 460, 133.6, 318, 351, 351, 360, 350], 'X2': # Weight (lbs) [3910, 3860, 3510, 3890, 3365, 4215, 3020, 3180, 1905, 2320, 3885, 2009, 2655, 3375, 2700, 3890, 5290, 5185, 3910, 3660, 3050, 4250, 3850, 2275, 2150, 5430, 2535, 4370, 4540, 4715, 4215, 3660]}, index = ['Apollo', 'Omega', 'Nova', 'Monarch', 'Duster', 'Jenson Conv.', 'Skyhawk', 'Monza', 'Scirocco', 'Corolla SR-5', 'Camaro', 'Datsun B210', 'Capri II', 'Pacer', 'Bobcat', 'Granada', 'Eldorado', 'Imperial', 'Nova LN', 'Valiant', 'Starfire', 'Cordoba', 'Trans Am', 'Corolla E-5', 'Astre', 'Mark IV', 'Celica GT', 'Charger SE', 'Cougar', 'Elite', 'Matador', 'Corvette']) ############### # Problem 7.a # ############### title_print('Problem 7.a') y, X = patsy.dmatrices('y ~ X1 + X2', df) model = sm.OLS(y, X) results = model.fit() results.model.data.design_info = X.design_info print('> y = {} + {} * x1 + {} * x2 + e <'.format( round(results.params[0], 3), round(results.params[1], 3), round(results.params[2], 3)).center(80, '-')) ############### # Problem 7.b # ############### title_print('Problem 7.b') aov_table = sm.stats.anova_lm(results, typ = 1) print('\n--- Analysis of Variance table ---\n{}'.format(aov_table)) print('\nRegression F: {}'.format(round(results.fvalue, 2))) print('Regression p: {}\n'.format(round(results.f_pvalue, 4))) print('> Based on P-values, X1 is significant, X2 is not <'.\ center(80, '-')) ############### # Problem 7.c # ############### title_print('Problem 7.c') print('> R-squared explains {}% of total variability <'.\ format(round(results.rsquared * 100, 2)).center(80, '-')) ############### # Problem 7.d # ############### title_print('Problem 7.d') conf_int = np.round(results.conf_int(), 5) print('> 95% Confidence Intervals <'.center(80, '-')) print('> Intercept: {} <'.format(conf_int[0]).center(80, '-')) print('> B1: {} <'.format(conf_int[1]).center(80, '-')) print('> B2: {} <'.format(conf_int[2]).center(80, '-')) print('> 95% confident respective slopes are between these values <'.\ center(80, '-')) ############### # Problem 7.e # ############### title_print('Problem 7.e') intervals = np.round(results.get_prediction([1, 275, 3000]).\ summary_frame(alpha = 0.05), 4) print('> 95% Confidence Interval <'.center(80, '-')) print('> {} to {} <'.format(intervals['mean_ci_lower'].values, intervals['mean_ci_upper'].values).\ center(80, '-')) print('> 95% confident interval contains true mean <'.center(80, '-')) ############### # Problem 7.f # ############### title_print('Problem 7.f') print('> 95% Prediction Interval <'.center(80, '-')) print('> {} to {} <'.format(intervals['obs_ci_lower'].values, intervals['obs_ci_upper'].values).\ center(80, '-')) print('> 95% confident interval contains prediction <'.center(80, '-')) ############### # Problem 7.g # ############### title_print('Problem 7.g') print('> Prediction interval is wider <'.center(80, '-')) print('> More uncertainty when making single/specific prediction <'.\ center(80, '-')) ################# # Problem 7.h.1 # ################# title_print('Problem 7.h.1') residuals = results.resid prob = [(i - 1/2) / len(y) for i in 
range(1, len(y) + 1)]  # plotting positions (i - 0.5)/n

    # Fit a straight line through the plotting positions for visual reference
    resid_results = sm.OLS(prob, sm.add_constant(sorted(residuals))).fit()
    X_range = np.linspace(min(residuals), max(residuals), len(residuals))

    # Normal Probability Plot + straight line
    fig, ax = plt.subplots()
    ax.scatter(sorted(residuals), prob)
    ax.plot(X_range, resid_results.params[0] + resid_results.params[1] * X_range)
    ax.set_xlabel('Residual')
    ax.set_ylabel('Probability')
    ax.set_ylim(0, 1)
    plt.title('Normal Probability Plot')
    plt.show()
    print('> Does not appear to be problem with normality <'.center(80, '-'))

    #################
    # Problem 7.h.2 #
    #################
    title_print('Problem 7.h.2')
    fig, ax = plt.subplots()
    ax.scatter(results.fittedvalues, residuals)
    ax.axhline(0)
    ax.set_xlabel('Fitted Values')
    ax.set_ylabel('Residuals')
    plt.title('Residuals Versus Predicted Response')
    plt.show()
    print('> Definite non-linear pattern. Either slight downward trend <'.center(80, '-'))
    print('> if you disregard 5 points in upper right, OR somewhat <'.center(80, '-'))
    print('> quadratic if disregard 3 points in lower right <'.center(80, '-'))

    #################
    # Problem 7.h.3 #
    #################
    title_print('Problem 7.h.3')
    fig, ax = plt.subplots()
    ax2 = ax.twiny()
    scat_1 = ax.plot(df['X1'], residuals, marker='*', linestyle='',
                     color='orange', label='X1')
    scat_2 = ax2.plot(df['X2'], residuals, marker='o', linestyle='',
                      color='black', label='X2')
    ax.axhline(0)
    ax.set_xlabel('X_1')
    ax2.set_xlabel('X_2')
    ax.set_ylabel('Residuals')
    plots = scat_1 + scat_2
    labels = [label.get_label() for label in plots]
    ax.legend(plots, labels, loc='lower right')
    plt.title('Residuals Versus X_i')
    plt.show()
    print('> One y value plotted for each X-value <'.center(80, '-'))
    print('> Non-linear pattern trends to upper right <'.center(80, '-'))

    return df, results
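# Cross-check of the hand-rolled normal probability plot in Problem 7.h.1
# using scipy's built-in probplot (sketch; reuses the results object that
# Problem_7() returns).
import matplotlib.pyplot as plt
from scipy import stats

df, results = Problem_7()
fig, ax = plt.subplots()
stats.probplot(results.resid, dist="norm", plot=ax)  # QQ-style normal plot
ax.set_title("Normal Probability Plot (scipy.stats.probplot)")
plt.show()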
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from datetime import datetime
import quandl

df = pd.read_csv(r"data_third.csv")
del df['Split Ratio']
df['Date'] = pd.to_datetime(df['Date'])
df['Date'] = df['Date'] - df['Date'][0]  # elapsed time since the first row
df['Date'] = df['Date'].dt.days
dfn = (df - df.mean()) / (df.max() - df.min())  # normalization

# pt is assumed to be patsy (import patsy as pt, earlier in the file)
pt_y, pt_x = pt.dmatrices("Close ~ Open", dfn)
res = np.linalg.lstsq(pt_x, pt_y, rcond=None)  # least-squares fit of Close on Open
b0 = res[0].ravel()
print("Close ~ Open ", b0)

"""
ax = plt.subplot()
scatter_matrix(df, alpha=0.05, figsize=(10, 10), marker='x')
ax.plot(dfn['Close'], dfn['Open'], 'go')
axis_x = np.linspace(-1, 1, 100)
f = b0[0] + b0[1] * axis_x
ax.plot(axis_x, f, color='red')
"""

ax = plt.subplot()
plt.scatter(dfn['Date'], dfn['Close'], c='blue', s=20, label='blue',
def plot_predict(self, h=5, past_values=20, intervals=True, oos_data=None, **kwargs): """ Plots forecasts with the estimated model Parameters ---------- h : int (default : 5) How many steps ahead would you like to forecast? past_values : int (default : 20) How many past observations to show on the forecast plot? intervals : Boolean Would you like to show prediction intervals for the forecast? oos_data : pd.DataFrame Data for the variables to be used out of sample (ys can be NaNs) Returns ---------- - Plot of the forecast """ import matplotlib.pyplot as plt import seaborn as sns figsize = kwargs.get('figsize',(10,7)) if self.latent_variables.estimated is False: raise Exception("No latent variables estimated!") else: _, X_oos = dmatrices(self.formula, oos_data) X_oos = np.array([X_oos])[0] X_pred = X_oos[:h] # Retrieve data, dates and (transformed) latent variables mu, Y = self._model(self.latent_variables.get_z_values()) date_index = self.shift_dates(h) if self.latent_variables.estimation_method in ['M-H']: sim_vector = self._sim_prediction_bayes(h, X_pred, 15000) error_bars = [] for pre in range(5,100,5): error_bars.append(np.insert([np.percentile(i,pre) for i in sim_vector], 0, Y[-1])) forecasted_values = np.insert([np.mean(i) for i in sim_vector], 0, Y[-1]) plot_values = np.append(Y[-1-past_values:-2], forecasted_values) plot_index = date_index[-h-past_values:] else: t_z = self.transform_z() mean_values = self._mean_prediction(mu, Y, h, t_z, X_pred) if self.model_name2 == "Skewt": model_scale, model_shape, model_skewness = self._get_scale_and_shape(t_z) m1 = (np.sqrt(model_shape)*sp.gamma((model_shape-1.0)/2.0))/(np.sqrt(np.pi)*sp.gamma(model_shape/2.0)) forecasted_values = mean_values[-h:] + (model_skewness - (1.0/model_skewness))*model_scale*m1 else: forecasted_values = mean_values[-h:] if intervals is True: sim_values = self._sim_prediction(mu, Y, h, t_z, X_pred, 15000) else: sim_values = self._sim_prediction(mu, Y, h, t_z, X_pred, 2) error_bars, forecasted_values, plot_values, plot_index = self._summarize_simulations(mean_values, sim_values, date_index, h, past_values) plt.figure(figsize=figsize) if intervals == True: alpha =[0.15*i/float(100) for i in range(50,12,-2)] for count, pre in enumerate(error_bars): plt.fill_between(date_index[-h-1:], error_bars[count], error_bars[-count-1],alpha=alpha[count]) plt.plot(plot_index,plot_values) plt.title("Forecast for " + self.data_name) plt.xlabel("Time") plt.ylabel(self.data_name) plt.show()
def _epoch_spans(recspan_intern_table, data_set, rerp_specs, eval_env):
    rerp_infos = []
    rerp_names = set()
    spans = []
    design_offset = 0
    expanded_design_offset = 0
    data_format = data_set.data_format
    for rerp_idx, rerp_spec in enumerate(rerp_specs):
        start_offset = data_format.ms_to_samples(rerp_spec.start_time)
        # Offsets are half open: [start, stop)
        # But, it's more intuitive for times to be closed: [start, stop]
        # So we interpret the user times as a closed interval, and add 1
        # sample when converting to offsets.
        stop_offset = 1 + data_format.ms_to_samples(rerp_spec.stop_time)
        if start_offset >= stop_offset:
            raise ValueError("Epochs must be >1 sample long!")
        event_set = data_set.events.find(rerp_spec.event_query)
        # Tricky bit: the spec gives a RHS-only formula, but really we have
        # an implicit LHS (determined by the event_query). This makes things
        # complicated when it comes to e.g. keeping track of which items
        # survived NA removal, determining the number of rows in an
        # intercept-only formula, etc. Really we want patsy to just treat all
        # this stuff the same way as it normally handles a LHS~RHS
        # formula. So, we convert our RHS formula into a LHS~RHS formula,
        # using a special LHS that represents each event by a placeholder
        # integer!
        desc = ModelDesc.from_formula(rerp_spec.formula, eval_env)
        if desc.lhs_termlist:
            raise ValueError("Formula cannot have a left-hand side")
        desc.lhs_termlist = [Term([_ArangeFactor(len(event_set))])]
        fake_lhs, design = dmatrices(desc, event_set)
        surviving_event_idxes = np.asarray(fake_lhs, dtype=int).ravel()
        # Integer dtype so the values can be used directly as row indices
        design_row_idxes = np.empty(len(event_set), dtype=int)
        design_row_idxes.fill(-1)
        design_row_idxes[surviving_event_idxes] = np.arange(design.shape[0])
        # Now design_row_idxes[i] is -1 if event i was thrown out, and
        # otherwise gives the row in 'design' which refers to event 'i'.
        for i in range(len(event_set)):
            event = event_set[i]
            # -1 for non-existent
            design_row_idx = design_row_idxes[i]
            recspan = (event.recording, event.span_id)
            recspan_intern = recspan_intern_table[recspan]
            epoch_start = start_offset + event.start_idx
            epoch_stop = stop_offset + event.start_idx
            if design_row_idx == -1:
                design_row = None
            else:
                design_row = design[design_row_idx, :]
            epoch = _Epoch(epoch_start, epoch_stop - epoch_start,
                           design_row, design_offset,
                           expanded_design_offset, rerp_idx, [])
            if design_row is None:
                # Event thrown out due to missing predictors; this
                # makes its whole epoch into an artifact -- but if overlap
                # correction is disabled, then this artifact only affects
                # this epoch, not anything else. (We still want to treat
                # it as an artifact though so we get proper accounting at
                # the end.)
                epoch.intrinsic_artifacts.append("_MISSING_PREDICTOR")
            spans.append(
                DataSpan((recspan_intern, epoch_start),
                         (recspan_intern, epoch_stop),
                         epoch, None))
        if rerp_spec.name in rerp_names:
            raise ValueError("name %r used for two different sub-analyses"
                             % (rerp_spec.name,))
        rerp_names.add(rerp_spec.name)
        rerp_info = {
            "spec": rerp_spec,
            "design_info": design.design_info,
            "start_offset": start_offset,
            "stop_offset": stop_offset,
            "design_offset": design_offset,
            "expanded_design_offset": expanded_design_offset,
            "total_epochs": len(event_set),
            "epochs_with_data": 0,
            "epochs_with_artifacts": 0,
        }
        rerp_infos.append(rerp_info)
        design_offset += design.shape[1]
        epoch_samples = stop_offset - start_offset
        expanded_design_offset += epoch_samples * design.shape[1]
    return rerp_infos, spans, design_offset, expanded_design_offset
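# Standalone demo of the placeholder-LHS trick described above. The real code
# uses a custom _ArangeFactor; the same idea works with an ordinary integer
# column: patsy drops NA rows from both sides of the formula, so the surviving
# left-hand-side values identify exactly which rows made it into the design.
import numpy as np
import pandas as pd
from patsy import dmatrices

events = pd.DataFrame({"cond": ["a", "b", None, "a"]})
events["_row"] = np.arange(len(events))
fake_lhs, design = dmatrices("_row ~ cond", events)  # row 2 is dropped (NA)
surviving = np.asarray(fake_lhs, dtype=int).ravel()
print(surviving)  # -> [0 1 3]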
def predict(self, h=5, oos_data=None, intervals=False): """ Makes forecast with the estimated model Parameters ---------- h : int (default : 5) How many steps ahead would you like to forecast? oos_data : pd.DataFrame Data for the variables to be used out of sample (ys can be NaNs) intervals : boolean (default: False) Whether to return prediction intervals Returns ---------- - pd.DataFrame with predicted values """ if self.latent_variables.estimated is False: raise Exception("No latent variables estimated!") else: _, X_oos = dmatrices(self.formula, oos_data) X_oos = np.array([X_oos])[0] X_pred = X_oos[:h] # Retrieve data, dates and (transformed) latent variables mu, Y = self._model(self.latent_variables.get_z_values()) date_index = self.shift_dates(h) if self.latent_variables.estimation_method in ['M-H']: sim_vector = self._sim_prediction_bayes(h, X_pred, 15000) forecasted_values = np.array([np.mean(i) for i in sim_vector]) prediction_01 = np.array([np.percentile(i, 1) for i in sim_vector]) prediction_05 = np.array([np.percentile(i, 5) for i in sim_vector]) prediction_95 = np.array([np.percentile(i, 95) for i in sim_vector]) prediction_99 = np.array([np.percentile(i, 99) for i in sim_vector]) else: t_z = self.transform_z() mean_values = self._mean_prediction(mu, Y, h, t_z, X_pred) if self.model_name2 == "Skewt": model_scale, model_shape, model_skewness = self._get_scale_and_shape(t_z) m1 = (np.sqrt(model_shape)*sp.gamma((model_shape-1.0)/2.0))/(np.sqrt(np.pi)*sp.gamma(model_shape/2.0)) forecasted_values = mean_values[-h:] + (model_skewness - (1.0/model_skewness))*model_scale*m1 else: forecasted_values = mean_values[-h:] if intervals is True: sim_values = self._sim_prediction(mu, Y, h, t_z, X_pred, 15000) else: sim_values = self._sim_prediction(mu, Y, h, t_z, X_pred, 2) if intervals is False: result = pd.DataFrame(forecasted_values) result.rename(columns={0:self.data_name}, inplace=True) else: # Get mean prediction and simulations (for errors) if self.latent_variables.estimation_method not in ['M-H']: sim_values = self._sim_prediction(mu, Y, h, t_z, X_pred, 15000) prediction_01 = np.array([np.percentile(i, 1) for i in sim_values]) prediction_05 = np.array([np.percentile(i, 5) for i in sim_values]) prediction_95 = np.array([np.percentile(i, 95) for i in sim_values]) prediction_99 = np.array([np.percentile(i, 99) for i in sim_values]) result = pd.DataFrame([forecasted_values, prediction_01, prediction_05, prediction_95, prediction_99]).T result.rename(columns={0:self.data_name, 1: "1% Prediction Interval", 2: "5% Prediction Interval", 3: "95% Prediction Interval", 4: "99% Prediction Interval"}, inplace=True) result.index = date_index[-h:] return result
import seaborn as sb
sb.pairplot(df[['a', 'b', 'c']])  # pairwise plots to inspect relationships among columns a, b, c

import statsmodels.api as sm
# Basic fitting workflow
df['intercept'] = 1  # add an intercept column
lm = sm.OLS(df['y'], df[['intercept', 'x']])  # set response and predictors (ordinary least squares)
results = lm.fit()  # fit the model and store the results
results.summary()  # inspect the summary

# Dummy-variable conversion
df[['A', 'B', 'C']] = pd.get_dummies(df['a'])  # expand column a into new dummy columns A, B, C
lm = sm.Logit(df['y'], df[['intercept', 'x']])  # logistic regression instead of least squares
results = lm.fit()  # fit the model and store the results
results.summary2()  # inspect the summary

# Computing VIF values
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
y, x = dmatrices('price ~ area + bedrooms + bathrooms', df, return_type='dataframe')
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif['features'] = x.columns

import sklearn.preprocessing as p
p.scale(df['a'])  # standardize column a (subtract the mean, divide by the standard deviation)
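# Note on the dummy-variable block above: keeping all three dummy columns plus
# an intercept makes the design collinear (the dummy-variable trap). Passing
# drop_first=True removes the reference level (sketch with toy data):
import pandas as pd

toy = pd.DataFrame({'a': ['x', 'y', 'z', 'x']})
dummies = pd.get_dummies(toy['a'], drop_first=True)  # reference level 'x' dropped
print(dummies.columns.tolist())  # ['y', 'z']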
for e in dy.Exon.unique():
    ident.append(exoncode[e])
    exog_vc.append((dy.Exon == e).astype(int))
for p in dy.Person.unique():
    ident.append(4)
    exog_vc.append((dy.Person == p).astype(int))
for s in dy.Sample.unique():
    ident.append(5)
    exog_vc.append((dy.Sample == s).astype(int))

exog_vc = np.vstack(exog_vc).T
ident = np.asarray(ident)

endog, exog = patsy.dmatrices(fml, data=dy, return_type='dataframe')

vcp_names = [
    "Gene(Mat)", "Gene(Pat)", "Exon(Mat)", "Exon(Pat)", "Person", "Sample"
]
model = BinomialBayesMixedGLM(
    endog, exog, exog_vc, ident, vcp_p=3, fe_p=3, vcp_names=vcp_names)

if kc != 3:
    model2 = BinomialBayesMixedGLM.from_formula(
def get_design_matrices(df, dependent_variable, independent_variables, interactions=[]): patsy_model = create_patsy_model(dependent_variable, independent_variables, interactions=interactions) y, X = dmatrices(patsy_model, df, return_type='dataframe') return (y, X)
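# For reference, the patsy call this helper delegates to, with a toy formula
# of the kind create_patsy_model is assumed to build (one interaction term):
import pandas as pd
from patsy import dmatrices

toy = pd.DataFrame({"price": [1.0, 2.0, 3.0, 4.0],
                    "area": [50, 80, 120, 90],
                    "zipcode": ["A", "B", "A", "B"]})
y, X = dmatrices("price ~ area + C(zipcode) + area:C(zipcode)", toy,
                 return_type="dataframe")
print(X.columns.tolist())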