def __init__(self, formula=None, data=None, **kwargs):
    # Convert all variables raised to a power to float64. This prevents
    # mis-specification of probabilities in cases of variable overflow
    # (if the original var was compressed to a smaller bit integer/float).
    if isinstance(data, pd.DataFrame):
        power_vars = list(set(re.findall(r'(?<=power\().+?(?=,)', formula)))
        for var in power_vars:
            data[var] = data[var].astype('float64')
    if formula:
        y, X = patsy.dmatrices(formula, data, 1)
        self._y_design_info = y.design_info
        self._X_design_info = X.design_info
        self._model = GLM(y, X, family=Binomial(), **kwargs)
        self._fit = self._model.fit()
        self._betas = self._fit.params
        self._link = logit
    else:
        self._y_design_info = None
        self._X_design_info = None
        self._model = None
        self._fit = None
        self._betas = None
        self._link = logit
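# Usage sketch (added): a minimal, self-contained example of the
# patsy + statsmodels pattern the constructor above relies on. The names
# ('df', 'y', 'x1') are illustrative, not from the original source.
import numpy as np
import pandas as pd
import patsy
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Binomial

rng = np.random.default_rng(0)
df = pd.DataFrame({'y': rng.integers(0, 2, 100),
                   'x1': rng.normal(size=100)})
y, X = patsy.dmatrices('y ~ x1', df)
fit = GLM(y, X, family=Binomial()).fit()
print(fit.params)  # intercept and slope on the logit scale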
def test_binomial(self):
    model = BinomialRegressor()
    y = Binomial().fitted(self.eta)
    model.fit(self.X, y)
    y_hat = model.predict(self.X)
    diff = y_hat - y
    rsq = 1 - np.mean(diff**2) / np.mean((y - np.mean(y))**2)
    assert_true(rsq > .9)
def irls_incremental(filename, formula, chunksize,
                     family=Binomial(), link=Logit(),
                     maxit=25, tol=1e-08, rNames=None, headerNames=None):
    chunck_generator2._filename = filename
    chunck_generator2._nRows = chunksize
    x = None
    # Build incremental design-matrix builders from the formula, scanning
    # the chunks once so categorical levels are known up front.
    y0, dta = incr_dbuilders(formula, chunck_generator2)
    nCols = len(dta.column_names)
    if rNames is None:
        rNames = dta.column_names
    for j in range(1, maxit + 1):
        if x is None:
            x = np.zeros(nCols)
        ATWA = np.zeros((nCols, nCols))
        ATWz = np.zeros(nCols)
        for data_chunk in chunck_generator2():
            yb, rowA = dmatrices((y0, dta), data_chunk,
                                 NA_action="drop", return_type="dataframe")
            yb = yb.values.ravel()
            A = np.asarray(rowA[rNames], dtype=np.float32)
            eta = np.matmul(A, x).reshape(-1)
            g = link.inverse(eta)
            gprime = link.inverse_deriv(eta)
            # Working response and weights for this IRLS step
            z = np.array(eta + (yb - g) / gprime)
            varg = family.variance(g)
            W = (gprime**2 / varg).reshape(len(gprime), 1)
            # Accumulate A'Wz and A'WA across chunks
            ATWz = ATWz + np.matmul(np.transpose(A),
                                    np.asarray(W.reshape(-1) * z))
            ATWA = ATWA + np.matmul(np.transpose(A), np.asarray(W * A))
        xold = x
        C, rank, piv = cholesky_pivot(ATWA, full_pivot=True)
        if rank < C.shape[1]:
            raise LinAlgError("Rank-deficiency detected.")
        # Solve (A'WA) x = A'Wz with the pivoted Cholesky factor, then undo
        # the pivot with the inverse permutation.
        x = solve_triangular(np.transpose(C), ATWz[piv - 1], lower=True)
        x = solve_triangular(C, x, lower=False)[np.argsort(piv)]
        if np.sqrt(np.dot(x - xold, x - xold)) < tol:
            break
    if headerNames is not None:
        x = pd.DataFrame(x, index=headerNames)
    else:
        x = pd.DataFrame(x, index=list(rowA))
    return (x, j)
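# For reference (added): a dense, single-machine version of the IRLS update
# that irls_incremental accumulates chunk-by-chunk as A'WA and A'Wz. All
# names here are illustrative; this is a sketch of the algorithm, not part
# of the original source.
import numpy as np

def irls_step(A, y, x):
    eta = A @ x
    g = 1.0 / (1.0 + np.exp(-eta))   # inverse logit link
    gprime = g * (1.0 - g)           # d(mu)/d(eta)
    z = eta + (y - g) / gprime       # working response
    W = gprime**2 / (g * (1.0 - g))  # working weights (equals gprime for logit)
    ATWA = A.T @ (W[:, None] * A)
    ATWz = A.T @ (W * z)
    return np.linalg.solve(ATWA, ATWz)

rng = np.random.default_rng(0)
A = np.column_stack([np.ones(50), rng.normal(size=50)])
y = (rng.random(50) < 0.5).astype(float)
x = np.zeros(2)
for _ in range(25):
    x = irls_step(A, y, x)
print(x)  # converges to the logistic-regression MLE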
def _train(elements, model_cfg):
    """Construct one model per building block type."""
    models = {}
    target = model_cfg.target
    for model in model_cfg.sections:
        # Construct the model formula from configuration
        terms = " + ".join(["1"] + [f"C({f})" for f in model.factors])
        # Train a Binomial GLM on the rows matching this block label,
        # estimating the dispersion via Pearson's chi-squared (scale="X2")
        models[model.label] = GLM.from_formula(
            f"{target} ~ {terms}",
            family=Binomial(),
            data=filter_data(elements, {model_cfg.label_column: model.label}),
        ).fit(scale="X2")
    return models
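# Illustration (added, not from the original source) of the formula string
# `_train` assembles: with target 'failed' and factors ['material', 'vendor'],
# the fitted model is 'failed ~ 1 + C(material) + C(vendor)'.
terms_demo = " + ".join(["1"] + [f"C({f})" for f in ["material", "vendor"]])
assert f"failed ~ {terms_demo}" == "failed ~ 1 + C(material) + C(vendor)"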
def compute_chi2_null_test(model_results, data, dep_var, max_iter, l2_weight):
    """
    Compare the fitted model against the intercept-only null model with a
    likelihood-ratio test: LLR = -2*(LL(null) - LL(model)) ~ chi2(df_model).
    """
    null_formula = '%s ~ 1' % (dep_var)
    null_model = GLM.from_formula(null_formula, data,
                                  family=Binomial(link=logit()))
    null_model_results = null_model.fit_regularized(maxiter=max_iter,
                                                    method='elastic_net',
                                                    alpha=l2_weight,
                                                    L1_wt=0.0)
    model_loglike = model_results.model.loglike(model_results.params)
    null_model_loglike = null_model_results.model.loglike(
        null_model_results.params)
    llr = -2 * (null_model_loglike - model_loglike)
    model_df = model_results.model.df_model
    p_val = chi2.sf(llr, model_df)
    return llr, model_df, p_val
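# Worked example (added; numbers are illustrative) of the likelihood-ratio
# statistic computed above: a full model with log-likelihood -120.3, a null
# model with -130.9, and 4 model degrees of freedom.
from scipy.stats import chi2
llr_demo = -2 * (-130.9 - (-120.3))  # = 21.2
p_demo = chi2.sf(llr_demo, 4)        # ~3e-4, i.e. the covariates add signal
print(llr_demo, p_demo)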
def setUpClass(cls):
    cls.forwarding_family = BinomialWrapper()
    cls.family = Binomial()
def __init__(self, link=L.logit):  # , n=1.):
    # TODO: it *should* work for a constant n>1 actually, if data_weights
    # is equal to n
    self.family = Binomial(link=link)
class BinomialWrapper(FamilyWrapper):
    r"""
    The wrapper of the Binomial exponential family distribution,
    with a function for per-sample probability.

    Parameters
    ----------
    link : a link instance, optional
        The default link for the Binomial family is the logit link.
        Available links are logit, probit, cauchy, log, and cloglog.
        See statsmodels.family.links for more information.

    Attributes
    ----------
    family : a statsmodels Binomial family object
    """

    def __init__(self, link=L.logit):  # , n=1.):
        # TODO: it *should* work for a constant n>1 actually, if data_weights
        # is equal to n
        self.family = Binomial(link=link)

    def loglike_per_sample(self, endog, mu, scale=1.):
        r"""
        Calculate the log-likelihood per sample in terms of the fitted
        mean response.

        Parameters
        ----------
        endog : array-like of shape (n, k) or (n, )
            Endogenous response variable
        mu : array-like of shape (n, )
            Fitted mean response variable
        scale : float, optional
            Not used for the Binomial GLM.

        Returns
        -------
        log_p : array-like of shape (n, )
            The value of the log-likelihood function evaluated per sample
            (endog, mu, freq_weights, scale) as defined below.

        Notes
        -----
        If the endogenous variable is binary:

        .. math::

           \log p_i = y_i \log(\mu_i/(1-\mu_i)) + \log(1-\mu_i)

        If the endogenous variable is binomial:

        .. math::

           \log p_i = \ln\Gamma(n_i+1) - \ln\Gamma(y_i+1)
                      - \ln\Gamma(n_i - y_i + 1)
                      + y_i \log(\mu_i/(1-\mu_i)) + n_i \log(1-\mu_i)

        where :math:`y_i = Y_i * n_i` with :math:`Y_i` and :math:`n_i` as
        defined in Binomial initialize. This simply makes :math:`y_i` the
        original number of successes.
        """
        # Special setup: see _setup_binomial() in generalized_linear_model.py.
        # initialize() rescales a two-column (successes, failures) response
        # to proportions and records n.
        tmp = self.family.initialize(endog, 1)
        endog = tmp[0]
        if np.shape(self.family.n) == () and self.family.n == 1:
            # Bernoulli case; the 1e-200 guards against log(0)
            return scale * (endog * np.log(old_div(mu, (1 - mu)) + 1e-200) +
                            np.log(1 - mu)).reshape(-1)
        else:
            y = endog * self.family.n  # convert back to successes
            return scale * (special.gammaln(self.family.n + 1) -
                            special.gammaln(y + 1) -
                            special.gammaln(self.family.n - y + 1) +
                            y * np.log(old_div(mu, (1 - mu))) +
                            self.family.n * np.log(1 - mu)).reshape(-1)
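# Quick check (added) of the binary branch's formula above: for y in {0, 1},
#     y*log(mu/(1-mu)) + log(1-mu) == y*log(mu) + (1-y)*log(1-mu),
# i.e. the familiar Bernoulli log-likelihood.
import numpy as np
y_demo, mu_demo = 1.0, 0.7
lhs = y_demo * np.log(mu_demo / (1 - mu_demo)) + np.log(1 - mu_demo)
rhs = y_demo * np.log(mu_demo) + (1 - y_demo) * np.log(1 - mu_demo)
assert np.isclose(lhs, rhs)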
def update_peaks_fit_regression(data, NE_date_ranges, NE_var, data_name_var,
                                round_date_var, peak_date_var,
                                peak_date_buffer, scalar_vars, formula,
                                max_iter, l2_weight, regression_type):
    """
    Randomly update peak times and fit regression.
    """
    NE_peak_dates_i = NE_date_ranges.apply(lambda x: np.random.choice(x, 1)[0])
    NE_peak_dates_i_df = NE_peak_dates_i.reset_index().rename(
        columns={0: peak_date_var})
    # data_peak_dates_i = data_date_ranges.apply(lambda x: np.random.choice(x, 1)[0]).reset_index().rename(columns={0 : peak_date_var})
    data_i = pd.merge(data, NE_peak_dates_i_df, on=[NE_var, data_name_var],
                      how='inner')
    # reassign peaks
    data_i = data_i.assign(**{
        'pre_peak': (data_i.loc[:, round_date_var] <=
                     data_i.loc[:, peak_date_var] -
                     peak_date_buffer).astype(int),
        'post_peak': (data_i.loc[:, round_date_var] >=
                      data_i.loc[:, peak_date_var] +
                      peak_date_buffer).astype(int),
        'during_peak': ((data_i.loc[:, round_date_var] >
                         data_i.loc[:, peak_date_var] - peak_date_buffer) &
                        (data_i.loc[:, round_date_var] <
                         data_i.loc[:, peak_date_var] +
                         peak_date_buffer)).astype(int),
    })
    # add days since post-peak
    data_i = data_i.assign(**{
        'since_peak': data_i.loc[:, 'post_peak'] *
                      (data_i.loc[:, round_date_var] -
                       data_i.loc[:, peak_date_var])
    })
    # Z-norm all scalar vars (flatten back to 1-D for column assignment)
    scaler = StandardScaler()
    for v in scalar_vars:
        data_i = data_i.assign(**{
            v: scaler.fit_transform(
                data_i.loc[:, v].values.reshape(-1, 1))[:, 0]
        })
    model_full = GLM.from_formula(formula, data_i,
                                  family=Binomial(link=logit()))
    logging.debug('%d/%d/%d pre/during/post data' %
                  (data_i.loc[:, 'pre_peak'].sum(),
                   data_i.loc[:, 'during_peak'].sum(),
                   data_i.loc[:, 'post_peak'].sum()))
    if (regression_type == 'regularized_logit'):
        model_res_full = model_full.fit_regularized(maxiter=max_iter,
                                                    method='elastic_net',
                                                    alpha=l2_weight,
                                                    L1_wt=0.0)
        model_res_full_err = compute_err_data(model_res_full)
        err = model_res_full_err.loc[:, 'SE']
    else:
        model_res_full = model_full.fit()
        err = model_res_full.bse
    params = model_res_full.params
    return params, err, NE_peak_dates_i
def test_weights(data, dep_var, cat_vars, scalar_vars, l2_weights):
    indep_formula = ' + '.join(
        ['C(%s)' % (cat_var) for cat_var in cat_vars] + scalar_vars)
    formula = '%s ~ %s' % (dep_var, indep_formula)
    max_iter = 100  # iteration cap for the regularized fits
    # convert raw data to exogenous data;
    # need to do this to force train/test to have the same features
    data_rand = data.sample(frac=1.).reset_index(drop=True)  # shuffle rows
    model_dummy = GLM.from_formula(formula, data_rand,
                                   family=Binomial(link=logit()))
    exog = model_dummy.exog
    exog_names = model_dummy.exog_names
    endog = model_dummy.endog
    # generate cross-validation folds
    cross_val_folds = 10
    N = data_rand.shape[0]
    cross_val_chunk_size = float(N) / cross_val_folds
    cross_val_fold_train_idx = [
        list(range(int(floor(i * cross_val_chunk_size)),
                   int(ceil((i + 1) * cross_val_chunk_size))))
        for i in range(cross_val_folds)
    ]
    cross_val_fold_test_idx = [
        list(range(0, int(ceil(i * cross_val_chunk_size)))) +
        list(range(int(floor((i + 1) * cross_val_chunk_size)), N))
        for i in range(cross_val_folds)
    ]
    weight_likelihoods = []
    for l2_weight in l2_weights:
        print('testing weight = %.3f' % (l2_weight))
        likelihoods_l2 = []
        for i, (train_idx_i, test_idx_i) in enumerate(
                zip(cross_val_fold_train_idx, cross_val_fold_test_idx)):
            print('fold %d' % (i))
            train_XY = data_rand.iloc[train_idx_i, :]
            test_X = exog[test_idx_i, :]
            test_Y = endog[test_idx_i]
            # fit model
            model_i = GLM.from_formula(formula, train_XY,
                                       family=Binomial(link=logit()))
            model_res_i = model_i.fit_regularized(maxiter=max_iter,
                                                  method='elastic_net',
                                                  alpha=l2_weight,
                                                  L1_wt=0.)
            # add 0 params for missing coefficients to match X shape
            model_res_i.params = model_res_i.params.loc[exog_names].fillna(
                0, inplace=False)
            # score test data
            likelihood_i = compute_log_likelihood(model_res_i.params,
                                                  test_Y, test_X)
            likelihoods_l2.append(likelihood_i)
        weight_likelihoods.append(likelihoods_l2)
    weight_likelihoods = pd.DataFrame(np.array(weight_likelihoods),
                                      index=l2_weights)
    # average over folds (columns) to get one likelihood per weight
    mean_weight_likelihoods = weight_likelihoods.mean(axis=1)
    return mean_weight_likelihoods
def run_regression(data, formula, regression_type, dep_var='anchor',
                   out_dir='../../output', split_var=None, split_var_val=0):
    """
    Run logit regression on data with the given formula and write to file.
    Option: use regularized logit (reduces variance inflation).

    :param data: full data
    :param formula: regression formula
    :param regression_type: type of regression (logit|regularized_logit)
    :param dep_var: dependent variable
    :param out_dir: output directory
    :param split_var: optional variable to split data (e.g. only organization accounts)
    :param split_var_val: value of split variable (if included)
    """
    l2_weight = 0.01
    max_iter = 100
    model_full = GLM.from_formula(formula, data,
                                  family=Binomial(link=logit()))
    if (regression_type == 'regularized_logit'):
        model_res_full = model_full.fit_regularized(maxiter=max_iter,
                                                    method='elastic_net',
                                                    alpha=l2_weight,
                                                    L1_wt=0.0)
    else:
        model_res_full = model_full.fit()
    ## summary stats
    model_res_full_err = compute_err_data(model_res_full)
    # write to file
    reg_out_str = 'anchor_%s_output_%s.tsv' % (regression_type,
                                               formula.replace(' ', ''))
    if (split_var is not None):
        reg_out_str = 'anchor_%s_output_%s_split_%s=%s.tsv' % (
            regression_type, formula.replace(' ', ''), split_var,
            split_var_val)
    res_out_file = os.path.join(out_dir, reg_out_str)
    model_res_full_err.to_csv(res_out_file, sep='\t', index=True)
    ## save coeffs to file => pretty print as LaTeX
    # need lots of decimal points for multiple-comparison correction!
    pd.options.display.float_format = '{:,.5f}'.format
    tex_out_str = reg_out_str.replace('.tsv', '.tex')
    tex_res_out_file = os.path.join(out_dir, tex_out_str)
    model_res_full_err = model_res_full_err.assign(
        **{'coeff': model_res_full_err.index})
    tex_data_cols = ['coeff', 'mean', 'SE', 'p_val']
    model_res_full_err.to_latex(tex_res_out_file, columns=tex_data_cols,
                                index=False)
    ## compute regression fit parameters => deviance, AIC, etc.
    # start with chi2 test against the null model
    llr, model_df, p_val = compute_chi2_null_test(model_res_full, data,
                                                  dep_var, max_iter,
                                                  l2_weight)
    logging.debug('N=%d, LLR=%.5f, df=%d, p-val=%.3E' %
                  (data.shape[0], llr, model_df, p_val))
    # variance inflation factor: are some of the covariates highly collinear?
    # for sanity we only look at non-categorical vars;
    # categorical terms look like "C(var_name)[T.var_val]"
    # (e.g. "C(username)[T.barackobama]")
    cat_var_matcher = re.compile(r'C\(.+\)\[T\..+\]|Intercept')
    non_cat_params = [
        param for param in model_res_full.params.index
        if cat_var_matcher.search(param) is None
    ]
    for param in non_cat_params:
        VIF_i = compute_VIF(model_res_full, param)
        logging.debug('VIF test: param=%s, VIF=%.3f' % (param, VIF_i))
    ## compute accuracy via k-fold classification
    ## (R-squared does not apply to logistic regression)
    n_splits = 10
    accs = k_fold_acc(model_full.exog, model_full.endog, k=n_splits)
    mean_acc = np.mean(accs)
    se_acc = np.std(accs) / n_splits**.5
    logging.debug('%d-fold mean accuracy = %.3f +/- %.3f' %
                  (n_splits, mean_acc, se_acc))
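# Hypothetical invocation (added; file and column names are illustrative):
#
#     df = pd.read_csv('anchor_data.tsv', sep='\t')
#     run_regression(df, 'anchor ~ C(username) + followers',
#                    regression_type='regularized_logit')
#
# This writes 'anchor_regularized_logit_output_<formula>.tsv' (plus a .tex
# version of the coefficient table) into out_dir.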
def irls_incremental_dm(filename, chunksize, yPos=None,
                        family=Binomial(), link=Logit(),
                        maxit=25, tol=1e-08, header=None, headerNames=None):
    x = None
    nRows = chunksize
    # Peek at the first row to count columns and, if a header is present,
    # read the response column's name from it.
    tmp = pd.read_csv(filename, delimiter=',', header=None, nrows=1,
                      parse_dates=[1])
    nCols = tmp.shape[1] - 1
    if yPos is None:
        if header is None:
            yPos = nCols
        else:
            yPos = str(tmp[nCols].iloc[0]).strip()
    for j in range(1, maxit + 1):
        generator = _generator(filename=filename, header=header,
                               chunk_size=nRows)
        if x is None:
            x = np.zeros(nCols)
        ATWA = np.zeros((nCols, nCols))
        ATWz = np.zeros(nCols)
        for rowA in generator:
            yb = np.asarray(rowA[yPos].astype(np.float32))
            A = np.asarray(rowA.drop([yPos], axis=1), dtype=np.float32)
            eta = np.matmul(A, x).reshape(-1)
            g = link.inverse(eta)
            gprime = link.inverse_deriv(eta)
            # Working response and weights for this IRLS step
            z = np.array(eta + (yb - g) / gprime)
            varg = family.variance(g)
            W = (gprime**2 / varg).reshape(len(gprime), 1)
            # Accumulate A'Wz and A'WA across chunks
            ATWz = ATWz + np.matmul(np.transpose(A),
                                    np.asarray(W.reshape(-1) * z))
            ATWA = ATWA + np.matmul(np.transpose(A), np.asarray(W * A))
        xold = x
        C, rank, piv = cholesky_pivot(ATWA, full_pivot=True)
        if rank < C.shape[1]:
            raise LinAlgError("Rank-deficiency detected.")
        # Solve (A'WA) x = A'Wz with the pivoted Cholesky factor, then undo
        # the pivot with the inverse permutation.
        x = solve_triangular(np.transpose(C), ATWz[piv - 1], lower=True)
        x = solve_triangular(C, x, lower=False)[np.argsort(piv)]
        if np.sqrt(np.dot(x - xold, x - xold)) < tol:
            break
    if headerNames is not None:
        x = pd.DataFrame(x, index=headerNames)
    elif header is not None:
        x = pd.DataFrame(x, index=list(rowA.drop([yPos], axis=1)))
    else:
        x = pd.DataFrame(x)
    return (x, j)
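# Hypothetical call (added): for a headerless CSV whose last column is the
# 0/1 response and whose remaining columns form the design matrix,
#
#     coef, n_iter = irls_incremental_dm('design.csv', chunksize=10000)
#
# streams the file once per IRLS iteration, so peak memory is bounded by
# `chunksize` rows rather than by the full dataset.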
def glm_svd_newton_dm(X, y, family=Binomial(), link=Logit(), maxit=25,
                      tol=1e-08, stol=1e-08, singular_ok=True, weights=None,
                      reg_method="column projection"):
    S = list(svd(X))  # S[0]=U, S[1]=d (singular values), S[2]=Vt
    nVars = S[2].shape[1]
    idx = np.arange(nVars)
    i = (S[1] / S[1][0]) > stol
    k = np.sum(i)  # numerical rank
    pivot = np.arange(nVars)
    if k < nVars:
        # Rank-deficient design: select a well-conditioned column subset.
        if reg_method == "column projection":
            Q, R, pivot = qr(S[2][:, :k], pivoting=True)
            idx = np.sort(pivot[:k])   # columns kept, in original order
            omit = pivot[k:]
            S_new = svd(X[:, idx])
            if (S_new[1][-1] / S_new[1][0]) <= stol:
                print("Whoops! SVD subset selection failed, "
                      "trying pivoted QR on the full matrix")
                Q, R, pivot = qr(X, pivoting=True)
                idx = np.sort(pivot[:k])
                omit = pivot[k:]
                S_new = svd(X[:, idx])
            S = list(S_new)
            print("omitting column(s) ", omit)
    s = np.zeros(nVars)
    nobs = y.shape[0]
    nVars = S[2].shape[1]
    if weights is None:
        weights = np.ones(nobs)
    varianceFam = family.variance
    linkinvFam = link.inverse
    mu_eta = link.inverse_deriv
    if len(y.shape) == 1:
        mustart = (weights * y + 0.5) / (weights + 1)
    else:
        # two-column (successes, failures) response
        n = y.sum(axis=1)
        ytmp = np.where(n == 0, 0, y[:, 0] / n)
        mustart = (n * ytmp + 0.5) / (n + 1)
    eta = link(mustart)
    dev_resids = lambda y, m, w: family.resid_dev(y, m, w)**2
    dev = np.sum(dev_resids(y, linkinvFam(eta), weights))
    devold = 0
    for j in range(maxit):
        g = linkinvFam(eta)
        varg = varianceFam(g)
        if np.any(np.isnan(varg)):
            raise LinAlgError("NAs in variance of the inverse link function")
        if np.any(varg == 0):
            raise LinAlgError(
                "Zero value in variance of the inverse link function")
        gprime = mu_eta(eta)
        if np.any(np.isnan(gprime)):
            raise LinAlgError("NAs in the inverse link function derivative")
        # Working response and weights
        z = eta + (y - g) / gprime
        W = weights * (gprime**2 / varg)
        W = W.reshape(len(W), 1)
        # Solve the weighted least-squares step in the rank-k column space
        U = S[0][:, :k]
        cross1 = np.matmul(np.transpose(U), W * U)
        C, rank_bn, piv = cholesky_pivot(cross1, full_pivot=True)
        cross2 = np.matmul(np.transpose(np.asarray(U)),
                           np.asarray(W.reshape(-1) * z))[piv - 1]
        s = solve_triangular(np.transpose(C), cross2, lower=True)
        s = solve_triangular(C, s, lower=False)[np.argsort(piv)]
        eta = np.matmul(U, s)
        dev = np.sum(dev_resids(y, g, weights))
        if np.absolute(dev - devold) / (0.1 + np.absolute(dev)) < tol:
            break
        devold = dev
    x = np.empty(X.shape[1])
    x[:] = np.nan
    inV = 1 / S[1]
    if reg_method == "minimum norm":
        inV[inV > 1 / stol] = 1
    # Map the rank-k solution back to coefficients: x = V (s / d)
    x[idx] = np.matmul(S[2].T, (s * inV).reshape(-1, 1)).reshape(-1)
    x = pd.DataFrame(x, index=list(X))
    return (x, j + 1, k, pivot)  # coefficients, iterations, rank, pivot
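# Note (added): glm_svd_newton_dm works in the rank-k column space of
# X = U diag(d) V'. Each iteration solves for s with eta = U[:, :k] @ s, and
# the coefficients are recovered as x = V @ (s / d). A quick identity check
# with illustrative data:
import numpy as np
X_demo = np.random.default_rng(0).normal(size=(20, 3))
U, d, Vt = np.linalg.svd(X_demo, full_matrices=False)
s_demo = np.array([0.3, -1.2, 0.5])
x_demo = Vt.T @ (s_demo / d)
assert np.allclose(X_demo @ x_demo, U @ s_demo)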