@property
def hash(self) -> Tuple[Tuple[str, ...], ...]:
    hashes: List[Tuple[str, ...]] = []
    hasher = hash_func()
    if self._cat is not None:
        for col in self._cat:
            hasher.update(
                ascontiguousarray(
                    to_numpy(get_codes(self._cat[col].cat)).data))
            hashes.append((hasher.hexdigest(),))
            hasher = _reset(hasher)
    if self._cont is not None:
        for col in self._cont:
            hasher.update(
                ascontiguousarray(to_numpy(self._cont[col]).data))
            hashes.append((hasher.hexdigest(),))
            hasher = _reset(hasher)
    if self._interactions is not None:
        for interact in self._interactions:
            hashes.extend(interact.hash)
    # Add weight hash if provided
    if self._weights is not None:
        hasher = hash_func()
        hasher.update(ascontiguousarray(self._weights.data))
        hashes.append((hasher.hexdigest(),))
    return tuple(sorted(hashes))
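# A minimal, self-contained sketch of the hashing idea used above: hash each
# column's raw memory buffer and sort the resulting one-element tuples so that
# the order in which columns are supplied does not change the result. This
# assumes hashlib.sha256 as a stand-in for hash_func (the real hash_func may
# use xxhash when it is installed); the helper name is illustrative only.
def _example_buffer_hash(arrays):
    import hashlib

    from numpy import ascontiguousarray

    hashes = []
    for arr in arrays:
        hasher = hashlib.sha256()
        # Hash the contiguous buffer of the array, not its Python repr
        hasher.update(ascontiguousarray(arr).data)
        hashes.append((hasher.hexdigest(),))
    # Sorting makes the result invariant to column permutations
    return tuple(sorted(hashes))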
def _post_estimation(self, params: ndarray, cov_estimator, cov_type: str):
    columns = self._columns
    index = self._index
    eps = self.resids(params)
    fitted = DataFrame(self._dependent.ndarray - eps,
                       index=self._dependent.rows,
                       columns=['fitted_values'])
    absorbed_effects = DataFrame(to_numpy(self._absorbed_dependent) -
                                 to_numpy(fitted),
                                 columns=['absorbed_effects'],
                                 index=self._dependent.rows)

    weps = self.wresids(params)
    cov = cov_estimator.cov
    debiased = cov_estimator.debiased

    residual_ss = (weps.T @ weps)[0, 0]

    w = self.weights.ndarray
    root_w = sqrt(w)
    e = self._dependent.ndarray * root_w
    if self.has_constant:
        e = e - root_w * average(self._dependent.ndarray, weights=w)

    total_ss = float(e.T @ e)
    r2 = max(1 - residual_ss / total_ss, 0.0)

    e = to_numpy(self._absorbed_dependent)  # already scaled by root_w
    # If absorbing contains a constant, but exog does not, no need to demean
    if self._const_col is not None:
        col = self._const_col
        x = to_numpy(self._absorbed_exog)[:, col:col + 1]
        mu = (lstsq(x, e, rcond=None)[0]).squeeze()
        e = e - x * mu

    absorbed_total_ss = float(e.T @ e)
    r2_absorbed = max(1 - residual_ss / absorbed_total_ss, 0.0)

    fstat = self._f_statistic(params, cov, debiased)
    out = {'params': Series(params.squeeze(), columns, name='parameter'),
           'eps': Series(eps.squeeze(), index=index, name='residual'),
           'weps': Series(weps.squeeze(), index=index,
                          name='weighted residual'),
           'cov': DataFrame(cov, columns=columns, index=columns),
           's2': float(cov_estimator.s2),
           'debiased': debiased,
           'residual_ss': float(residual_ss),
           'total_ss': float(total_ss),
           'r2': float(r2),
           'fstat': fstat,
           'vars': columns,
           'instruments': [],
           'cov_config': cov_estimator.config,
           'cov_type': cov_type,
           'method': self._method,
           'cov_estimator': cov_estimator,
           'fitted': fitted,
           'original_index': self._original_index,
           'absorbed_effects': absorbed_effects,
           'absorbed_r2': r2_absorbed}

    return out
def _drop_missing(self) -> NDArray:
    missing = to_numpy(self.dependent.isnull)
    missing |= to_numpy(self.exog.isnull)
    missing |= to_numpy(self._absorb_inter.cat.isnull().any(axis=1))
    missing |= to_numpy(self._absorb_inter.cont.isnull().any(axis=1))
    for interact in self._interaction_list:
        missing |= to_numpy(interact.isnull)
    if npany(missing):
        self.dependent.drop(missing)
        self.exog.drop(missing)
        self._absorb_inter.drop(missing)
        for interact in self._interaction_list:
            interact.drop(missing)
    missing_warning(missing)
    return missing
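# Sketch of the missing-value handling above, assuming only numpy and pandas:
# OR together per-block null indicators into a single mask, then drop the
# flagged rows from every block. The helper name is illustrative only.
def _example_missing_mask():
    import numpy as np
    import pandas as pd

    y = pd.Series([1.0, np.nan, 3.0, 4.0])
    x = pd.DataFrame({"x1": [1.0, 2.0, np.nan, 4.0]})
    missing = np.asarray(y.isnull()) | np.asarray(x.isnull().any(axis=1))
    return y[~missing], x[~missing], missing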
def category_continuous_interaction(cat: AnyPandas, cont: AnyPandas,
                                    precondition: bool = True) -> csc_matrix:
    """
    Interact a categorical variable with a continuous variable

    Parameters
    ----------
    cat : Series
        Categorical series to convert to dummy variables
    cont : {Series, DataFrame}
        Continuous variable values to use in the dummy interaction
    precondition : bool
        Flag whether dummies should be preconditioned

    Returns
    -------
    csc_matrix
        Sparse matrix of dummy interactions with unit column norm
    """
    codes = get_codes(category_product(cat).cat)
    interact = csc_matrix((to_numpy(cont).flat,
                           (arange(codes.shape[0]), codes)))
    if not precondition:
        return interact
    else:
        conditioned = preconditioner(interact)[0]
        assert isinstance(conditioned, csc_matrix)
        return conditioned
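# Illustrative sketch of the construction above, using only numpy, pandas and
# scipy: each row places its continuous value in the column given by its
# category code, so the dense form has exactly one non-zero per row. Without
# preconditioning the columns are not rescaled. The helper name is
# illustrative only.
def _example_category_continuous_sparse():
    import numpy as np
    import pandas as pd
    from scipy.sparse import csc_matrix

    cat = pd.Series(pd.Categorical(["a", "b", "a", "b"]))
    cont = np.array([1.0, 2.0, 3.0, 4.0])
    codes = np.asarray(cat.cat.codes)  # [0, 1, 0, 1]
    interact = csc_matrix((cont, (np.arange(codes.shape[0]), codes)))
    # Dense form: [[1, 0], [0, 2], [3, 0], [0, 4]]
    return interact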
def test_drop_missing():
    gen = generate_data(
        2, True, 2, factor_format="pandas", ncont=0, cont_interactions=1
    )
    gen.y[::53] = np.nan
    gen.x[::79] = np.nan
    with pytest.warns(MissingValueWarning):
        AbsorbingLS(gen.y, gen.x, absorb=gen.absorb, interactions=gen.interactions)

    gen = generate_data(
        2, True, 2, factor_format="pandas", ncont=0, cont_interactions=1
    )
    for col in gen.absorb:
        gen.absorb[col] = gen.absorb[col].astype("int64").astype("object")
        col_iloc = gen.absorb.columns.get_loc(col)
        gen.absorb.iloc[::91, col_iloc] = np.nan
        gen.absorb[col] = pd.Categorical(to_numpy(gen.absorb[col]))
    with pytest.warns(MissingValueWarning):
        AbsorbingLS(gen.y, gen.x, absorb=gen.absorb, interactions=gen.interactions)
def test_interaction_cont_only(cont):
    interact = Interaction(cont=cont)
    assert interact.nobs == cont.shape[0]
    assert_frame_equal(cont, interact.cont)

    expected = to_numpy(cont)
    actual = interact.sparse
    assert isinstance(actual, csc_matrix)
    assert_allclose(expected, actual.A)
def test_absorbing_regressors_hash(cat, cont, interact, weights):
    areg = AbsorbingRegressor(
        cat=cat, cont=cont, interactions=interact, weights=weights
    )
    # Build hash
    hashes = []
    for col in cat:
        hashes.append((hasher.single(to_numpy(get_codes(cat[col].cat)).data),))
    for col in cont:
        hashes.append((hasher.single(to_numpy(cont[col]).data),))
    hashes = sorted(hashes)
    if interact is not None:
        for inter in interact:
            hashes.extend(inter.hash)
    if weights is not None:
        hashes.append((hasher.single(weights.data),))
    hashes = tuple(sorted(hashes))
    assert hashes == areg.hash
def wresids(self, params: ndarray):
    """
    Compute weighted model residuals

    Parameters
    ----------
    params : ndarray
        Model parameters (nvar by 1)

    Returns
    -------
    wresids : ndarray
        Weighted model residuals

    Notes
    -----
    Uses weighted versions of data instead of raw data. Identical to
    resids if all weights are unity.
    """
    return (to_numpy(self._absorbed_dependent)
            - to_numpy(self._absorbed_exog) @ params)
@property
def hash(self):
    """
    Construct a hash that will be invariant for any permutation of
    inputs that produce the same fit when used as regressors
    """
    # Sorted hashes of any categoricals
    hasher = hash_func()
    cat_hashes = []
    cat = self.cat
    for col in cat:
        hasher.update(
            ascontiguousarray(to_numpy(get_codes(self.cat[col].cat)).data))
        cat_hashes.append(hasher.hexdigest())
        hasher = _reset(hasher)
    cat_hashes = tuple(sorted(cat_hashes))

    hashes = []
    cont = self.cont
    for col in cont:
        hasher.update(ascontiguousarray(to_numpy(cont[col]).data))
        hashes.append(cat_hashes + (hasher.hexdigest(),))
        hasher = _reset(hasher)

    return sorted(hashes)
def test_interaction_cat_cont(cat, cont):
    interact = Interaction(cat=cat, cont=cont)
    assert interact.nobs == cat.shape[0]
    assert_frame_equal(cat, interact.cat)
    assert_frame_equal(cont, interact.cont)

    base = category_interaction(category_product(cat), precondition=False).A
    expected = []
    for i in range(cont.shape[1]):
        element = base.copy()
        element[np.where(element)] = to_numpy(cont.iloc[:, i])
        expected.append(element)
    expected = np.column_stack(expected)
    actual = interact.sparse
    assert isinstance(actual, csc_matrix)
    assert_allclose(expected, actual.A)
def test_against_ols(ols_data):
    mod = AbsorbingLS(
        ols_data.y,
        ols_data.x,
        absorb=ols_data.absorb,
        interactions=ols_data.interactions,
        weights=ols_data.weights,
    )
    res = mod.fit()
    absorb = []
    has_dummy = False
    if ols_data.absorb is not None:
        absorb.append(to_numpy(ols_data.absorb.cont))
        if ols_data.absorb.cat.shape[1] > 0:
            dummies = dummy_matrix(ols_data.absorb.cat, precondition=False)[0]
            assert isinstance(dummies, sp.csc_matrix)
            absorb.append(dummies.A)
        has_dummy = ols_data.absorb.cat.shape[1] > 0
    if ols_data.interactions is not None:
        for interact in ols_data.interactions:
            absorb.append(interact.sparse.A)
    _x = ols_data.x
    if absorb:
        absorb = np.column_stack(absorb)
        if np.any(np.ptp(_x, axis=0) == 0) and has_dummy:
            if ols_data.weights is None:
                absorb = annihilate(absorb, np.ones((absorb.shape[0], 1)))
            else:
                root_w = np.sqrt(mod.weights.ndarray)
                wabsorb = annihilate(root_w * absorb, root_w)
                absorb = (1.0 / root_w) * wabsorb
        rank = np.linalg.matrix_rank(absorb)
        if rank < absorb.shape[1]:
            a, b = np.linalg.eig(absorb.T @ absorb)
            order = np.argsort(a)[::-1]
            a, b = a[order], b[:, order]
            z = absorb @ b
            absorb = z[:, :rank]
        _x = np.column_stack([_x, absorb])
    ols_mod = _OLS(ols_data.y, _x, weights=ols_data.weights)
    ols_res = ols_mod.fit()
    assert_results_equal(ols_res, res)
def _regressors(self) -> csc_matrix:
    regressors = []

    if self._cat is not None and self._cat.shape[1] > 0:
        regressors.append(dummy_matrix(self._cat, precondition=False)[0])
    if self._cont is not None and self._cont.shape[1] > 0:
        regressors.append(csc_matrix(to_numpy(self._cont)))
    if self._interactions is not None:
        regressors.extend([interact.sparse for interact in self._interactions])

    if regressors:
        regressor_mat = sp.hstack(regressors, format='csc')
        approx_rank = regressor_mat.shape[1]
        self._approx_rank = approx_rank
        if self._weights is not None:
            return (sp.diags(sqrt(self._weights.squeeze()))
                    .dot(regressor_mat)).asformat('csc')
        return regressor_mat
    else:
        self._approx_rank = 0
        return csc_matrix(empty((0, 0)))
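# A small sketch of the weighting step used above, assuming only numpy and
# scipy: stacking the dummy and continuous blocks with sp.hstack and then
# left-multiplying by a diagonal matrix of sqrt(weights) scales every row
# while keeping the result in CSC format. The helper name is illustrative
# only.
def _example_weighted_sparse_regressors():
    import numpy as np
    import scipy.sparse as sp

    dummies = sp.csc_matrix(np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0]]))
    cont = sp.csc_matrix(np.array([[0.5], [1.5], [2.5]]))
    regressor_mat = sp.hstack([dummies, cont], format="csc")
    weights = np.array([1.0, 4.0, 9.0])
    weighted = sp.diags(np.sqrt(weights)).dot(regressor_mat).asformat("csc")
    return weighted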
def fit(
    self,
    *,
    cov_type: str = "robust",
    debiased: bool = False,
    lsmr_options: Optional[Dict[str, Union[float, bool]]] = None,
    use_cache: bool = True,
    **cov_config: Any,
) -> AbsorbingLSResults:
    """
    Estimate model parameters

    Parameters
    ----------
    cov_type : str, optional
        Name of covariance estimator to use. Supported covariance
        estimators are:

        * 'unadjusted', 'homoskedastic' - Classic homoskedastic inference
        * 'robust', 'heteroskedastic' - Heteroskedasticity robust inference
        * 'kernel' - Heteroskedasticity and autocorrelation robust
          inference
        * 'cluster' - One-way cluster dependent inference.
          Heteroskedasticity robust

    debiased : bool, optional
        Flag indicating whether to debias the covariance estimator using
        a degree of freedom adjustment.
    lsmr_options : dict
        Dictionary of options to pass to scipy.sparse.linalg.lsmr
    use_cache : bool
        Flag indicating whether the variables, once purged from the
        absorbed variables and interactions, should be stored in the
        cache, and retrieved if available. Cache can dramatically speed up
        re-fitting large models when the set of absorbed variables and
        interactions are identical.
    **cov_config
        Additional parameters to pass to covariance estimator. The list of
        optional parameters differs according to ``cov_type``. See the
        documentation of the alternative covariance estimators for the
        complete list of available options.

    Returns
    -------
    AbsorbingLSResults
        Results container

    Notes
    -----
    Additional covariance parameters depend on the specific covariance
    used. See the docstring of the specific covariance estimator for a
    list of supported options. Defaults are used if no covariance
    configuration is provided.

    If use_cache is True, then variables are hashed based on their
    contents using either a 64 bit value (if xxhash is installed) or a
    256 bit value. This allows variables to be reused in different models
    if the set of absorbing variables and interactions is held constant.

    See also
    --------
    linearmodels.iv.covariance.HomoskedasticCovariance
    linearmodels.iv.covariance.HeteroskedasticCovariance
    linearmodels.iv.covariance.KernelCovariance
    linearmodels.iv.covariance.ClusteredCovariance
    """
    if self._absorbed_dependent is None:
        self._first_time_fit(use_cache, lsmr_options)

    self._x = exog_resid = to_numpy(self.absorbed_exog)
    dep_resid = to_numpy(self.absorbed_dependent)
    if self._exog.shape[1] == 0:
        params = empty((0, 1))
    else:
        if exog_resid.shape[1]:
            check_absorbed(exog_resid, self.exog.cols)
        params = lstsq(exog_resid, dep_resid, rcond=None)[0]
        self._num_params += exog_resid.shape[1]

    cov_estimator = COVARIANCE_ESTIMATORS[cov_type]
    cov_config["debiased"] = debiased
    cov_config["kappa"] = 0.0
    cov_config_copy = {k: v for k, v in cov_config.items()}
    if "center" in cov_config_copy:
        del cov_config_copy["center"]
    cov_estimator_inst = cov_estimator(
        exog_resid, dep_resid, exog_resid, params, **cov_config_copy
    )

    results = {"kappa": 0.0, "liml_kappa": 0.0}
    pe = self._post_estimation(params, cov_estimator_inst, cov_type)
    results.update(pe)
    results["df_model"] = self._num_params

    return AbsorbingLSResults(results, self)
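# A hedged usage sketch for fit(): absorb a high-dimensional categorical
# through ``absorb`` and recover the slope on the remaining exogenous
# regressor. The data are simulated purely for illustration and the helper
# name is not part of the library.
def _example_absorbing_ls_fit():
    import numpy as np
    import pandas as pd

    from linearmodels.iv.absorbing import AbsorbingLS

    rs = np.random.RandomState(0)
    n = 1000
    groups = pd.Series(pd.Categorical(rs.randint(0, 50, n)))
    x = pd.DataFrame({"x1": rs.standard_normal(n)})
    effects = rs.standard_normal(50)[np.asarray(groups.cat.codes)]
    y = pd.Series(2.0 * x["x1"] + effects + rs.standard_normal(n), name="y")
    mod = AbsorbingLS(y, x, absorb=pd.DataFrame({"groups": groups}))
    res = mod.fit(cov_type="robust")
    # The estimated coefficient on x1 should be close to the true value of 2.0
    return res.params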
def _post_estimation(
    self,
    params: NDArray,
    cov_estimator: Union[
        HomoskedasticCovariance,
        HeteroskedasticCovariance,
        KernelCovariance,
        ClusteredCovariance,
    ],
    cov_type: str,
) -> Dict[str, Any]:
    columns = self._columns
    index = self._index
    eps = self.resids(params)
    fitted = DataFrame(
        self._dependent.ndarray - eps,
        index=self._dependent.rows,
        columns=["fitted_values"],
    )
    absorbed_effects = DataFrame(
        to_numpy(self._absorbed_dependent) - to_numpy(fitted),
        columns=["absorbed_effects"],
        index=self._dependent.rows,
    )

    weps = self.wresids(params)
    cov = cov_estimator.cov
    debiased = cov_estimator.debiased

    residual_ss = (weps.T @ weps)[0, 0]

    w = self.weights.ndarray
    root_w = sqrt(w)
    e = self._dependent.ndarray * root_w
    if self.has_constant:
        e = e - root_w * average(self._dependent.ndarray, weights=w)

    total_ss = float(e.T @ e)
    r2 = max(1 - residual_ss / total_ss, 0.0)

    e = to_numpy(self._absorbed_dependent)  # already scaled by root_w
    # If absorbing contains a constant, but exog does not, no need to demean
    if self._const_col is not None:
        col = self._const_col
        x = to_numpy(self._absorbed_exog)[:, col:col + 1]
        mu = (lstsq(x, e, rcond=None)[0]).squeeze()
        e = e - x * mu

    absorbed_total_ss = float(e.T @ e)
    r2_absorbed = max(1 - residual_ss / absorbed_total_ss, 0.0)

    fstat = self._f_statistic(params, cov, debiased)
    out = {
        "params": Series(params.squeeze(), columns, name="parameter"),
        "eps": Series(eps.squeeze(), index=index, name="residual"),
        "weps": Series(weps.squeeze(), index=index, name="weighted residual"),
        "cov": DataFrame(cov, columns=columns, index=columns),
        "s2": float(cov_estimator.s2),
        "debiased": debiased,
        "residual_ss": float(residual_ss),
        "total_ss": float(total_ss),
        "r2": float(r2),
        "fstat": fstat,
        "vars": columns,
        "instruments": [],
        "cov_config": cov_estimator.config,
        "cov_type": cov_type,
        "method": self._method,
        "cov_estimator": cov_estimator,
        "fitted": fitted,
        "original_index": self._original_index,
        "absorbed_effects": absorbed_effects,
        "absorbed_r2": r2_absorbed,
    }

    return out
def test_interaction_cat_cont_convert(cat, cont):
    base = Interaction(cat, cont)
    interact = Interaction(to_numpy(cat), cont)
    assert_allclose(base.sparse.A, interact.sparse.A)