def hash(self) -> Tuple[Tuple[str, ...], ...]:
    """
    Return a sorted tuple of digest tuples covering all stored data.

    One digest tuple is produced per categorical column (hashing its
    integer codes), per continuous column, and for the weights when
    present; interactions contribute their own precomputed hash tuples.
    Sorting makes the result invariant to column ordering.
    """

    def _digest(values) -> Tuple[str, ...]:
        # Hash the raw bytes of a contiguous copy of ``values``.  A
        # fresh hasher per column produces the same digests as the
        # reset-and-reuse pattern used elsewhere in this module.
        h = hash_func()
        h.update(ascontiguousarray(values).data)
        return (h.hexdigest(),)

    hashes: List[Tuple[str, ...]] = []
    if self._cat is not None:
        for col in self._cat:
            hashes.append(_digest(self._cat[col].cat.codes.to_numpy()))
    if self._cont is not None:
        for col in self._cont:
            hashes.append(_digest(self._cont[col].to_numpy()))
    if self._interactions is not None:
        for interact in self._interactions:
            hashes.extend(interact.hash)
    # Add weight hash if provided
    if self._weights is not None:
        # Same bytes as hashing ascontiguousarray(self._weights.data)
        # directly, since the weights buffer is copied contiguously.
        hashes.append(_digest(self._weights))
    return tuple(sorted(hashes))
def lsmr_annihilate(x: csc_matrix, y: ndarray, use_cache: bool = True,
                    x_hash=None, **lsmr_options) -> ndarray:
    r"""
    Removes the projection of y onto x from y

    Parameters
    ----------
    x : csc_matrix
        Sparse array of regressors
    y : ndarray
        Array with shape (nobs, nvar)
    use_cache : bool
        Flag indicating whether results should be stored in the cache,
        and retrieved if available.
    x_hash : object
        Hashable object representing the values in x
    lsmr_options: dict
        Dictionary of options to pass to scipy.sparse.linalg.lsmr

    Returns
    -------
    resids : ndarray
        Returns the residuals from regressing y on x, (nobs, nvar)

    Notes
    -----
    Residuals are estimated column-by-column as

    .. math::

        \hat{\epsilon}_{j} = y_{j} - x^\prime \hat{\beta}

    where :math:`\hat{\beta}` is computed using lsmr.
    """
    # Caching is only meaningful when the regressors can be identified
    use_cache = use_cache and x_hash is not None
    regressor_hash = x_hash if x_hash is not None else ''
    default_opts = dict(atol=1e-8, btol=1e-8, show=False)
    default_opts.update(lsmr_options)
    resids = []
    for i in range(y.shape[1]):
        _y = y[:, i:i + 1]

        variable_digest = ''
        if use_cache:
            hasher = hash_func()
            hasher.update(ascontiguousarray(_y.data))
            variable_digest = hasher.hexdigest()

        if use_cache and variable_digest in _VARIABLE_CACHE[regressor_hash]:
            resid = _VARIABLE_CACHE[regressor_hash][variable_digest]
        else:
            beta = lsmr(x, _y, **default_opts)[0]
            # .toarray() replaces the deprecated sparse ``.A`` attribute
            resid = _y - x.dot(csc_matrix(beta[:, None])).toarray()
            # BUG FIX: previously the result was written to the cache
            # unconditionally, polluting _VARIABLE_CACHE[''] whenever
            # caching was disabled or x_hash was not supplied.
            if use_cache:
                _VARIABLE_CACHE[regressor_hash][variable_digest] = resid
        resids.append(resid)
    if resids:
        return column_stack(resids)
    else:
        # y had zero columns; preserve its shape and dtype
        return empty_like(y)
def hash(self) -> List[Tuple[str, ...]]:
    """
    Construct a hash that will be invariant for any permutation of
    inputs that produce the same fit when used as regressors

    Returns
    -------
    hashes : list of tuple
        One tuple per continuous variable, containing the sorted
        categorical digests followed by that variable's own digest.
        Empty when there are no continuous variables.
    """
    # Sorted hashes of any categoricals
    hasher = hash_func()
    cat_hashes = []
    cat = self.cat
    for col in cat:
        # CONSISTENCY FIX: hash the bytes of a contiguous copy of the
        # codes via ascontiguousarray(...).data, matching the pattern
        # used elsewhere in this module (previously the .data
        # memoryview was wrapped in ascontiguousarray instead).  The
        # bytes fed to the hasher — and hence the digests — are
        # unchanged.
        hasher.update(ascontiguousarray(cat[col].cat.codes.to_numpy()).data)
        cat_hashes.append(hasher.hexdigest())
        hasher = _reset(hasher)
    sorted_hashes = tuple(sorted(cat_hashes))

    hashes = []
    cont = self.cont
    for col in cont:
        hasher.update(ascontiguousarray(cont[col].to_numpy()).data)
        hashes.append(sorted_hashes + (hasher.hexdigest(),))
        hasher = _reset(hasher)
    return sorted(hashes)
def _reset(hasher: Hasher) -> Hasher:
    """
    Return a hasher with cleared state.

    Reuses ``hasher`` in place when it exposes an in-place ``reset``
    method; otherwise (e.g. hashlib objects, which cannot be reset)
    falls back to constructing a brand-new hasher via ``hash_func``.
    """
    try:
        hasher.reset()
    except AttributeError:
        # No reset support -- start from scratch with a fresh hasher.
        return hash_func()
    return hasher