def _check_coltype(self, X):
    """Validate every entry of ``self.columns`` against the input ``X``.

    String entries are rejected when ``X`` is a numpy array and must be
    present in ``X.columns`` when ``X`` is a pandas DataFrame; for any other
    input type a string entry is not checked here.  Integer entries must be
    a valid non-negative index into the columns of ``X`` viewed as an
    (at least) 2-D array.

    Parameters
    ----------
    X : array-like
        Input data (numpy array, DataFrame, or anything ``np.asarray``
        can convert).

    Raises
    ------
    ValueError
        If a string column is given for a numpy array, a string column is
        missing from a DataFrame, or an integer column is out of bounds.
    """
    # Computed lazily and only once: converting X may be expensive, and it
    # is only needed when at least one integer column is configured.
    n_columns = None
    for col in as_list(self.columns):
        if isinstance(col, str):
            if isinstance(X, np.ndarray):
                raise ValueError(
                    f"column {col} is a string but datatype receive is numpy."
                )
            if isinstance(X, pd.DataFrame) and col not in X.columns:
                raise ValueError(f"column {col} is not in {X.columns}")
        if isinstance(col, int):
            if n_columns is None:
                n_columns = np.atleast_2d(np.asarray(X)).shape[1]
            if col not in range(n_columns):
                # np.shape works for inputs without a ``.shape`` attribute
                # (e.g. nested lists), so the intended ValueError is raised
                # instead of an AttributeError while building the message.
                raise ValueError(
                    f"column {col} is out of bounds for input shape {np.shape(X)}"
                )
def fit(self, X, y=None):
    """Learn the projection required to make the dataset orthogonal to sensitive columns.

    Parameters
    ----------
    X : array-like
        Training data; validated with ``check_array``.
    y : ignored
        Not used by this method.

    Returns
    -------
    self
        The fitted estimator, with ``col_ids_`` (integer entries of
        ``self.columns`` kept as-is, other entries resolved through
        ``self._col_idx``) and ``projection_`` (least-squares solution ``P``
        of ``X P = X_fair``) set.
    """
    self._check_coltype(X)
    self.col_ids_ = [
        v if isinstance(v, int) else self._col_idx(X, v)
        for v in as_list(self.columns)
    ]
    X = check_array(X, estimator=self)

    # Work on a floating-point copy: with integer input, the in-place column
    # updates below would silently truncate the projection results.
    fair_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64
    X_fair = X.astype(fair_dtype)  # astype returns a new array by default

    v_vectors = self._make_v_vectors(X, self.col_ids_)

    # Gram-Schmidt process, but only against the sensitive attributes:
    # remove from every column its component along each column of v_vectors.
    # NOTE(review): assumes _vector_projection(a, v) returns the projection
    # of a onto v -- confirm against the helper's definition.
    for i in range(X_fair.shape[1]):
        for v in v_vectors.T:
            X_fair[:, i] = X_fair[:, i] - _vector_projection(X_fair[:, i], v)

    # We want to learn the matrix P such that X P = X_fair, which is why
    # X_fair has to be constructed first.
    self.projection_, *_ = np.linalg.lstsq(X, X_fair, rcond=None)
    return self