def __call__(
    self, data: DataTuple, split_id: int = 0
) -> Tuple[DataTuple, DataTuple, Dict[str, float]]:
    """Split ``data`` into a train and a test part with a seeded proportional split.

    Args:
        data: the data to split
        split_id: handed to ``self._get_seed`` to obtain the random seed for this split

    Returns:
        A tuple of the train subset, the test subset and a dictionary that records the
        seed that was used under the key ``"seed"``.
    """
    seed = self._get_seed(split_id)
    train_idx, test_idx = generate_proportional_split_indexes(
        data, train_percentage=self.train_percentage, random_seed=seed
    )

    def _subset(indexes, name: str) -> DataTuple:
        """Pick the given row positions from x, s and y and renumber the rows from zero."""
        return DataTuple(
            x=data.x.iloc[indexes].reset_index(drop=True),
            s=data.s.iloc[indexes].reset_index(drop=True),
            y=data.y.iloc[indexes].reset_index(drop=True),
            name=name,
        )

    train: DataTuple = _subset(train_idx, f"{data.name} - Train")
    test: DataTuple = _subset(test_idx, f"{data.name} - Test")

    # every input row has to end up in exactly one of the two subsets
    assert len(data) == len(train) + len(test)

    split_info: Dict[str, float] = {"seed": seed}
    return train, test, split_info
def fit(self, train: DataTuple) -> Tuple[PreAlgorithm, DataTuple]:
    """Fit the algorithm by running its script in "fit" mode and load the transformed data.

    The training data is written to a temporary directory, the command built by
    ``self._fit_script_command`` is run with ``--mode fit`` appended, and the transformed
    training data is read back from the output file that was handed to that command.
    ``self.model_path`` is set as a side effect and passed to the command as well.

    Args:
        train: training data

    Returns:
        a tuple of this algorithm instance and the transformed training data
    """
    self.model_path = self.model_dir / f"model_{self.name}.joblib"
    with TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir)
        # ================================ write data to files ================================
        train_path = tmp_path / "train.npz"
        train.to_npz(train_path)

        # ========================== generate commandline arguments ===========================
        transformed_train_path = tmp_path / "transformed_train.npz"
        cmd = self._fit_script_command(train_path, transformed_train_path, self.model_path)

        # ============================= run the generated command =============================
        self._call_script(cmd + ["--mode", "fit"])

        # ================================== load results =====================================
        transformed_train = DataTuple.from_npz(transformed_train_path)

        # prefix the name of the algorithm to the dataset name
        transformed_train = transformed_train.replace(
            name=None if train.name is None else f"{self.name}: {train.name}"
        )
        return self, transformed_train
def main() -> None:
    """Main method to run model.

    Parses the command-line flags, seeds the random number generators via ``set_seed``
    and then dispatches on ``args.mode``:

    * ``"run"``: load train and test data, train and transform both, save both results.
    * ``"fit"``: fit an encoder on the training data, save the transformed training data
      and dump the encoder to ``args.model``.
    * ``"transform"``: load the encoder from ``args.model``, transform the given data and
      save the result.

    Raises:
        RuntimeError: if ``args.mode`` is none of the modes above.
    """
    args = VfaeArgs(explicit_bool=True).parse_args()
    set_seed(args.seed)
    if args.mode == "run":
        assert args.train is not None
        assert args.new_train is not None
        assert args.test is not None
        assert args.new_test is not None
        train, test = load_data_from_flags(args)
        save_transformations(train_and_transform(train, test, args), args)
    elif args.mode == "fit":
        assert args.model is not None
        assert args.train is not None
        assert args.new_train is not None
        train = DataTuple.from_npz(Path(args.train))
        enc = fit(train, args)
        transformed_train = transform(enc, train, args)
        transformed_train.to_npz(Path(args.new_train))
        dump(enc, Path(args.model))
    elif args.mode == "transform":
        assert args.model is not None
        assert args.test is not None
        assert args.new_test is not None
        # NOTE(review): the data to transform is loaded as a DataTuple here - confirm that
        # the file is expected to contain labels.
        test = DataTuple.from_npz(Path(args.test))
        model = load(Path(args.model))
        transformed_test = transform(model, test, args)
        transformed_test.to_npz(Path(args.new_test))
    else:
        # fail loudly instead of silently producing no output files
        raise RuntimeError(f"Unknown mode: {args.mode}")
def main() -> None:
    """Load the data named on the command line, run the requested mode and save the result.

    Dispatches on ``args.mode``:

    * ``"run"``: load train and test data, pass them to `train_and_transform` and save
      both transformed datasets.
    * ``"fit"``: fit on the training data (npz file), save the transformed training data
      and dump the fitted encoder to ``args.model``.
    * ``"transform"``: load the encoder from ``args.model``, transform the given data
      (npz file) and save the result.

    Raises:
        RuntimeError: if ``args.mode`` is none of the modes above.
    """
    args = BeutelArgs().parse_args()
    if args.mode == "run":
        assert args.train is not None
        assert args.new_train is not None
        assert args.test is not None
        assert args.new_test is not None
        train, test = load_data_from_flags(args)
        save_transformations(train_and_transform(train, test, args), args)
    elif args.mode == "fit":
        assert args.model is not None
        assert args.train is not None
        assert args.new_train is not None
        train = DataTuple.from_npz(Path(args.train))
        transformed_train, enc = fit(train, args)
        transformed_train.to_npz(Path(args.new_train))
        dump(enc, Path(args.model))
    elif args.mode == "transform":
        assert args.model is not None
        assert args.test is not None
        assert args.new_test is not None
        # NOTE(review): the data to transform is loaded as a DataTuple here - confirm that
        # the file is expected to contain labels.
        test = DataTuple.from_npz(Path(args.test))
        model = load(Path(args.model))
        transformed_test = transform(test, model, args)
        transformed_test.to_npz(Path(args.new_test))
    else:
        # fail loudly instead of silently producing no output files
        raise RuntimeError(f"Unknown mode: {args.mode}")
def domain_split(
    datatup: DataTuple, tr_cond: str, te_cond: str, seed: int = 888
) -> Tuple[DataTuple, DataTuple]:
    """Split a datatuple into train and test according to two conditions on the features.

    The rows of ``datatup.x`` selected by ``te_cond`` always go to the test set. From the
    rows selected by ``tr_cond`` a random fraction of ``(1 - 2 * test_pct) / train_pct`` is
    kept for training and the remainder is moved to the test set as well, where
    ``test_pct`` is the share of rows that meet the test condition.

    Args:
        datatup: DataTuple
        tr_cond: condition for the training set
        te_cond: condition for the test set
        seed: random state used when sampling the rows that stay in the training set

    Returns:
        Tuple of DataTuple split into train and test. Both keep the name of ``datatup``.
    """
    features = datatup.x

    train_part = dataset_from_cond(features, tr_cond)
    test_part = dataset_from_cond(features, te_cond)

    num_total = features.shape[0]
    # the two conditions are expected to partition the data
    assert train_part.shape[0] + test_part.shape[0] == num_total

    test_pct = test_part.shape[0] / num_total
    train_pct = 1 - test_pct
    keep_frac = (1 - (test_pct * 2)) / train_pct

    kept = train_part.sample(frac=keep_frac, random_state=seed)
    moved = train_part.drop(kept.index, axis="index")  # type: ignore[arg-type]
    test_rows = pd.concat([moved, test_part], axis="index")

    def _select(rows: pd.Index) -> DataTuple:
        """Build a DataTuple from the given rows, renumbering them from zero."""
        # NOTE(review): index labels are used as positions here, which presumably relies
        # on the frames having a default integer index - confirm against callers.
        return DataTuple(
            x=datatup.x.iloc[rows].reset_index(drop=True),  # type: ignore[call-overload]
            s=datatup.s.iloc[rows].reset_index(drop=True),  # type: ignore[call-overload]
            y=datatup.y.iloc[rows].reset_index(drop=True),  # type: ignore[call-overload]
            name=datatup.name,
        )

    train_datatup = _select(kept.index)
    test_datatup = _select(test_rows.index)
    return train_datatup, test_datatup
def fold_data(data: DataTuple, folds: int) -> Iterator[Tuple[DataTuple, DataTuple]]:
    """Yield ``folds`` train/validation splits made of consecutive blocks of rows.

    The rows are not shuffled. Fold sizes differ by at most one row: the first
    ``n_rows % folds`` folds receive one extra row. Credit to scikit-learn, whose open
    source code inspired this implementation.

    Args:
        data: the data to split
        folds: number of folds

    Yields:
        For fold ``i``, a tuple of the training DataTuple (all rows outside the fold,
        named ``"<name> - train fold i"``) and the validation DataTuple (the rows of the
        fold, named ``"<name> - test fold i"``), both with the rows renumbered from zero.
    """
    num_rows = data.x.shape[0]
    indices: np.ndarray = np.arange(num_rows)

    fold_sizes: np.ndarray = np.full(folds, num_rows // folds, dtype=np.int32)
    fold_sizes[: num_rows % folds] += np.int32(1)

    current = 0
    for i, fold_size in enumerate(fold_sizes):
        start, stop = current, int(current + fold_size)
        val_inds: np.ndarray = indices[start:stop]
        # the validation rows are one contiguous slice, so the training rows are simply
        # everything before and after it; no per-element membership test is needed
        train_inds: np.ndarray = np.concatenate([indices[:start], indices[stop:]])

        train_x = data.x.iloc[train_inds].reset_index(drop=True)  # type: ignore[call-overload]
        train_s = data.s.iloc[train_inds].reset_index(drop=True)  # type: ignore[call-overload]
        train_y = data.y.iloc[train_inds].reset_index(drop=True)  # type: ignore[call-overload]

        assert train_x.shape == (len(train_inds), data.x.shape[1])
        assert train_s.shape == (len(train_inds), data.s.shape[1])
        assert train_y.shape == (len(train_inds), data.y.shape[1])

        val_x = data.x.iloc[val_inds].reset_index(drop=True)  # type: ignore[call-overload]
        val_s = data.s.iloc[val_inds].reset_index(drop=True)  # type: ignore[call-overload]
        val_y = data.y.iloc[val_inds].reset_index(drop=True)  # type: ignore[call-overload]

        assert val_x.shape == (len(val_inds), data.x.shape[1])
        assert val_s.shape == (len(val_inds), data.s.shape[1])
        assert val_y.shape == (len(val_inds), data.y.shape[1])

        yield DataTuple(
            x=train_x, s=train_s, y=train_y, name=f"{data.name} - train fold {i}"
        ), DataTuple(x=val_x, s=val_s, y=val_y, name=f"{data.name} - test fold {i}")

        current = stop
def train_and_transform(
    train: DataTuple, test: TestTuple, flags: VfaeArgs
) -> Tuple[DataTuple, TestTuple]:
    """Train the model and transform the dataset.

    Args:
        train: training data; the network is trained on it and it is encoded afterwards
        test: test data; only encoded
        flags: settings; this function reads ``dataset``, ``batch_size``, ``supervised``,
            ``z1_enc_size``, ``z2_enc_size``, ``z1_dec_size`` and ``epochs`` and passes the
            whole object on to ``train_model``

    Returns:
        Tuple of Encoded Train Dataset and Test Dataset. The encodings are samples drawn
        with ``model.reparameterize``; ``s`` (and ``y`` for the train set) are carried over
        unchanged and the names are prefixed with ``"VFAE: "``.
    """
    dataset = get_dataset_obj_by_name(flags.dataset)()

    # Set up the data
    train_data = CustomDataset(train)
    train_loader = DataLoader(train_data, batch_size=flags.batch_size)

    test_data = TestDataset(test)
    test_loader = DataLoader(test_data, batch_size=flags.batch_size)

    # Build Network
    model = VFAENetwork(
        dataset,
        flags.supervised,
        train_data.xdim,
        latent_dims=50,
        z1_enc_size=flags.z1_enc_size,
        z2_enc_size=flags.z2_enc_size,
        z1_dec_size=flags.z1_dec_size,
    ).to("cpu")
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Run Network
    num_epochs = int(flags.epochs)
    for epoch in range(num_epochs):
        train_model(epoch, model, train_loader, optimizer, flags)

    def _sample_z1(x_batch, s_batch) -> List[List[float]]:
        """Encode one batch and draw a sample of the first latent variable."""
        mu, logvar = model.encode_z1(x_batch, s_batch)
        return model.reparameterize(mu, logvar).data.tolist()

    # Transform output
    post_train: List[List[float]] = []
    post_test: List[List[float]] = []
    model.eval()
    with torch.no_grad():
        for _x, _s, _ in train_loader:
            post_train.extend(_sample_z1(_x, _s))
        for _x, _s in test_loader:
            post_test.extend(_sample_z1(_x, _s))

    encoded_train = DataTuple(
        x=pd.DataFrame(post_train), s=train.s, y=train.y, name=f"VFAE: {train.name}"
    )
    encoded_test = TestTuple(x=pd.DataFrame(post_test), s=test.s, name=f"VFAE: {test.name}")
    return encoded_train, encoded_test
def transform(model: VFAENetwork, dataset: T, flags) -> T:
    """Transform the dataset.

    Every batch is passed through ``model.encode_z1`` and the returned mean is used as
    the new representation (no sampling takes place).

    Args:
        model: network providing ``encode_z1``; it is switched to eval mode
        dataset: a DataTuple or a TestTuple
        flags: settings object; only ``batch_size`` is read here

    Returns:
        An object of the same kind as ``dataset`` whose ``x`` holds the encodings; ``s``
        (and ``y`` for a DataTuple) are carried over and the name is prefixed with
        ``"VFAE: "``.

    Raises:
        TypeError: if ``dataset`` is neither a DataTuple nor a TestTuple.
    """
    data: Union[CustomDataset, TestDataset]
    # DataTuple is tested first, as in every other check of this function.
    # NOTE(review): presumably DataTuple is a subclass of TestTuple, in which case this
    # order is required - confirm.
    has_labels = isinstance(dataset, DataTuple)
    if has_labels:
        data = CustomDataset(dataset)
    elif isinstance(dataset, TestTuple):
        data = TestDataset(dataset)
    else:
        raise TypeError(f"Expected a DataTuple or TestTuple, got {type(dataset).__name__}")
    loader = DataLoader(data, batch_size=flags.batch_size, shuffle=False)

    encoded: List[List[float]] = []
    model.eval()
    with torch.no_grad():
        for sample in loader:
            # labelled batches carry a third element that is not needed for encoding
            if has_labels:
                _x, _s, _ = sample
            else:
                _x, _s = sample
            z1_mu, z1_logvar = model.encode_z1(_x, _s)
            encoded += z1_mu.data.tolist()

    if has_labels:
        return DataTuple(
            x=pd.DataFrame(encoded), s=dataset.s, y=dataset.y, name=f"VFAE: {dataset.name}"
        )
    return TestTuple(x=pd.DataFrame(encoded), s=dataset.s, name=f"VFAE: {dataset.name}")
def main():
    """Run the Agarwal model as a standalone program.

    Reads the train and test data from the npz files named in the parsed arguments,
    trains and predicts, and writes the ``"preds"`` entry of the result as hard
    predictions to the npz file named by ``args.predictions``.
    """
    args: AgarwalArgs = AgarwalArgs().parse_args()
    train = DataTuple.from_npz(Path(args.train))
    test = TestTuple.from_npz(Path(args.test))
    hard_preds = train_and_predict(train, test, args)["preds"]
    Prediction(hard=hard_preds).to_npz(Path(args.predictions))
def train_and_transform(
    train: DataTuple, test: TestTuple, flags: ZemelArgs
) -> (Tuple[DataTuple, TestTuple]):
    """Train the Zemel model and return the transformed features of the train and test sets.

    The rows are divided by the value (0 or 1) of the first column of ``s``. The
    parameters are found with ``fmin_l_bfgs_b`` on ``LFR_optim_objective`` using
    approximated gradients, starting from a uniformly random initialisation that is
    seeded with ``flags.seed``. The first ``flags.clusters`` entries of the solution are
    used as ``w``, the rest is reshaped to ``(clusters, n_features)`` prototypes.
    """
    np.random.seed(flags.seed)

    sens_col = train.s.columns[0]
    train_is_s0 = train.s[sens_col] == 0
    train_is_s1 = train.s[sens_col] == 1
    training_sensitive = train.x.loc[train_is_s0].to_numpy()
    training_nonsensitive = train.x.loc[train_is_s1].to_numpy()
    ytrain_sensitive = train.y.loc[train_is_s0].to_numpy()
    ytrain_nonsensitive = train.y.loc[train_is_s1].to_numpy()

    print_interval = 100
    verbose = False

    features_dim = train.x.shape[1]
    num_clusters = flags.clusters

    # Initialize the LFR optim objective parameters
    parameters_initialization = np.random.uniform(
        size=num_clusters + features_dim * num_clusters
    )
    # the first `clusters` parameters are confined to [0, 1]; the rest is unbounded
    bnd = [(0, 1)] * num_clusters + [(None, None)] * features_dim * num_clusters  # type: ignore[operator]
    LFR_optim_objective.steps = 0  # type: ignore[attr-defined]

    objective_args = (
        training_nonsensitive,
        training_sensitive,
        ytrain_nonsensitive[:, 0],
        ytrain_sensitive[:, 0],
        num_clusters,
        flags.Ax,
        flags.Ay,
        flags.Az,
        print_interval,
        verbose,
    )
    solution = optim.fmin_l_bfgs_b(
        LFR_optim_objective,
        x0=parameters_initialization,
        epsilon=1e-5,
        args=objective_args,
        bounds=bnd,
        approx_grad=True,
        maxfun=flags.maxfun,
        maxiter=flags.max_iter,
        disp=verbose,
    )
    learned_model = solution[0]
    w = learned_model[:num_clusters]
    prototypes = learned_model[num_clusters:].reshape((num_clusters, features_dim))

    testing_sensitive = test.x.loc[test.s[sens_col] == 0].to_numpy()
    testing_nonsensitive = test.x.loc[test.s[sens_col] == 1].to_numpy()

    train_transformed = trans(prototypes, w, training_nonsensitive, training_sensitive, train)
    test_transformed = trans(prototypes, w, testing_nonsensitive, testing_sensitive, test)

    new_train = DataTuple(x=train_transformed, s=train.s, y=train.y, name=train.name)
    new_test = TestTuple(x=test_transformed, s=test.s, name=test.name)
    return new_train, new_test
def __init__(self, data: DataTuple):
    """Store the features, sensitive attributes and labels of ``data`` on the instance.

    Everything except the labels is obtained by calling ``_get_info`` on a copy of
    ``data`` without ``y``; the labels are kept as a float32 array together with their
    column count and column names.
    """
    super().__init__()
    info = _get_info(data.remove_y())
    self.x, self.s, self.num, self.xdim, self.sdim, self.x_names, self.s_names = info

    labels = data.y
    self.y = labels.to_numpy(dtype=np.float32)
    self.ydim = labels.shape[1]
    self.y_names = labels.columns
def query_dt(datatup: DataTuple, query_str: str) -> DataTuple:
    """Query a datatuple.

    ``dataset_from_cond`` is applied with ``query_str`` as the condition to the joined
    dataframe of ``datatup`` (via ``apply_to_joined_df``).
    """
    assert isinstance(query_str, str)
    assert isinstance(datatup, DataTuple)
    return datatup.apply_to_joined_df(
        lambda joined_data_frame: dataset_from_cond(joined_data_frame, cond=query_str)
    )
def __call__(
    self, data: DataTuple, split_id: int = 0
) -> Tuple[DataTuple, DataTuple, Dict[str, float]]:
    """Split ``data`` in order: the leading rows become train, the remaining rows test.

    Args:
        data: the data to split
        split_id: ignored

    Returns:
        A tuple of the train subset, the test subset and an empty info dictionary.
    """
    del split_id  # this split does not depend on the split ID
    cut = round(self.train_percentage * len(data))

    def _head(df: pd.DataFrame) -> pd.DataFrame:
        """Keep the rows before the cut."""
        return df.iloc[:cut].reset_index(drop=True)

    def _tail(df: pd.DataFrame) -> pd.DataFrame:
        """Keep the rows from the cut onwards."""
        return df.iloc[cut:].reset_index(drop=True)

    train = data.apply_to_joined_df(_head).replace(name=f"{data.name} - Train")
    test = data.apply_to_joined_df(_tail).replace(name=f"{data.name} - Test")

    assert len(train) + len(test) == len(data)
    return train, test, {}
def transform(data: T, prototypes: np.ndarray, w: np.ndarray) -> T:
    """Transform the features of ``data`` with the given prototypes and weights.

    The rows are divided by the value (0 or 1) of the first column of ``data.s`` and
    handed to ``trans``; the result replaces ``x`` in a new tuple of the same kind as
    ``data``.
    """
    sens_col = data.s.columns[0]
    rows_s0 = data.x.loc[data.s[sens_col] == 0].to_numpy()
    rows_s1 = data.x.loc[data.s[sens_col] == 1].to_numpy()
    new_x = trans(prototypes, w, rows_s1, rows_s0, data)

    if isinstance(data, DataTuple):
        return DataTuple(x=new_x, s=data.s, y=data.y, name=data.name)
    if isinstance(data, TestTuple):
        return TestTuple(x=new_x, s=data.s, name=data.name)
    # any other type falls off the end of the function
def fit(self: _IA, train: DataTuple) -> _IA:
    """Fit the algorithm on the given data by running its script in "fit" mode.

    The training data is written to a temporary directory and the command built by
    ``self._fit_script_command`` is run with ``--mode fit`` appended.
    ``self.model_path`` is set as a side effect and handed to that command.

    Args:
        train: training data

    Returns:
        this instance
    """
    self.model_path = self.model_dir / f"model_{self.name}.joblib"
    mode_args = ["--mode", "fit"]
    with TemporaryDirectory() as tmpdir:
        train_path = Path(tmpdir) / "train.npz"
        train.to_npz(train_path)
        fit_cmd = self._fit_script_command(train_path, self.model_path)
        # the temporary directory must stay alive until the script has run
        self._call_script(fit_cmd + mode_args)
    return self
def main() -> None:
    """Run the Agarwal model as a standalone program.

    Seeds ``random`` and ``numpy`` with ``args.seed`` and then dispatches on
    ``args.mode``: ``"run"`` trains and predicts in one go, ``"fit"`` trains and stores
    the model in ``args.model``, ``"predict"`` loads the stored model and predicts.

    Raises:
        RuntimeError: if cloudpickle cannot be imported or if the mode is unknown.
    """
    args: AgarwalArgs = AgarwalArgs().parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    try:
        # Need to install cloudpickle for now.
        # See https://github.com/fairlearn/fairlearn/issues/569
        import cloudpickle
    except ImportError as err:
        raise RuntimeError("In order to use Agarwal, install fairlearn and cloudpickle.") from err

    mode = args.mode

    if mode == "run":
        assert args.train is not None
        assert args.test is not None
        assert args.predictions is not None
        train = DataTuple.from_npz(Path(args.train))
        test = TestTuple.from_npz(Path(args.test))
        hard_preds = train_and_predict(train, test, args)["preds"]
        Prediction(hard=hard_preds).to_npz(Path(args.predictions))
        return

    if mode == "fit":
        assert args.train is not None
        assert args.model is not None
        data = DataTuple.from_npz(Path(args.train))
        model = fit(data, args)
        # the model is serialised with cloudpickle first; the resulting bytes are dumped
        with working_dir(Path(args.model)):
            model_file = cloudpickle.dumps(model)
        dump(model_file, Path(args.model))
        return

    if mode == "predict":
        assert args.model is not None
        assert args.predictions is not None
        assert args.test is not None
        data = TestTuple.from_npz(Path(args.test))
        model_file = load(Path(args.model))
        with working_dir(Path(args.model)):
            model = cloudpickle.loads(model_file)
        hard_preds = predict(model, data)["preds"]
        Prediction(hard=hard_preds).to_npz(Path(args.predictions))
        return

    raise RuntimeError(f"Unknown mode: {args.mode}")
async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
    """Run Algorithm on the given data asynchronously.

    Train and test data are written to a temporary directory, the command built by
    ``self._script_command`` is awaited and the predictions are read back from the
    file that was handed to that command.

    Args:
        train: training data
        test: test data

    Returns:
        predictions
    """
    with TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        train_path = workdir / "train.npz"
        test_path = workdir / "test.npz"
        pred_path = workdir / "predictions.npz"

        train.to_npz(train_path)
        test.to_npz(test_path)

        script_cmd = self._script_command(train_path, test_path, pred_path)
        await self._call_script(script_cmd)  # wait for script to run

        # read the result while the temporary directory still exists
        predictions = Prediction.from_npz(pred_path)
        return predictions
def adjust(self, dataset: DataTuple) -> DataTuple:
    """Take a datatuple and make the labels [0,1].

    The smallest label value is mapped to 0 and the largest to 1. Both original values
    are stored in ``self.min_val`` and ``self.max_val``. The input is not modified; the
    labels are changed on a copy.

    Args:
        dataset: data whose first label column holds exactly two distinct values

    Returns:
        A new DataTuple with the same x, s and name and the relabelled y.
    """
    y_col = dataset.y.columns[0]
    assert dataset.y[y_col].nunique() == 2

    # make copy of dataset
    dataset = dataset.replace(y=dataset.y.copy())

    self.min_val = dataset.y.to_numpy().min().item()
    self.max_val = dataset.y.to_numpy().max().item()

    # Both values are replaced in a single step. Doing it in two steps breaks when the
    # largest label is 0 (e.g. labels -1/0): the freshly written zeros would be turned
    # into ones by the second replacement.
    dataset.y[y_col] = dataset.y[y_col].replace({self.min_val: 0, self.max_val: 1})

    return DataTuple(x=dataset.x, s=dataset.s, y=dataset.y, name=dataset.name)
def run(self, train: DataTuple, test: TestTuple) -> pd.DataFrame:
    """Train a linear model on ``train`` and return its predictions for ``test``.

    All random number generators (numpy, torch, Python) are seeded with 42. If
    ``self.use_s`` is set, the sensitive attributes are appended to the features of both
    sets. If ``self.fair`` is set, missing ``biased_acceptance_s0``/``_s1`` values in the
    debiasing arguments are filled with the mean label of the respective group in the
    training set and ``self.debiasing_params`` is computed from the result.

    Args:
        train: training data
        test: test data

    Returns:
        A dataframe with a single column ``"preds"``.
    """
    seed = 42
    np.random.seed(seed)  # cpu vars
    torch.manual_seed(seed)  # cpu vars
    random.seed(seed)  # Python

    if self.use_s:
        train = train.make_copy_with(x=pd.concat([train.x, train.s], axis="columns"))
        test = test.make_copy_with(x=pd.concat([test.x, test.s], axis="columns"))
    # the input width is read after the optional concatenation so that every column of
    # s is counted, not just one
    in_dim = train.x.shape[1]

    train_loader = DataLoader(
        CustomDataset(train), batch_size=self.batch_size, pin_memory=True, shuffle=True
    )
    test_loader = DataLoader(TestDataset(test), batch_size=10000, pin_memory=True)

    if self.fair:
        debiasing_args = self.debiasing_args
        if debiasing_args.biased_acceptance_s0 is None:
            biased_acceptance_s0 = float(
                train.y[train.y.columns[0]].loc[train.s[train.s.columns[0]] == 0].mean()
            )
            debiasing_args = debiasing_args._replace(biased_acceptance_s0=biased_acceptance_s0)
        if debiasing_args.biased_acceptance_s1 is None:
            biased_acceptance_s1 = float(
                train.y[train.y.columns[0]].loc[train.s[train.s.columns[0]] == 1].mean()
            )
            debiasing_args = debiasing_args._replace(biased_acceptance_s1=biased_acceptance_s1)
        if isinstance(debiasing_args, DPFlags):
            self.debiasing_params = debiasing_params_target_rate(debiasing_args)
        else:
            self.debiasing_params = debiasing_params_target_tpr(debiasing_args)

    model = nn.Linear(in_dim, 1)
    model.to(self.device)
    optimizer: Optimizer
    if self.use_sgd:
        optimizer = SGD(
            model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay
        )
    else:
        optimizer = RAdam(
            model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay
        )
    self._fit(
        model=model,
        train_data=train_loader,
        optimizer=optimizer,
        # lr_milestones=dict(milestones=[30, 60, 90, 120], gamma=0.3),
    )
    predictions = self.predict_dataset(model, test_loader)
    return pd.DataFrame(predictions.numpy(), columns=["preds"])
def scale_continuous(
    dataset: Dataset,
    datatuple: DataTuple,
    scaler: ScalerType,
    inverse: bool = False,
    fit: bool = True,
) -> Tuple[DataTuple, ScalerType]:
    """Use a scaler on just the continuous features.

    Args:
        dataset: Dataset object. Used to find the continuous features.
        datatuple: DataTuple on which to scale the continuous features.
        scaler: Scaler object providing ``fit_transform``, ``transform`` and
            ``inverse_transform``.
        inverse: Should the scaling be reversed?
        fit: If not ``inverse``, should the scaler be fit to the data? If ``True`` (the
            default, which is the previous behaviour), do a ``fit_transform`` operation,
            else just ``transform``. Pass ``False`` to apply a scaler that was fitted on
            the training data to other data.

    Returns:
        Tuple of (scaled) DataTuple, and the Scaler (which may have been fit to the data).
    """
    new_feats = datatuple.x.copy().astype('float64')
    continuous = dataset.continuous_features

    if inverse:
        apply_scaler = scaler.inverse_transform
    elif fit:
        apply_scaler = scaler.fit_transform
    else:
        apply_scaler = scaler.transform
    new_feats[continuous] = apply_scaler(new_feats[continuous])

    # NOTE(review): the returned DataTuple is built without a name - confirm that
    # dropping the name of `datatuple` is intended.
    return DataTuple(x=new_feats, s=datatuple.s, y=datatuple.y), scaler
async def run_async(self, train: DataTuple, test: TestTuple) -> Tuple[DataTuple, TestTuple]:
    """Generate fair features with the given data asynchronously.

    Both datasets are written to a temporary directory, the command built by
    ``self._script_command`` is awaited and the transformed datasets are read back from
    the output files that were handed to that command.

    Args:
        train: training data
        test: test data

    Returns:
        a tuple of the pre-processed training data and the test data
    """
    with TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)

        # input files
        train_path = workdir / "train.npz"
        test_path = workdir / "test.npz"
        train.to_npz(train_path)
        test.to_npz(test_path)

        # output files that the script is expected to write
        transformed_train_path = workdir / "transformed_train.npz"
        transformed_test_path = workdir / "transformed_test.npz"

        script_cmd = self._script_command(
            train_path, test_path, transformed_train_path, transformed_test_path
        )
        await self._call_script(script_cmd)

        new_train = DataTuple.from_npz(transformed_train_path)
        new_test = TestTuple.from_npz(transformed_test_path)

        # prefix the name of the algorithm to the dataset name
        new_train = new_train.replace(
            name=None if train.name is None else f"{self.name}: {train.name}"
        )
        new_test = new_test.replace(
            name=None if test.name is None else f"{self.name}: {test.name}"
        )
        return new_train, new_test
def encode_dataset(
    enc: nn.Module, dataloader: torch.utils.data.DataLoader, datatuple: DataTuple
) -> DataTuple:
    """Encode a dataset.

    The first element of every batch from ``dataloader`` is passed through ``enc``; the
    collected outputs become the new ``x``. ``s`` and ``y`` are taken from ``datatuple``
    and the name is prefixed with ``"Beutel: "``.
    """
    encoded_rows: List[Any] = []
    for features, _, _ in dataloader:
        batch_encoding = enc(features).data.numpy().tolist()
        encoded_rows.extend(batch_encoding)

    return DataTuple(
        x=pd.DataFrame(encoded_rows),
        s=datatuple.s,
        y=datatuple.y,
        name=f"Beutel: {datatuple.name}",
    )
def metric_per_sensitive_attribute(
    prediction: Prediction, actual: DataTuple, metric: Metric, use_sens_name: bool = True
) -> Dict[str, float]:
    """Compute a metric repeatedly on subsets of the data that share a sensitive attribute.

    Args:
        prediction: predictions for the rows of ``actual``
        actual: the data the predictions belong to; must have exactly one label column
        metric: the metric to compute; its ``apply_per_sensitive`` flag must be set
        use_sens_name: if true, result keys start with the name of the sensitive column,
            otherwise with ``"S"``

    Returns:
        A dictionary from ``"<prefix>_<value>"`` to the score of the metric on the rows
        whose sensitive attribute has that value.

    Raises:
        MetricNotApplicable: if the metric cannot be applied per sensitive attribute.
    """
    if not metric.apply_per_sensitive:
        raise MetricNotApplicable(
            f"Metric {metric.name} is not applicable per sensitive "
            f"attribute, apply to whole dataset instead"
        )

    num_rows = actual.s.shape[0]
    assert num_rows == actual.x.shape[0]
    assert num_rows == actual.y.shape[0]
    assert prediction.hard.shape[0] == actual.y.shape[0]

    s_columns: List[str] = list(actual.s.columns)
    y_columns: List[str] = list(actual.y.columns)
    assert len(y_columns) == 1

    per_sensitive_attr: Dict[str, float] = {}
    for y_col in y_columns:
        for s_col in s_columns:
            key_prefix = s_col if use_sens_name else "S"
            for unique_s in actual.s[s_col].unique():
                mask: pd.Series = actual.s[s_col] == unique_s

                # restrict every part of the data to the rows of the current group
                group_x = pd.DataFrame(
                    actual.x.loc[mask][actual.x.columns], columns=actual.x.columns
                ).reset_index(drop=True)
                group_s = pd.DataFrame(actual.s.loc[mask][s_col], columns=[s_col]).reset_index(
                    drop=True
                )
                group_y = pd.DataFrame(actual.y.loc[mask][y_col], columns=[y_col]).reset_index(
                    drop=True
                )
                subset = DataTuple(x=group_x, s=group_s, y=group_y, name=actual.name)

                # keep the kind of prediction (soft or hard) of the input
                pred_y: Prediction
                if isinstance(prediction, SoftPrediction):
                    pred_y = SoftPrediction(
                        soft=prediction.soft.loc[mask].reset_index(drop=True),
                        info=prediction.info,
                    )
                else:
                    pred_y = Prediction(
                        hard=prediction.hard.loc[mask].reset_index(drop=True),
                        info=prediction.info,
                    )

                key = key_prefix + "_" + str(unique_s)
                per_sensitive_attr[key] = metric.score(pred_y, subset)

    return per_sensitive_attr
def scale_continuous(
    dataset: Dataset,
    datatuple: DataTuple,
    scaler: ScalerType,
    inverse: bool = False,
    fit: bool = True,
) -> Tuple[DataTuple, ScalerType]:
    """Use a scaler on just the continuous features.

    Args:
        dataset: Dataset object. Used to find the continuous features.
        datatuple: DataTuple on which to scale the continuous features.
        scaler: Scaler object to scale the features. Must fit the SKLearn scaler API.
        inverse: Should the scaling be reversed?
        fit: If not `inverse`, should the scaler be fit to the data? If `True`, do
            `fit_transform` operation, else just `transform`.

    Returns:
        Tuple of (scaled) DataTuple, and the Scaler (which may have been fit to the data).

    Examples:
        >>> dataset = adult()
        >>> datatuple = dataset.load()
        >>> train, test = train_test_split(datatuple)
        >>> train, scaler = scale_continuous(dataset, train, scaler)
        >>> test, scaler = scale_continuous(dataset, test, scaler, fit=False)
    """
    # pick the scaler operation first, then apply it once to the continuous columns
    if inverse:
        operation = scaler.inverse_transform
    elif fit:
        operation = scaler.fit_transform
    else:
        operation = scaler.transform

    scaled = datatuple.x.copy().astype('float64')
    columns = dataset.continuous_features
    scaled[columns] = operation(scaled[columns])
    return DataTuple(x=scaled, s=datatuple.s, y=datatuple.y), scaler
def main() -> None:
    """LFR Model.

    Learning fair representations is a pre-processing technique that finds a
    latent representation which encodes the data well but obfuscates information
    about protected attributes [2]_.

    Dispatches on ``args.mode``: ``"run"`` trains and transforms train and test data in
    one go, ``"fit"`` fits the model, saves the transformed training data and dumps the
    model to ``args.model``, ``"transform"`` loads the model and transforms the given
    data.

    Raises:
        RuntimeError: if ``args.mode`` is none of the modes above.

    References:
        .. [2] R. Zemel, Y. Wu, K. Swersky, T. Pitassi, and C. Dwork,  "Learning
           Fair Representations." International Conference on Machine Learning,
           2013.
    Based on code from https://github.com/zjelveh/learning-fair-representations
    Which in turn, we've got from AIF360
    """
    args = ZemelArgs()
    args.parse_args()
    if args.mode == "run":
        assert args.train is not None
        assert args.new_train is not None
        assert args.test is not None
        assert args.new_test is not None
        train, test = load_data_from_flags(args)
        save_transformations(train_and_transform(train, test, args), args)
    elif args.mode == "fit":
        assert args.model is not None
        assert args.train is not None
        assert args.new_train is not None
        train = DataTuple.from_npz(Path(args.train))
        model = fit(train, args)
        # same helper as in "transform" mode, so both modes apply the model identically
        transformed_train = transform(train, model.prototypes, model.w)
        transformed_train.to_npz(Path(args.new_train))
        dump(model, Path(args.model))
    elif args.mode == "transform":
        assert args.model is not None
        assert args.test is not None
        assert args.new_test is not None
        # NOTE(review): the data to transform is loaded as a DataTuple here - confirm that
        # the file is expected to contain labels.
        test = DataTuple.from_npz(Path(args.test))
        model = load(Path(args.model))
        transformed_test = transform(test, model.prototypes, model.w)
        transformed_test.to_npz(Path(args.new_test))
    else:
        # fail loudly instead of silently producing no output files
        raise RuntimeError(f"Unknown mode: {args.mode}")
def train_and_transform(
    train: DataTuple, test: TestTuple, flags: ZemelArgs
) -> (Tuple[DataTuple, TestTuple]):
    """Train and transform.

    Fits prototypes and weights on ``train`` and applies them to both datasets. The rows
    of each dataset are divided by the value (0 or 1) of the sensitive column, whose name
    is taken from the first column of ``train.s``.
    """
    prototypes, w = fit(train, flags)
    sens_col = train.s.columns[0]

    def _by_group(dataset: TestTuple) -> Tuple[np.ndarray, np.ndarray]:
        """Return the feature rows with s == 0 and with s == 1, in that order."""
        rows_s0 = dataset.x.loc[dataset.s[sens_col] == 0].to_numpy()
        rows_s1 = dataset.x.loc[dataset.s[sens_col] == 1].to_numpy()
        return rows_s0, rows_s1

    training_sensitive, training_nonsensitive = _by_group(train)
    testing_sensitive, testing_nonsensitive = _by_group(test)

    train_transformed = trans(prototypes, w, training_nonsensitive, training_sensitive, train)
    test_transformed = trans(prototypes, w, testing_nonsensitive, testing_sensitive, test)

    new_train = DataTuple(x=train_transformed, s=train.s, y=train.y, name=train.name)
    new_test = TestTuple(x=test_transformed, s=test.s, name=test.name)
    return new_train, new_test
def bin_cont_feats(data: DataTuple) -> DataTuple:
    """Bin the continuous features.

    Consecutive feature columns are grouped by the part of their name before the first
    underscore. A group that consists of a single column with more than two distinct
    values is treated as a continuous feature: it is cut into 5 bins and replaced by
    the dummy (one-hot) columns of those bins. The result is returned as a fresh
    DataTuple.
    """

    def _prefix(column: str) -> str:
        """Return the part of a column name before the first underscore."""
        return column.split("_")[0]

    column_groups: List[List[str]] = [
        list(members) for _, members in groupby(data.x.columns, _prefix)
    ]

    binned: pd.DataFrame = data.x.copy()
    for members in column_groups:
        # groups with several columns, or a lone column with at most two distinct
        # values, are left untouched
        if len(members) != 1 or data.x[members[0]].nunique() <= 2:
            continue
        binned[members] = pd.cut(data.x[members].to_numpy()[:, 0], 5)
        binned = pd.concat([binned, pd.get_dummies(binned[members])], axis="columns")
        binned = binned.drop(members, axis="columns")

    return data.replace(x=binned)
def main() -> None:
    """Run the FWD model as a standalone program on tabular data.

    Dispatches on ``args.mode``: ``"run"`` trains and predicts in one go, ``"fit"``
    trains and dumps the model to ``args.model``, ``"predict"`` loads the model and
    writes the predictions for the given test data.

    Raises:
        RuntimeError: if ``args.mode`` is none of the modes above.
    """
    args = DroArgs().parse_args()
    if args.mode == "run":
        assert args.train is not None
        assert args.test is not None
        assert args.predictions is not None
        train, test = load_data_from_flags(args)
        train_and_predict(train, test, args).to_npz(Path(args.predictions))
    elif args.mode == "fit":
        assert args.train is not None
        assert args.model is not None
        data = DataTuple.from_npz(Path(args.train))
        model = fit(data, args)
        dump(model, Path(args.model))
    elif args.mode == "predict":
        assert args.model is not None
        assert args.predictions is not None
        assert args.test is not None
        data = TestTuple.from_npz(Path(args.test))
        model = load(Path(args.model))
        predict(model, data, args).to_npz(Path(args.predictions))
    else:
        # fail loudly instead of silently producing no output files
        raise RuntimeError(f"Unknown mode: {args.mode}")
def concat_datatuples(first_dt: DataTuple, second_dt: DataTuple) -> DataTuple:
    """Given 2 datatuples, concatenate them and shuffle.

    Both inputs must have identical x, s and y columns. The rows are shuffled with a
    fixed random state and renumbered from zero; the result takes the name of
    ``first_dt``.
    """
    assert (first_dt.x.columns == second_dt.x.columns).all()
    assert (first_dt.s.columns == second_dt.s.columns).all()
    assert (first_dt.y.columns == second_dt.y.columns).all()

    x_columns: pd.Index = first_dt.x.columns
    s_columns: pd.Index = first_dt.s.columns
    y_columns: pd.Index = first_dt.y.columns

    def _joined(datatuple: DataTuple) -> pd.DataFrame:
        """Put x, s and y side by side so that rows stay together while shuffling."""
        return pd.concat([datatuple.x, datatuple.s, datatuple.y], axis="columns")

    stacked: pd.DataFrame = pd.concat([_joined(first_dt), _joined(second_dt)], axis="index")
    shuffled = stacked.sample(frac=1.0, random_state=1).reset_index(drop=True)

    return DataTuple(
        x=shuffled[x_columns], s=shuffled[s_columns], y=shuffled[y_columns], name=first_dt.name
    )
def upsample(
    dataset: DataTuple, test: TestTuple, strategy: Literal["uniform", "preferential", "naive"]
) -> Tuple[DataTuple, TestTuple]:
    """Upsample a datatuple.

    The data is divided into groups, one per combination of a value of the first
    sensitive column and a value of the first label column. Each group gets a sampling
    factor:

    * ``"naive"``: size of the largest group divided by the size of the group.
    * otherwise: ``count(y) * count(s) / (group size * number of samples)``, rounded to 8
      decimals.

    For ``"naive"`` and ``"uniform"`` every group is resampled with replacement by its
    factor (fixed random state) and the groups are concatenated with
    ``concat_datatuples``. For ``"preferential"`` the rows of each group are ranked by
    the soft predictions of an ``LRProb`` model run on the dataset itself and the
    top-ranked rows are selected (ascending for ``s <= 0``, descending otherwise).

    Args:
        dataset: the data to upsample
        test: test data; returned as a fresh TestTuple with the same x, s and name
        strategy: which of the strategies above to use

    Returns:
        A tuple of the upsampled data and the test data.
    """
    s_col = dataset.s.columns[0]
    y_col = dataset.y.columns[0]

    s_vals: List[int] = list(map(int, dataset.s[s_col].unique()))
    y_vals: List[int] = list(map(int, dataset.y[y_col].unique()))

    # one DataTuple per (s, y) combination
    data: Dict[Tuple[int, int], DataTuple] = {}
    for s, y in itertools.product(s_vals, y_vals):
        s_y_mask = (dataset.s[s_col] == s) & (dataset.y[y_col] == y)
        data[(s, y)] = DataTuple(
            x=dataset.x.loc[s_y_mask].reset_index(drop=True),
            s=dataset.s.loc[s_y_mask].reset_index(drop=True),
            y=dataset.y.loc[s_y_mask].reset_index(drop=True),
            name=dataset.name,
        )

    # sampling factor per group; values that do not depend on the group are computed once
    percentages: Dict[Tuple[int, int], float] = {}
    if strategy == "naive":
        largest_group = max((val.x.shape[0] for val in data.values()), default=0)
        for key, val in data.items():
            percentages[key] = largest_group / val.x.shape[0]
    else:
        num_samples = dataset.y.count().to_numpy()[0]
        for key, val in data.items():
            s_val, y_val = key
            y_eq_y = dataset.y.loc[dataset.y[y_col] == y_val].count().to_numpy()[0]
            s_eq_s = dataset.s.loc[dataset.s[s_col] == s_val].count().to_numpy()[0]
            num_batch = val.y.count().to_numpy()[0]
            percentages[key] = round((y_eq_y * s_eq_s / (num_batch * num_samples)), 8)

    x_columns: pd.Index = dataset.x.columns
    s_columns: pd.Index = dataset.s.columns
    y_columns: pd.Index = dataset.y.columns

    upsampled_datatuple: Optional[DataTuple] = None

    if strategy == "preferential":
        # The random resampling below is not needed for this strategy: its result would
        # be replaced by the ranked selection anyway.
        ranker = LRProb()
        rank: SoftPrediction = ranker.run(dataset, dataset)

        selected: List[pd.DataFrame] = []

        all_data = pd.concat([dataset.x, dataset.s, dataset.y], axis="columns")
        all_data = pd.concat(
            [all_data, pd.DataFrame(rank.soft, columns=["preds"])], axis="columns"
        )

        for key in data:
            s_val, y_val = key
            s_y_mask = (dataset.s[s_col] == s_val) & (dataset.y[y_col] == y_val)

            ascending = s_val <= 0

            # NOTE(review): a factor above 1 adds the whole group exactly once, so factors
            # above 2 are effectively capped - confirm that this is intended.
            if percentages[key] > 1.0:
                selected.append(all_data.loc[s_y_mask])
                percentages[key] -= 1.0

            weight = all_data.loc[s_y_mask][y_col].count()
            selected.append(
                all_data.loc[s_y_mask]
                .sort_values(by=["preds"], ascending=ascending)
                .iloc[: int(percentages[key] * weight)]
            )

        upsampled_dataframes: pd.DataFrame
        for i, df in enumerate(selected):
            if i == 0:
                upsampled_dataframes = df.drop(["preds"], axis="columns")
            else:
                upsampled_dataframes = pd.concat(
                    [upsampled_dataframes, df.drop(["preds"], axis="columns")], axis="index"
                ).reset_index(drop=True)
        upsampled_datatuple = DataTuple(
            x=upsampled_dataframes[x_columns],
            s=upsampled_dataframes[s_columns],
            y=upsampled_dataframes[y_columns],
            name=dataset.name,
        )
    else:
        for key, val in data.items():
            all_data = pd.concat([val.x, val.s, val.y], axis="columns")
            all_data = all_data.sample(
                frac=percentages[key], random_state=1, replace=True
            ).reset_index(drop=True)
            resampled = DataTuple(
                x=all_data[x_columns],
                s=all_data[s_columns],
                y=all_data[y_columns],
                name=dataset.name,
            )
            if upsampled_datatuple is None:
                upsampled_datatuple = resampled
            else:
                upsampled_datatuple = concat_datatuples(upsampled_datatuple, resampled)

    assert upsampled_datatuple is not None
    return upsampled_datatuple, TestTuple(x=test.x, s=test.s, name=test.name)