def __call__(
    self, data: DataTuple, split_id: int = 0
) -> Tuple[DataTuple, DataTuple, Dict[str, float]]:
    """Split ``data`` proportionally into a train and a test DataTuple.

    Args:
        data: data tuple to split.
        split_id: value used (together with the splitter's own seed) to derive
            the random seed for this particular split.

    Returns:
        Tuple of train split, test split and a dict with the seed that was used.
    """
    seed = self._get_seed(split_id)
    train_idx, test_idx = generate_proportional_split_indexes(
        data, train_percentage=self.train_percentage, random_seed=seed
    )

    def _take(indexes, tag: str) -> DataTuple:
        # select the given rows from every frame and renumber them from 0
        return DataTuple(
            x=data.x.iloc[indexes].reset_index(drop=True),
            s=data.s.iloc[indexes].reset_index(drop=True),
            y=data.y.iloc[indexes].reset_index(drop=True),
            name=f"{data.name} - {tag}",
        )

    train = _take(train_idx, "Train")
    test = _take(test_idx, "Test")

    # assert that no data points got lost anywhere
    assert len(data) == len(train) + len(test)

    split_info: Dict[str, float] = {"seed": seed}
    return train, test, split_info
def domain_split(datatup: DataTuple, tr_cond: str, te_cond: str,
                 seed: int = 888) -> Tuple[DataTuple, DataTuple]:
    """Splits a datatuple based on a condition.

    Args:
        datatup: DataTuple
        tr_cond: condition for the training set
        te_cond: condition for the test set
        seed: random seed used to sample which train rows are moved to the test set

    Returns:
        Tuple of DataTuple split into train and test. The test is all those that meet
        the test condition plus the same percentage again of the train set.
    """
    dataset = datatup.x

    train_dataset = dataset_from_cond(dataset, tr_cond)
    test_dataset = dataset_from_cond(dataset, te_cond)
    # the two conditions must partition the data exactly (no overlap, no leftovers)
    assert train_dataset.shape[0] + test_dataset.shape[0] == dataset.shape[0]

    test_pct = test_dataset.shape[0] / dataset.shape[0]
    train_pct = 1 - test_pct

    # keep this fraction of the train set so that, after moving the rest to the
    # test set, the test set has grown by the same percentage again (see docstring)
    train_train_pcnt = (1 - (test_pct * 2)) / train_pct

    train_train = train_dataset.sample(frac=train_train_pcnt, random_state=seed)
    # everything not sampled for training gets moved to the test set
    test_train = train_dataset.drop(train_train.index, axis="index")  # type: ignore[arg-type]

    test = pd.concat([test_train, test_dataset], axis="index")

    # rebuild the x/s/y frames from the selected row indices
    train_x = datatup.x.iloc[train_train.index].reset_index(drop=True)  # type: ignore[call-overload]
    train_s = datatup.s.iloc[train_train.index].reset_index(drop=True)  # type: ignore[call-overload]
    train_y = datatup.y.iloc[train_train.index].reset_index(drop=True)  # type: ignore[call-overload]

    train_datatup = DataTuple(x=train_x, s=train_s, y=train_y, name=datatup.name)

    test_x = datatup.x.iloc[test.index].reset_index(drop=True)  # type: ignore[call-overload]
    test_s = datatup.s.iloc[test.index].reset_index(drop=True)  # type: ignore[call-overload]
    test_y = datatup.y.iloc[test.index].reset_index(drop=True)  # type: ignore[call-overload]

    test_datatup = DataTuple(x=test_x, s=test_s, y=test_y, name=datatup.name)

    return train_datatup, test_datatup
def fold_data(data: DataTuple, folds: int) -> Iterator[Tuple[DataTuple, DataTuple]]:
    """So much love to sklearn for making their source code open.

    Yield ``folds`` pairs of (train, validation) DataTuples in k-fold style:
    each fold's validation set is a contiguous chunk of the rows and its train
    set is everything else.

    Args:
        data: data tuple to split into folds.
        folds: number of folds to produce.

    Yields:
        Tuple of train data and validation data for each fold.
    """
    indices: np.ndarray = np.arange(data.x.shape[0])

    fold_sizes: np.ndarray = np.full(folds, data.x.shape[0] // folds, dtype=np.int32)
    # spread the remainder over the first folds so sizes differ by at most one
    fold_sizes[:data.x.shape[0] % folds] += np.int32(1)

    current = 0
    for i, fold_size in enumerate(fold_sizes):
        start, stop = current, int(current + fold_size)
        val_inds: np.ndarray = indices[start:stop]
        # the validation indices form a contiguous slice, so the train indices
        # are simply everything before and after that slice -- O(n), unlike the
        # previous per-element membership test which was O(n^2)
        train_inds = np.concatenate([indices[:start], indices[stop:]])

        train_x = data.x.iloc[train_inds].reset_index(drop=True)  # type: ignore[call-overload]
        train_s = data.s.iloc[train_inds].reset_index(drop=True)  # type: ignore[call-overload]
        train_y = data.y.iloc[train_inds].reset_index(drop=True)  # type: ignore[call-overload]
        assert train_x.shape == (len(train_inds), data.x.shape[1])
        assert train_s.shape == (len(train_inds), data.s.shape[1])
        assert train_y.shape == (len(train_inds), data.y.shape[1])

        val_x = data.x.iloc[val_inds].reset_index(drop=True)  # type: ignore[call-overload]
        val_s = data.s.iloc[val_inds].reset_index(drop=True)  # type: ignore[call-overload]
        val_y = data.y.iloc[val_inds].reset_index(drop=True)  # type: ignore[call-overload]
        assert val_x.shape == (len(val_inds), data.x.shape[1])
        assert val_s.shape == (len(val_inds), data.s.shape[1])
        assert val_y.shape == (len(val_inds), data.y.shape[1])

        yield DataTuple(
            x=train_x, s=train_s, y=train_y, name=f"{data.name} - train fold {i}"
        ), DataTuple(x=val_x, s=val_s, y=val_y, name=f"{data.name} - test fold {i}")
        current = stop
def train_and_transform(train: DataTuple, test: TestTuple,
                        flags: VfaeArgs) -> Tuple[DataTuple, TestTuple]:
    """Train the model and transform the dataset.

    Args:
        train: training data; used both to fit the VFAE and to produce encoded features.
        test: test data to encode with the trained model.
        flags: settings (dataset name, batch size, epochs, layer sizes, ...).

    Returns:
        Tuple of Encoded Train Dataset and Test Dataset.
    """
    dataset = get_dataset_obj_by_name(flags.dataset)()

    # Set up the data
    train_data = CustomDataset(train)
    train_loader = DataLoader(train_data, batch_size=flags.batch_size)

    test_data = TestDataset(test)
    test_loader = DataLoader(test_data, batch_size=flags.batch_size)

    # Build Network
    model = VFAENetwork(
        dataset,
        flags.supervised,
        train_data.xdim,
        latent_dims=50,
        z1_enc_size=flags.z1_enc_size,
        z2_enc_size=flags.z2_enc_size,
        z1_dec_size=flags.z1_dec_size,
    ).to("cpu")
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Run Network
    for epoch in range(int(flags.epochs)):
        train_model(epoch, model, train_loader, optimizer, flags)

    # Transform output: collect the sampled z1 embedding for every batch
    post_train: List[List[float]] = []
    post_test: List[List[float]] = []
    model.eval()
    with torch.no_grad():
        for _x, _s, _ in train_loader:
            z1_mu, z1_logvar = model.encode_z1(_x, _s)
            # NOTE(review): unlike the sibling ``transform`` function (which uses
            # z1_mu directly), this samples via the reparameterization trick --
            # presumably intentional, but confirm the asymmetry is wanted.
            z1 = model.reparameterize(z1_mu, z1_logvar)
            post_train += z1.data.tolist()
        for _x, _s in test_loader:
            z1_mu, z1_logvar = model.encode_z1(_x, _s)
            z1 = model.reparameterize(z1_mu, z1_logvar)
            post_test += z1.data.tolist()

    return (
        DataTuple(x=pd.DataFrame(post_train), s=train.s, y=train.y, name=f"VFAE: {train.name}"),
        TestTuple(x=pd.DataFrame(post_test), s=test.s, name=f"VFAE: {test.name}"),
    )
def transform(model: VFAENetwork, dataset: T, flags) -> T:
    """Transform the dataset.

    Encode every sample with the (already trained) VFAE model, using the mean
    of the z1 distribution as the deterministic embedding.

    Args:
        model: trained VFAE network.
        dataset: either a DataTuple or a TestTuple to encode.
        flags: settings; only ``batch_size`` is read here.

    Returns:
        The encoded data, wrapped in the same tuple type as the input.

    Raises:
        TypeError: if ``dataset`` is neither a DataTuple nor a TestTuple.
    """
    data: Union[CustomDataset, TestDataset]
    if isinstance(dataset, DataTuple):
        data = CustomDataset(dataset)
    elif isinstance(dataset, TestTuple):
        data = TestDataset(dataset)
    else:
        # previously this fell through with `data`/`loader` unbound and the
        # function implicitly returned None; fail loudly instead
        raise TypeError(f"Unsupported dataset type: {type(dataset).__name__}")
    loader = DataLoader(data, batch_size=flags.batch_size, shuffle=False)

    post_train: List[List[float]] = []
    model.eval()
    with torch.no_grad():
        for sample in loader:
            # DataTuple batches carry (x, s, y); TestTuple batches only (x, s)
            if isinstance(dataset, DataTuple):
                _x, _s, _ = sample
            else:
                _x, _s = sample
            z1_mu, z1_logvar = model.encode_z1(_x, _s)
            # z1 = model.reparameterize(z1_mu, z1_logvar)
            # use the mean rather than a sample so the embedding is deterministic
            post_train += z1_mu.data.tolist()

    if isinstance(dataset, DataTuple):
        return DataTuple(x=pd.DataFrame(post_train), s=dataset.s, y=dataset.y,
                         name=f"VFAE: {dataset.name}")
    return TestTuple(x=pd.DataFrame(post_train), s=dataset.s, name=f"VFAE: {dataset.name}")
def train_and_transform(
    train: DataTuple, test: TestTuple, flags: ZemelArgs
) -> (Tuple[DataTuple, TestTuple]):
    """Train the Zemel model and return the transformed features of the train and test sets.

    Args:
        train: training data; ``s`` is assumed binary with values 0 and 1.
        test: test data to transform with the learned prototypes.
        flags: LFR settings (number of clusters, loss weights Ax/Ay/Az,
            optimizer limits, random seed).

    Returns:
        Tuple of transformed train data (DataTuple) and test data (TestTuple).
    """
    np.random.seed(flags.seed)
    sens_col = train.s.columns[0]
    # split features and labels by sensitive-attribute value (0 = "sensitive" group)
    training_sensitive = train.x.loc[train.s[sens_col] == 0].to_numpy()
    training_nonsensitive = train.x.loc[train.s[sens_col] == 1].to_numpy()
    ytrain_sensitive = train.y.loc[train.s[sens_col] == 0].to_numpy()
    ytrain_nonsensitive = train.y.loc[train.s[sens_col] == 1].to_numpy()

    print_interval = 100
    verbose = False

    num_train_samples, features_dim = train.x.shape

    # Initialize the LFR optim objective parameters.
    # Parameter vector layout: the first `clusters` entries are the prototype
    # weights w; the rest is the flattened (clusters x features_dim) prototype matrix.
    parameters_initialization = np.random.uniform(
        size=flags.clusters + features_dim * flags.clusters
    )
    # weights are constrained to [0, 1]; prototype coordinates are unconstrained
    bnd = [(0, 1)] * flags.clusters + [(None, None)] * features_dim * flags.clusters  # type: ignore[operator]
    # reset the step counter that the objective function uses for progress printing
    LFR_optim_objective.steps = 0  # type: ignore[attr-defined]

    learned_model = optim.fmin_l_bfgs_b(
        LFR_optim_objective,
        x0=parameters_initialization,
        epsilon=1e-5,
        args=(
            training_nonsensitive,
            training_sensitive,
            ytrain_nonsensitive[:, 0],
            ytrain_sensitive[:, 0],
            flags.clusters,
            flags.Ax,
            flags.Ay,
            flags.Az,
            print_interval,
            verbose,
        ),
        bounds=bnd,
        approx_grad=True,
        maxfun=flags.maxfun,
        maxiter=flags.max_iter,
        disp=verbose,
    )[0]
    # unpack the flat parameter vector back into weights and prototypes
    w = learned_model[: flags.clusters]
    prototypes = learned_model[flags.clusters :].reshape((flags.clusters, features_dim))

    testing_sensitive = test.x.loc[test.s[sens_col] == 0].to_numpy()
    testing_nonsensitive = test.x.loc[test.s[sens_col] == 1].to_numpy()

    train_transformed = trans(prototypes, w, training_nonsensitive, training_sensitive, train)
    test_transformed = trans(prototypes, w, testing_nonsensitive, testing_sensitive, test)

    return (
        DataTuple(x=train_transformed, s=train.s, y=train.y, name=train.name),
        TestTuple(x=test_transformed, s=test.s, name=test.name),
    )
def transform(data: T, prototypes: np.ndarray, w: np.ndarray) -> T:
    """Transform the data with the learned LFR prototypes and weights.

    Args:
        data: DataTuple or TestTuple to transform; ``s`` is assumed binary (0/1).
        prototypes: learned prototype matrix.
        w: learned prototype weights.

    Returns:
        The transformed data, wrapped in the same tuple type as the input.

    Raises:
        TypeError: if ``data`` is neither a DataTuple nor a TestTuple.
    """
    sens_col = data.s.columns[0]
    # split the features by sensitive-attribute value (0 = sensitive group)
    data_sens = data.x.loc[data.s[sens_col] == 0].to_numpy()
    data_nons = data.x.loc[data.s[sens_col] == 1].to_numpy()
    transformed = trans(prototypes, w, data_nons, data_sens, data)
    if isinstance(data, DataTuple):
        return DataTuple(x=transformed, s=data.s, y=data.y, name=data.name)
    if isinstance(data, TestTuple):
        return TestTuple(x=transformed, s=data.s, name=data.name)
    # previously the function silently returned None here; fail loudly instead
    raise TypeError(f"Unsupported data type: {type(data).__name__}")
def scale_continuous(dataset: Dataset,
                     datatuple: DataTuple,
                     scaler: ScalerType,
                     inverse: bool = False,
                     fit: bool = True) -> Tuple[DataTuple, ScalerType]:
    """Use a scaler on just the continuous features.

    Args:
        dataset: Dataset object; provides the list of continuous features.
        datatuple: DataTuple whose continuous features are scaled.
        scaler: scaler object following the sklearn scaler API.
        inverse: if True, undo the scaling (``inverse_transform``) instead of applying it.
        fit: if True (and not ``inverse``), fit the scaler to the data before
            transforming; if False, apply the scaler as-is (``transform``).
            Defaults to True, which preserves the previous behavior.

    Returns:
        Tuple of the (scaled) DataTuple and the scaler (which may have been fit to the data).
    """
    new_feats = datatuple.x.copy().astype('float64')
    if inverse:
        new_feats[dataset.continuous_features] = scaler.inverse_transform(
            new_feats[dataset.continuous_features])
    elif fit:
        new_feats[dataset.continuous_features] = scaler.fit_transform(
            new_feats[dataset.continuous_features])
    else:
        # e.g. scaling a test set with a scaler already fit on the train set
        new_feats[dataset.continuous_features] = scaler.transform(
            new_feats[dataset.continuous_features])
    return DataTuple(x=new_feats, s=datatuple.s, y=datatuple.y), scaler
def encode_dataset(enc: nn.Module,
                   dataloader: torch.utils.data.DataLoader,
                   datatuple: DataTuple) -> DataTuple:
    """Run every batch through the encoder and wrap the result in a DataTuple.

    Args:
        enc: encoder network applied to each batch of embeddings.
        dataloader: yields (embedding, _, _) batches.
        datatuple: source data; supplies the s/y frames and the name.

    Returns:
        DataTuple whose x is the encoded representation.
    """
    encoded_rows: List[Any] = [
        row
        for embedding, _, _ in dataloader
        for row in enc(embedding).data.numpy().tolist()
    ]
    return DataTuple(
        x=pd.DataFrame(encoded_rows),
        s=datatuple.s,
        y=datatuple.y,
        name=f"Beutel: {datatuple.name}",
    )
def main() -> None:
    """LFR Model.

    Learning fair representations is a pre-processing technique that finds a
    latent representation which encodes the data well but obfuscates information
    about protected attributes [2]_.

    References:
        .. [2] R. Zemel, Y. Wu, K. Swersky, T. Pitassi, and C. Dwork, "Learning
           Fair Representations." International Conference on Machine Learning,
           2013.

    Based on code from https://github.com/zjelveh/learning-fair-representations
    Which in turn, we've got from AIF360
    """
    args = ZemelArgs()
    args.parse_args()
    if args.mode == "run":
        # train on the train set and transform both train and test in one step
        assert args.train is not None
        assert args.new_train is not None
        assert args.test is not None
        assert args.new_test is not None
        train, test = load_data_from_flags(args)
        save_transformations(train_and_transform(train, test, args), args)
    elif args.mode == "fit":
        # fit the model, save it to disk, and save the transformed train set
        assert args.model is not None
        assert args.train is not None
        assert args.new_train is not None
        train = DataTuple.from_npz(Path(args.train))
        model = fit(train, args)
        sens_col = train.s.columns[0]
        # s == 0 is treated as the sensitive group, s == 1 as the non-sensitive one
        training_sensitive = train.x.loc[train.s[sens_col] == 0].to_numpy()
        training_nonsensitive = train.x.loc[train.s[sens_col] == 1].to_numpy()
        train_transformed = trans(model.prototypes, model.w, training_nonsensitive,
                                  training_sensitive, train)
        data = DataTuple(x=train_transformed, s=train.s, y=train.y, name=train.name)
        data.to_npz(Path(args.new_train))
        dump(model, Path(args.model))
    elif args.mode == "transform":
        # load a previously fitted model and transform the test set with it
        assert args.model is not None
        assert args.test is not None
        assert args.new_test is not None
        test = DataTuple.from_npz(Path(args.test))
        model = load(Path(args.model))
        transformed_test = transform(test, model.prototypes, model.w)
        transformed_test.to_npz(Path(args.new_test))
def metric_per_sensitive_attribute(
        prediction: Prediction, actual: DataTuple, metric: Metric,
        use_sens_name: bool = True) -> Dict[str, float]:
    """Compute a metric repeatedly on subsets of the data that share a sensitive attribute.

    Args:
        prediction: predictions to evaluate.
        actual: ground truth; supplies features, sensitive attributes and labels.
        metric: metric to compute on every subset.
        use_sens_name: if True, key the results by the sensitive column's actual
            name; otherwise use the generic prefix "S".

    Returns:
        Dict mapping "<sens column>_<sens value>" to the metric score on the
        subset of rows with that sensitive value.

    Raises:
        MetricNotApplicable: if the metric cannot be applied per sensitive attribute.
    """
    if not metric.apply_per_sensitive:
        raise MetricNotApplicable(
            f"Metric {metric.name} is not applicable per sensitive "
            f"attribute, apply to whole dataset instead")

    # all frames (and the predictions) must describe the same rows
    assert actual.s.shape[0] == actual.x.shape[0]
    assert actual.s.shape[0] == actual.y.shape[0]
    assert prediction.hard.shape[0] == actual.y.shape[0]

    per_sensitive_attr: Dict[str, float] = {}

    s_columns: List[str] = list(actual.s.columns)
    y_columns: List[str] = list(actual.y.columns)
    assert len(y_columns) == 1

    for y_col in y_columns:
        for s_col in s_columns:
            for unique_s in actual.s[s_col].unique():
                # restrict everything to the rows with this sensitive value
                mask: pd.Series = actual.s[s_col] == unique_s
                subset = DataTuple(
                    x=pd.DataFrame(
                        actual.x.loc[mask][actual.x.columns],
                        columns=actual.x.columns).reset_index(drop=True),
                    s=pd.DataFrame(actual.s.loc[mask][s_col],
                                   columns=[s_col]).reset_index(drop=True),
                    y=pd.DataFrame(actual.y.loc[mask][y_col],
                                   columns=[y_col]).reset_index(drop=True),
                    name=actual.name,
                )
                # slice the predictions with the same mask, preserving the
                # prediction subtype (soft scores vs hard labels)
                pred_y: Prediction
                if isinstance(prediction, SoftPrediction):
                    pred_y = SoftPrediction(
                        soft=prediction.soft.loc[mask].reset_index(drop=True),
                        info=prediction.info)
                else:
                    pred_y = Prediction(
                        hard=prediction.hard.loc[mask].reset_index(drop=True),
                        info=prediction.info)
                key = (s_col if use_sens_name else "S") + "_" + str(unique_s)
                per_sensitive_attr[key] = metric.score(pred_y, subset)

    return per_sensitive_attr
def scale_continuous(
    dataset: Dataset,
    datatuple: DataTuple,
    scaler: ScalerType,
    inverse: bool = False,
    fit: bool = True,
) -> Tuple[DataTuple, ScalerType]:
    """Use a scaler on just the continuous features.

    Args:
        dataset: Dataset object. Used to find the continuous features.
        datatuple: DataTuple on which to scale the continuous features.
        scaler: Scaler object to scale the features. Must fit the SKLearn scaler API.
        inverse: Should the scaling be reversed?
        fit: If not `inverse`, should the scaler be fit to the data? If `True`, do
            `fit_transform` operation, else just `transform`.

    Returns:
        Tuple of (scaled) DataTuple, and the Scaler (which may have been fit to the data).

    Examples:
        >>> dataset = adult()
        >>> datatuple = dataset.load()
        >>> train, test = train_test_split(datatuple)
        >>> train, scaler = scale_continuous(dataset, train, scaler)
        >>> test, scaler = scale_continuous(dataset, test, scaler, fit=False)
    """
    cont_cols = dataset.continuous_features
    scaled_x = datatuple.x.copy().astype('float64')

    # pick the scaler operation once, then apply it to the continuous columns
    if inverse:
        apply_scaler = scaler.inverse_transform
    elif fit:
        apply_scaler = scaler.fit_transform
    else:
        apply_scaler = scaler.transform
    scaled_x[cont_cols] = apply_scaler(scaled_x[cont_cols])

    return DataTuple(x=scaled_x, s=datatuple.s, y=datatuple.y), scaler
def adjust(self, dataset: DataTuple) -> DataTuple:
    """Take a datatuple and make the labels [0,1].

    Remembers the original label values in ``self.min_val`` / ``self.max_val``
    so the mapping can be undone later.

    Args:
        dataset: data whose binary labels should be mapped to {0, 1}.

    Returns:
        DataTuple with labels remapped to 0 (original minimum) and 1 (original maximum).
    """
    y_col = dataset.y.columns[0]
    assert dataset.y[y_col].nunique() == 2

    # make copy of dataset
    dataset = dataset.replace(y=dataset.y.copy())

    self.min_val = dataset.y.to_numpy().min().item()
    self.max_val = dataset.y.to_numpy().max().item()

    y_col = dataset.y.columns[0]

    # map both values in a single call: two sequential `replace` calls would be
    # wrong whenever max_val == 0 (e.g. labels {-1, 0}) because the first call
    # maps min_val to 0 and the second would then send *all* rows to 1
    dataset.y[y_col] = dataset.y[y_col].replace({self.min_val: 0, self.max_val: 1})

    return DataTuple(x=dataset.x, s=dataset.s, y=dataset.y, name=dataset.name)
def train_and_transform(train: DataTuple, test: TestTuple,
                        flags: ZemelArgs) -> (Tuple[DataTuple, TestTuple]):
    """Train and transform."""
    prototypes, w = fit(train, flags)

    sens_col = train.s.columns[0]

    def _by_group(tup):
        # split the features by sensitive-attribute value (0 = sensitive group)
        sens_rows = tup.x.loc[tup.s[sens_col] == 0].to_numpy()
        nons_rows = tup.x.loc[tup.s[sens_col] == 1].to_numpy()
        return nons_rows, sens_rows

    train_nons, train_sens = _by_group(train)
    test_nons, test_sens = _by_group(test)

    train_transformed = trans(prototypes, w, train_nons, train_sens, train)
    test_transformed = trans(prototypes, w, test_nons, test_sens, test)

    return (
        DataTuple(x=train_transformed, s=train.s, y=train.y, name=train.name),
        TestTuple(x=test_transformed, s=test.s, name=test.name),
    )
def concat_datatuples(first_dt: DataTuple, second_dt: DataTuple) -> DataTuple:
    """Given 2 datatuples, concatenate them and shuffle."""
    # both tuples must have identical column structure in every frame
    assert (first_dt.x.columns == second_dt.x.columns).all()
    assert (first_dt.s.columns == second_dt.s.columns).all()
    assert (first_dt.y.columns == second_dt.y.columns).all()

    # remember the column names so the combined frame can be split up again
    x_cols: pd.Index = first_dt.x.columns
    s_cols: pd.Index = first_dt.s.columns
    y_cols: pd.Index = first_dt.y.columns

    stacked: pd.DataFrame = pd.concat(
        [
            pd.concat([first_dt.x, first_dt.s, first_dt.y], axis="columns"),
            pd.concat([second_dt.x, second_dt.s, second_dt.y], axis="columns"),
        ],
        axis="index",
    )

    # shuffle with a fixed seed so the result is reproducible
    stacked = stacked.sample(frac=1.0, random_state=1).reset_index(drop=True)

    return DataTuple(x=stacked[x_cols],
                     s=stacked[s_cols],
                     y=stacked[y_cols],
                     name=first_dt.name)
def _backend_load(
    self, dataframe: pd.DataFrame, *, labels_as_features: bool, ordered: bool
) -> DataTuple:
    """Split a raw dataframe into a DataTuple according to the feature split.

    Args:
        dataframe: the raw, already loaded data.
        labels_as_features: if True, the s and y columns are also included in x.
        ordered: if True, use ``self.ordered_features`` instead of ``self.feature_split``.

    Returns:
        DataTuple with x, s and y extracted from the dataframe.
    """
    # +++ BELOW HERE IS A COPY OF DATASET LOAD +++
    assert isinstance(dataframe, pd.DataFrame)
    feature_split = self.feature_split if not ordered else self.ordered_features
    if labels_as_features:
        feature_split_x = feature_split["x"] + feature_split["s"] + feature_split["y"]
    else:
        feature_split_x = feature_split["x"]

    # =========================================================================================
    # Check whether we have to generate some complementary columns for binary features.
    # This happens when we have for example several races: race-asian-pac-islander etc, but we
    # want to have a an attribute called "race_other" that summarizes them all. Now the problem
    # is that this cannot be done before this point, because only here have we actually loaded
    # the data. So, we have to do it here, with all the information we can piece together.
    disc_feature_groups = self._discrete_feature_groups
    if disc_feature_groups is not None:
        for group in disc_feature_groups.values():
            if len(group) == 1:
                continue
            for feature in group:
                if feature in dataframe.columns:
                    continue  # nothing to do
                missing_feature = feature

                existing_features = [other for other in group if other in dataframe.columns]
                assert len(existing_features) == len(group) - 1, "at most 1 feature missing"
                # the dummy feature is the inverse of the existing feature
                or_combination = dataframe[existing_features[0]] == 1
                for other in existing_features[1:]:
                    or_combination |= dataframe[other] == 1
                inverse: pd.Series = 1 - or_combination
                dataframe = pd.concat(
                    [dataframe, inverse.to_frame(name=missing_feature)], axis="columns"
                )
    # =========================================================================================

    x_data = dataframe[feature_split_x]
    s_data = dataframe[feature_split["s"]]
    y_data = dataframe[feature_split["y"]]

    if self._map_to_binary:
        s_data = (s_data + 1) // 2  # map from {-1, 1} to {0, 1}
        y_data = (y_data + 1) // 2  # map from {-1, 1} to {0, 1}

    if self._invert_s:
        assert s_data.nunique().values[0] == 2, "s must be binary"
        s_data = 1 - s_data

    # the following operations remove rows if a label group is not properly one-hot encoded
    s_data, s_mask = self._maybe_combine_labels(s_data, label_type="s")
    if s_mask is not None:
        x_data = x_data.loc[s_mask].reset_index(drop=True)
        s_data = s_data.loc[s_mask].reset_index(drop=True)
        y_data = y_data.loc[s_mask].reset_index(drop=True)
    y_data, y_mask = self._maybe_combine_labels(y_data, label_type="y")
    if y_mask is not None:
        x_data = x_data.loc[y_mask].reset_index(drop=True)
        s_data = s_data.loc[y_mask].reset_index(drop=True)
        y_data = y_data.loc[y_mask].reset_index(drop=True)

    return DataTuple(x=x_data, s=s_data, y=y_data, name=self.name)
def _calders_algorithm(
        dataset: DataTuple, test: TestTuple, good_class: int,
        disadvantaged_group: int) -> Tuple[DataTuple, TestTuple]:
    """Massage the training labels so both groups get the good class at the same rate.

    Rank the swap candidates with a logistic-regression scorer and flip the
    labels of the borderline samples. The test set is returned unchanged.

    Args:
        dataset: training data with binary ``s`` and binary ``y``.
        test: test data; passed through untouched.
        good_class: the y-value that is considered the desirable outcome.
        disadvantaged_group: the s-value of the disadvantaged group.

    Returns:
        Tuple of the relabeled training data and the unchanged test data.
    """
    s_col = dataset.s.columns[0]
    y_col = dataset.y.columns[0]

    s_vals: List[int] = list(map(int, dataset.s[s_col].unique()))
    y_vals: List[int] = list(map(int, dataset.y[y_col].unique()))
    assert len(s_vals) == 2
    assert len(y_vals) == 2
    s_0, s_1 = s_vals
    y_0, y_1 = y_vals
    # the complements of the given good class and disadvantaged group
    bad_class = y_0 if good_class == y_1 else y_1
    advantaged_group = s_0 if disadvantaged_group == s_1 else s_1

    # partition the data into the four (s, y) "quadrants"
    groups = ((s_0, y_0), (s_0, y_1), (s_1, y_0), (s_1, y_1))
    data: Dict[Tuple[int, int], DataTuple] = {}
    for s, y in groups:
        s_y_mask = (dataset.s[s_col] == s) & (dataset.y[y_col] == y)
        data[(s, y)] = DataTuple(
            x=dataset.x.loc[s_y_mask].reset_index(drop=True),
            s=dataset.s.loc[s_y_mask].reset_index(drop=True),
            y=dataset.y.loc[s_y_mask].reset_index(drop=True),
            name=dataset.name,
        )

    # swap candidates: disadvantaged rows with the bad label and advantaged
    # rows with the good label
    dis_group = (disadvantaged_group, bad_class)
    adv_group = (advantaged_group, good_class)
    massaging_candidates = concat_dt([data[dis_group], data[adv_group]])

    # rank the candidates by a logistic-regression probability score
    ranker = LRProb()
    rank: SoftPrediction = ranker.run(dataset, massaging_candidates)

    dis_group_len = len(data[dis_group])
    adv_group_len = len(data[adv_group])
    # `rank.soft` is ordered like `massaging_candidates`: disadvantaged rows
    # first, then the advantaged rows
    dis_group_rank = rank.soft.iloc[:dis_group_len]
    adv_group_rank = rank.soft.iloc[dis_group_len:].reset_index(drop=True)
    assert len(adv_group_rank) == adv_group_len

    # sort the ranking: highest scores first for the disadvantaged group and
    # lowest scores first for the advantaged group, so the rows closest to the
    # decision boundary come first in both
    dis_group_rank.sort_values(ascending=False, inplace=True)
    adv_group_rank.sort_values(inplace=True)

    # use the rank to sort the data
    for group, ranking in [(dis_group, dis_group_rank), (adv_group, adv_group_rank)]:
        unsorted_data = data[group]
        data[group] = DataTuple(
            x=unsorted_data.x.reindex(index=ranking.index).reset_index(drop=True),
            s=unsorted_data.s.reindex(index=ranking.index).reset_index(drop=True),
            y=unsorted_data.y.reindex(index=ranking.index).reset_index(drop=True),
            name=unsorted_data.name,
        )

    all_disadvantaged = len(data[(disadvantaged_group, good_class)]) + dis_group_len
    all_advantaged = adv_group_len + len(data[(advantaged_group, bad_class)])

    dis_group_good_len = all_disadvantaged - dis_group_len

    # ensure that the ratio of good_class to bad_class is the same in both groups.
    # for this, we have to swap some labels
    num_to_swap = round((adv_group_len * all_disadvantaged - dis_group_good_len * all_advantaged)
                        / len(dataset))
    # the groups were sorted borderline-first above, so this flips the labels of
    # the samples closest to the decision boundary
    data[dis_group].y.iloc[:num_to_swap] = good_class
    data[adv_group].y.iloc[:num_to_swap] = bad_class

    return concat_dt(list(data.values())), test
def upsample(
    dataset: DataTuple, test: TestTuple,
    strategy: Literal["uniform", "preferential", "naive"]
) -> Tuple[DataTuple, TestTuple]:
    """Upsample a datatuple.

    Args:
        dataset: training data to upsample.
        test: test data; returned unchanged (rewrapped in a new TestTuple).
        strategy: "naive" upsamples every (s, y) group to the size of the largest
            group; "uniform" and "preferential" resample each group by the factor
            P(y)*P(s)/P(s,y) so that s and y become statistically independent;
            "preferential" additionally picks the samples by their score from a
            logistic-regression ranker instead of sampling randomly.

    Returns:
        Tuple of the upsampled training data and the (unchanged) test data.
    """
    s_col = dataset.s.columns[0]
    y_col = dataset.y.columns[0]

    s_vals: List[int] = list(map(int, dataset.s[s_col].unique()))
    y_vals: List[int] = list(map(int, dataset.y[y_col].unique()))

    # partition the data into (s, y) groups
    groups = itertools.product(s_vals, y_vals)

    data: Dict[Tuple[int, int], DataTuple] = {}
    for s, y in groups:
        s_y_mask = (dataset.s[s_col] == s) & (dataset.y[y_col] == y)
        data[(s, y)] = DataTuple(
            x=dataset.x.loc[s_y_mask].reset_index(drop=True),
            s=dataset.s.loc[s_y_mask].reset_index(drop=True),
            y=dataset.y.loc[s_y_mask].reset_index(drop=True),
            name=dataset.name,
        )

    # per-group resampling factors
    percentages: Dict[Tuple[int, int], float] = {}

    # sizes of all (s, y) groups
    vals: List[int] = []
    for key, val in data.items():
        vals.append(val.x.shape[0])

    for key, val in data.items():
        if strategy == "naive":
            # upsample every group to the size of the largest group
            percentages[key] = max(vals) / val.x.shape[0]
        else:
            # factor P(y)*P(s)/P(s,y), which makes s and y independent
            s_val: int = key[0]
            y_val: int = key[1]
            y_eq_y = dataset.y.loc[dataset.y[y_col] == y_val].count().to_numpy()[0]
            s_eq_s = dataset.s.loc[dataset.s[s_col] == s_val].count().to_numpy()[0]
            num_samples = dataset.y.count().to_numpy()[0]
            num_batch = val.y.count().to_numpy()[0]
            percentages[key] = round((y_eq_y * s_eq_s / (num_batch * num_samples)), 8)

    # save the column names so the combined frames can be split up again
    x_columns: pd.Index = dataset.x.columns
    s_columns: pd.Index = dataset.s.columns
    y_columns: pd.Index = dataset.y.columns

    # resample each group by its factor (with replacement, fixed seed)
    upsampled: Dict[Tuple[int, int], DataTuple] = {}
    for key, val in data.items():
        all_data: pd.DataFrame = pd.concat([val.x, val.s, val.y], axis="columns")
        all_data = all_data.sample(frac=percentages[key], random_state=1,
                                   replace=True).reset_index(drop=True)
        upsampled[key] = DataTuple(x=all_data[x_columns], s=all_data[s_columns],
                                   y=all_data[y_columns], name=dataset.name)

    upsampled_datatuple: Optional[DataTuple] = None
    for key, val in upsampled.items():
        if upsampled_datatuple is None:
            upsampled_datatuple = val
        else:
            upsampled_datatuple = concat_datatuples(upsampled_datatuple, val)

    if strategy == "preferential":
        # replace the random resampling: rank all samples with a
        # logistic-regression scorer and select rows by their score
        ranker = LRProb()
        rank: SoftPrediction = ranker.run(dataset, dataset)

        selected: List[pd.DataFrame] = []

        all_data = pd.concat([dataset.x, dataset.s, dataset.y], axis="columns")
        all_data = pd.concat([all_data, pd.DataFrame(rank.soft, columns=["preds"])],
                             axis="columns")

        for key, val in data.items():
            s_val = key[0]
            y_val = key[1]
            s_y_mask = (dataset.s[s_col] == s_val) & (dataset.y[y_col] == y_val)

            # sort direction depends on the sensitive value -- presumably so that
            # each group keeps its most "borderline" samples; TODO confirm intent
            ascending = False
            if s_val <= 0:
                ascending = True

            # a factor > 1 means the whole group is included at least once
            # NOTE(review): only one whole copy is added even if the factor
            # exceeds 2 -- confirm that factors >= 2 cannot occur here
            if percentages[key] > 1.0:
                selected.append(all_data.loc[s_y_mask])
                percentages[key] -= 1.0

            weight = all_data.loc[s_y_mask][y_col].count()
            selected.append(all_data.loc[s_y_mask].sort_values(
                by=["preds"], ascending=ascending).iloc[:int(percentages[key] * weight)])

        upsampled_dataframes: pd.DataFrame
        for i, df in enumerate(selected):
            if i == 0:
                upsampled_dataframes = df.drop(["preds"], axis="columns")
            else:
                upsampled_dataframes = pd.concat(
                    [upsampled_dataframes, df.drop(["preds"], axis="columns")],
                    axis="index").reset_index(drop=True)
        upsampled_datatuple = DataTuple(
            x=upsampled_dataframes[x_columns],
            s=upsampled_dataframes[s_columns],
            y=upsampled_dataframes[y_columns],
            name=dataset.name,
        )

    assert upsampled_datatuple is not None
    return upsampled_datatuple, TestTuple(x=test.x, s=test.s, name=test.name)
def __call__(
    self, data: DataTuple, split_id: int = 0
) -> Tuple[DataTuple, DataTuple, Dict[str, float]]:
    """Split the data such that the test set is balanced according to ``self.balance_type``.

    Args:
        data: data tuple to split.
        split_id: value used (with the splitter's own seed) to derive the random seed.

    Returns:
        Tuple of train split, balanced test split, and a dict describing the split
        (seed used, fraction of potential test samples dropped, and the balance type).
    """
    random_seed = self._get_seed(split_id)
    random = RandomState(seed=random_seed)

    s_col = data.s.columns[0]
    y_col = data.y.columns[0]

    s_vals: List[int] = list(map(int, data.s[s_col].unique()))
    y_vals: List[int] = list(map(int, data.y[y_col].unique()))

    train_indexes: List[np.ndarray] = []
    test_indexes: List[np.ndarray] = []

    num_test: Dict[Tuple[int, int], int] = {}
    # find out how many samples are available for the test set
    for s, y in itertools.product(s_vals, y_vals):
        # find all indices for this group
        idx = ((data.s[s_col] == s) & (data.y[y_col] == y)).to_numpy().nonzero()[0]
        # how many elements are in this "quadrant"
        quadrant_size = len(idx)
        # compute how many elements would be available for the test set
        num_test[(s, y)] = round(quadrant_size * (1 - self.train_percentage))

    # compute how much we should take for the test set to make it balanced
    if self.balance_type == "P(s|y)=0.5":
        # for each y, take the same number from every s group
        minimize_over_s = {y: min(num_test[(s, y)] for s in s_vals) for y in y_vals}
        num_test_balanced = {(s, y): minimize_over_s[y] for s in s_vals for y in y_vals}
    elif self.balance_type == "P(y|s)=0.5":
        # for each s, take the same number from every y group
        minimize_over_y = {s: min(num_test[(s, y)] for y in y_vals) for s in s_vals}
        num_test_balanced = {(s, y): minimize_over_y[s] for s in s_vals for y in y_vals}
    elif self.balance_type == "P(s,y)=0.25":
        # take the same number from every (s, y) quadrant
        smallest_quadrant = min(num_test[(s, y)] for s in s_vals for y in y_vals)
        num_test_balanced = {(s, y): smallest_quadrant for s in s_vals for y in y_vals}
    else:
        raise ValueError("Unknown balance_type")

    num_dropped = 0
    # iterate over all combinations of s and y
    for s, y in itertools.product(s_vals, y_vals):
        # find all indices for this group
        idx = ((data.s[s_col] == s) & (data.y[y_col] == y)).to_numpy().nonzero()[0]

        # shuffle and take subsets
        random.shuffle(idx)
        split_indexes: int = round(len(idx) * self.train_percentage)
        # append index subsets to the list of train indices
        train_indexes.append(idx[:split_indexes])
        test_indexes.append(idx[split_indexes:(split_indexes + num_test_balanced[(s, y)])])
        # keep track of how many potential test samples were discarded for balance
        num_dropped += num_test[(s, y)] - num_test_balanced[(s, y)]

    train_idx = np.concatenate(train_indexes, axis=0)
    test_idx = np.concatenate(test_indexes, axis=0)

    train: DataTuple = DataTuple(
        x=data.x.iloc[train_idx].reset_index(drop=True),
        s=data.s.iloc[train_idx].reset_index(drop=True),
        y=data.y.iloc[train_idx].reset_index(drop=True),
        name=f"{data.name} - Train",
    )
    test: DataTuple = DataTuple(
        x=data.x.iloc[test_idx].reset_index(drop=True),
        s=data.s.iloc[test_idx].reset_index(drop=True),
        y=data.y.iloc[test_idx].reset_index(drop=True),
        name=f"{data.name} - Test",
    )

    # the test-set size we would have had without balancing
    unbalanced_test_len = round(len(data) * (1 - self.train_percentage))

    split_info = {
        "seed": random_seed,
        "percent_dropped": num_dropped / unbalanced_test_len,
        self.balance_type: 1,
    }
    return train, test, split_info
def train_test_split(data: DataTuple,
                     train_percentage: float = 0.8,
                     random_seed: int = 0) -> Tuple[DataTuple, DataTuple]:
    """Split a data tuple into two datatuple along the rows of the DataFrames.

    Args:
        data: data tuple to split
        train_percentage: percentage for train split; must be in [0, 1]
        random_seed: seed to make splitting reproducible

    Returns:
        train split and test split

    Raises:
        ValueError: if ``train_percentage`` is outside the interval [0, 1].
    """
    # validate early: an out-of-range percentage would silently produce a
    # nonsensical (empty or over-full) train split
    if not 0.0 <= train_percentage <= 1.0:
        raise ValueError(
            f"train_percentage must be between 0 and 1 inclusive, got {train_percentage}")

    # ======================== concatenate the datatuple to one dataframe =========================
    # save the column names for later
    x_columns: pd.Index = data.x.columns
    s_columns: pd.Index = data.s.columns
    y_columns: pd.Index = data.y.columns

    all_data: pd.DataFrame = pd.concat([data.x, data.s, data.y], axis="columns")

    # NOTE(review): this fixed-seed shuffle looks redundant given the seeded
    # shuffle below, but removing it would change the exact composition of
    # existing splits -- confirm before deleting.
    all_data = shuffle_df(all_data, random_state=1)

    # ============================== split the concatenated dataframe =============================
    # permute
    all_data = shuffle_df(all_data, random_state=random_seed)

    # split
    train_len = int(train_percentage * len(all_data))
    all_data_train = all_data.iloc[:train_len]  # type: ignore[call-overload]
    all_data_test = all_data.iloc[train_len:]  # type: ignore[call-overload]

    assert isinstance(all_data_train, pd.DataFrame)
    assert isinstance(all_data_test, pd.DataFrame)

    all_data_train = all_data_train.reset_index(drop=True)
    all_data_test = all_data_test.reset_index(drop=True)

    # ================================== assemble train and test ==================================
    train: DataTuple = DataTuple(
        x=all_data_train[x_columns],
        s=all_data_train[s_columns],
        y=all_data_train[y_columns],
        name=f"{data.name} - Train",
    )
    test: DataTuple = DataTuple(
        x=all_data_test[x_columns],
        s=all_data_test[s_columns],
        y=all_data_test[y_columns],
        name=f"{data.name} - Test",
    )

    # sanity checks: the column structure must be preserved in both splits
    for frame, columns in ((train.x, x_columns), (test.x, x_columns),
                           (train.s, s_columns), (test.s, s_columns),
                           (train.y, y_columns), (test.y, y_columns)):
        assert isinstance(frame, pd.DataFrame)
        assert_index_equal(frame.columns, columns)

    return train, test