async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
    """Train the external GP model once and return hard predictions for ``test``.

    The training and test arrays are written to ``data.npz`` inside a
    temporary directory, the external run is awaited, and the predicted
    means are read back from the model's output folder.

    Args:
        train: training data (features ``x``, sensitive attribute ``s``, labels ``y``)
        test: test data (features ``x`` and sensitive attribute ``s``)

    Returns:
        Predictions obtained by thresholding the predicted mean at 0.5 and
        passing the result through the converter returned by ``_fix_labels``.
    """
    (train_labels,), label_converter = _fix_labels([train.y.to_numpy()])
    raw_data = {
        "xtrain": train.x.to_numpy(),
        "strain": train.s.to_numpy(),
        "ytrain": train_labels,
        "xtest": test.x.to_numpy(),
        "stest": test.s.to_numpy(),
    }
    parameters = self._additional_parameters(raw_data)

    with TemporaryDirectory() as tmpdir:
        work_dir = Path(tmpdir)
        data_path = work_dir / "data.npz"
        np.savez(data_path, **raw_data)

        model_name = "local"
        num_train = raw_data["ytrain"].shape[0]
        flags = _flags(
            parameters, str(data_path), tmpdir, self.s_as_input, model_name, num_train
        )
        await self._run_gpyt(flags)

        # The results must be read before the temporary directory is removed.
        with (work_dir / model_name / PRED_FNAME).open("rb") as pred_file:
            pred_mean = np.load(pred_file)["pred_mean"]

    # Threshold the means, keep the first column and restore the label values.
    thresholded = (pred_mean > 0.5).astype(raw_data["ytrain"].dtype)[:, 0]
    return Prediction(hard=pd.Series(label_converter(thresholded)))
def main():
    """Run the Agarwal model as a standalone program.

    Parses the command-line arguments, loads the train and test sets from
    the given ``.npz`` files, trains and predicts, and stores the ``"preds"``
    entry of the result at the requested predictions path.
    """
    args: AgarwalArgs = AgarwalArgs().parse_args()
    train = DataTuple.from_npz(Path(args.train))
    test = TestTuple.from_npz(Path(args.test))
    results = train_and_predict(train, test, args)
    Prediction(hard=results["preds"]).to_npz(Path(args.predictions))
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
    """Predict by drawing labels at random from those seen in training.

    Args:
        train: training data; only ``y`` is used
        test: test data; only the number of rows of ``x`` is used

    Returns:
        One randomly chosen label per test row, drawn with a generator
        seeded by ``self.seed``.
    """
    # First row of the transposed, de-duplicated label frame, i.e. the
    # values of the first label column among the distinct rows.
    candidate_labels = train.y.drop_duplicates().T.to_numpy()[0]
    rng = np.random.RandomState(self.seed)
    num_test = test.x.shape[0]
    return Prediction(hard=pd.Series(rng.choice(candidate_labels, num_test)))
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
    """Fit a logistic regression on ``train`` and predict labels for ``test``.

    Args:
        train: training data; ``x`` are the features, ``y`` the labels
        test: test data; only ``x`` is used

    Returns:
        The classifier's predictions for ``test.x``.
    """
    rng = np.random.RandomState(seed=self.seed)
    model = LogisticRegression(
        solver="liblinear", random_state=rng, C=self.C, multi_class="auto"
    )
    # The classifier is given the labels as a flat array.
    targets = train.y.to_numpy().ravel()
    model.fit(train.x, targets)
    predicted = model.predict(test.x)
    return Prediction(hard=pd.Series(predicted))
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
    """Fit a cross-validated logistic regression and predict labels for ``test``.

    Args:
        train: training data; ``x`` are the features, ``y`` the labels
        test: test data; only ``x`` is used

    Returns:
        The classifier's predictions for ``test.x``; ``info["C"]`` holds the
        first entry of the fitted ``C_`` attribute.
    """
    # The same generator object is handed to both the splitter and the model.
    rng = np.random.RandomState(seed=self.seed)
    splitter = KFold(n_splits=self.n_splits, shuffle=True, random_state=rng)
    model = LogisticRegressionCV(
        cv=splitter,
        n_jobs=-1,
        random_state=rng,
        solver="liblinear",
        multi_class="auto",
    )
    targets = train.y.to_numpy().ravel()
    model.fit(train.x, targets)
    predicted = pd.Series(model.predict(test.x))
    return Prediction(hard=predicted, info={"C": model.C_[0]})
def main() -> None:
    """Run the Agarwal model as a standalone program.

    The behaviour is selected by ``args.mode``:

    * ``"run"``: train on ``args.train``, predict on ``args.test`` and save
      the predictions to ``args.predictions``.
    * ``"fit"``: train on ``args.train`` and store the pickled model at
      ``args.model``.
    * ``"predict"``: load the model from ``args.model``, predict on
      ``args.test`` and save the predictions to ``args.predictions``.

    Raises:
        RuntimeError: if ``cloudpickle`` cannot be imported or the mode is unknown.
    """
    args: AgarwalArgs = AgarwalArgs().parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    try:
        # cloudpickle has to be installed separately for now.
        # See https://github.com/fairlearn/fairlearn/issues/569
        import cloudpickle
    except ImportError as e:
        raise RuntimeError(
            "In order to use Agarwal, install fairlearn and cloudpickle."
        ) from e

    mode = args.mode

    if mode == "run":
        assert args.train is not None
        assert args.test is not None
        assert args.predictions is not None
        train = DataTuple.from_npz(Path(args.train))
        test = TestTuple.from_npz(Path(args.test))
        results = train_and_predict(train, test, args)
        Prediction(hard=results["preds"]).to_npz(Path(args.predictions))
        return

    if mode == "fit":
        assert args.train is not None
        assert args.model is not None
        train_data = DataTuple.from_npz(Path(args.train))
        fitted = fit(train_data, args)
        # Pickling happens under the `working_dir` context manager.
        with working_dir(Path(args.model)):
            pickled = cloudpickle.dumps(fitted)
        dump(pickled, Path(args.model))
        return

    if mode == "predict":
        assert args.model is not None
        assert args.predictions is not None
        assert args.test is not None
        test_data = TestTuple.from_npz(Path(args.test))
        pickled = load(Path(args.model))
        # Unpickling happens under the same context manager as pickling.
        with working_dir(Path(args.model)):
            fitted = cloudpickle.loads(pickled)
        Prediction(hard=predict(fitted, test_data)["preds"]).to_npz(
            Path(args.predictions)
        )
        return

    raise RuntimeError(f"Unknown mode: {args.mode}")
def run(self, _: DataTuple, test: TestTuple) -> Prediction:
    """Apply the fixed Corels rule list to the COMPAS test set.

    Args:
        _: training data (unused; the rules are hard-coded)
        test: test data; must be named like COMPAS and have ``"sex"`` in ``s``

    Returns:
        1 where any of the three rules fires, 0 otherwise.

    Raises:
        RuntimeError: if the test set does not look like the COMPAS dataset.
    """
    looks_like_compas = (
        test.name is not None
        and "Compas" in test.name
        and "sex" in test.s.columns
    )
    if not looks_like_compas:
        raise RuntimeError("The Corels algorithm only works on the COMPAS dataset")

    age = test.x["age-num"].to_numpy()
    priors = test.x["priors-count"].to_numpy()
    sex = test.s["sex"].to_numpy()
    male = 1

    # Rule 1: age 18-20 and male.
    rule_young_male = (age >= 18) & (age <= 20) & (sex == male)
    # Rule 2: age 21-23 with 2 or 3 priors.
    rule_early_twenties = (age >= 21) & (age <= 23) & (priors >= 2) & (priors <= 3)
    # Rule 3: more than 3 priors.
    rule_many_priors = priors > 3

    any_rule = rule_young_male | rule_early_twenties | rule_many_priors
    pred = np.where(any_rule, np.ones_like(age), np.zeros_like(age))
    return Prediction(hard=pd.Series(pred))
def metric_per_sensitive_attribute(
    prediction: Prediction,
    actual: DataTuple,
    metric: Metric,
    use_sens_name: bool = True,
) -> Dict[str, float]:
    """Compute a metric separately on each subset sharing a sensitive attribute value.

    Args:
        prediction: predictions aligned row-by-row with ``actual``
        actual: the data the predictions were made for
        metric: the metric to evaluate on every subset
        use_sens_name: if True, keys start with the sensitive column's name,
            otherwise with the literal ``"S"``

    Returns:
        A dictionary mapping ``"<prefix>_<value>"`` to the metric's score on
        the rows whose sensitive column equals ``<value>``.

    Raises:
        MetricNotApplicable: if the metric cannot be applied per sensitive attribute.
    """
    if not metric.apply_per_sensitive:
        raise MetricNotApplicable(
            f"Metric {metric.name} is not applicable per sensitive "
            f"attribute, apply to whole dataset instead"
        )

    num_rows = actual.s.shape[0]
    assert num_rows == actual.x.shape[0]
    assert num_rows == actual.y.shape[0]
    assert prediction.hard.shape[0] == actual.y.shape[0]

    per_sensitive_attr: Dict[str, float] = {}

    s_columns: List[str] = list(actual.s.columns)
    y_columns: List[str] = list(actual.y.columns)
    assert len(y_columns) == 1

    for y_col in y_columns:
        for s_col in s_columns:
            key_prefix = s_col if use_sens_name else "S"
            for unique_s in actual.s[s_col].unique():
                mask: pd.Series = actual.s[s_col] == unique_s

                # Build a DataTuple holding only the rows of this group,
                # re-indexed from zero.
                x_rows = pd.DataFrame(
                    actual.x.loc[mask][actual.x.columns], columns=actual.x.columns
                ).reset_index(drop=True)
                s_rows = pd.DataFrame(
                    actual.s.loc[mask][s_col], columns=[s_col]
                ).reset_index(drop=True)
                y_rows = pd.DataFrame(
                    actual.y.loc[mask][y_col], columns=[y_col]
                ).reset_index(drop=True)
                subset = DataTuple(x=x_rows, s=s_rows, y=y_rows, name=actual.name)

                # Restrict the predictions to the same rows, keeping their kind.
                pred_y: Prediction
                if isinstance(prediction, SoftPrediction):
                    pred_y = SoftPrediction(
                        soft=prediction.soft.loc[mask].reset_index(drop=True),
                        info=prediction.info,
                    )
                else:
                    pred_y = Prediction(
                        hard=prediction.hard.loc[mask].reset_index(drop=True),
                        info=prediction.info,
                    )

                key = "_".join((key_prefix, str(unique_s)))
                per_sensitive_attr[key] = metric.score(pred_y, subset)

    return per_sensitive_attr
def _train_and_predict(
    train: DataTuple,
    test: TestTuple,
    classifier: ClassifierType,
    C: float,
    kernel: str,
    seed: int,
) -> Prediction:
    """Train an instance-weighted classifier and predict on the given test data.

    Args:
        train: training data
        test: test data; only ``x`` is used
        classifier: ``"SVM"`` selects an SVM; anything else a logistic regression
        C: value forwarded as ``C`` to the chosen model
        kernel: kernel name, used only for the SVM
        seed: random seed for the chosen model

    Returns:
        The fitted model's predictions for ``test.x``.
    """
    if classifier != "SVM":
        model = LogisticRegression(
            solver="liblinear",
            random_state=np.random.RandomState(seed=seed),
            max_iter=5000,
            C=C,
        )
    else:
        model = select_svm(C=C, kernel=kernel, seed=seed)

    targets = train.y.to_numpy().ravel()
    weights = compute_instance_weights(train)["instance weights"]
    model.fit(train.x, targets, sample_weight=weights)
    return Prediction(hard=pd.Series(model.predict(test.x)))
def _predict(self, test: TestTuple, tmp_path: Path, fit_params: FitParams) -> Prediction:
    """Run the external prediction script on ``test`` and return its predictions.

    Args:
        test: test data, written to ``test.json`` inside ``tmp_path``
        tmp_path: directory used for the exchanged files
        fit_params: result of the fitting step; provides the model path and
            the label converter

    Returns:
        The script's predictions with ``-1`` mapped to 0 and everything else
        to 1, then passed through ``label_converter.post_only_labels``.
    """
    test_path = tmp_path / "test.json"
    self._create_file_in_zafar_format(test, test_path, fit_params.label_converter)
    predictions_path = tmp_path / "predictions.json"

    cmd = self._get_predict_cmd(
        str(test_path), str(fit_params.model_path), str(predictions_path)
    )
    working_dir = self._code_path.resolve() / self._sub_dir
    self._call_script(cmd, cwd=working_dir)

    # `read_text` opens and closes the file itself, so no handle is leaked.
    raw_predictions = json.loads(predictions_path.read_text())
    predictions_correct = pd.Series([0 if x == -1 else 1 for x in raw_predictions])
    return Prediction(
        hard=fit_params.label_converter.post_only_labels(predictions_correct)
    )
async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
    """Train and predict with the external script and return its predictions.

    Both datasets are written to a temporary directory, the script is
    awaited, and its JSON output is read back before the directory is
    removed.

    Args:
        train: training data
        test: test data

    Returns:
        The script's predictions with ``-1`` mapped to 0 and everything else
        to 1, then passed through ``label_converter.post_only_labels``.
    """
    label_converter = LabelBinarizer()
    with TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir)
        train_path = tmp_path / "train.json"
        test_path = tmp_path / "test.json"
        self._create_file_in_zafar_format(train, train_path, label_converter)
        self._create_file_in_zafar_format(test, test_path, label_converter)
        predictions_path = tmp_path / "predictions.json"

        cmd = self._create_command_line(
            str(train_path), str(test_path), str(predictions_path)
        )
        working_dir = self._code_path.resolve() / self._sub_dir
        await self._call_script(cmd, cwd=working_dir)

        # `read_text` closes the file before the temporary directory is
        # deleted; a bare `open().read()` would leave the handle open.
        raw_predictions = json.loads(predictions_path.read_text())

    predictions_correct = pd.Series([0 if x == -1 else 1 for x in raw_predictions])
    return Prediction(hard=label_converter.post_only_labels(predictions_correct))
async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
    """Train and predict with the external Kamishima scripts.

    The data is written to a temporary directory, ``train_pr.py`` is run
    with ``self.eta`` to produce a model, and ``predict_lr.py`` then writes
    its output for the test set, which is read back.

    Args:
        train: training data
        test: test data

    Returns:
        Column 1 of the script output cast to ``int``. When the predictions
        are not all equal, the smallest predicted value is replaced by the
        smallest label of the first column of ``train.y``.
    """
    with TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir)
        train_path = str(tmp_path / "train.txt")
        test_path = str(tmp_path / "test.txt")
        self.create_file_in_kamishima_format(train, train_path)
        self.create_file_in_kamishima_format(test, test_path)
        min_class_label = train.y[train.y.columns[0]].min()
        model_path = str(tmp_path / "model")
        output_path = str(tmp_path / "output.txt")

        await self._call_script([
            str(self._code_path / "train_pr.py"),
            "-e",
            str(self.eta),
            "-i",
            train_path,
            "-o",
            model_path,
            "--quiet",
        ])

        await self._call_script([
            str(self._code_path / "predict_lr.py"),
            "-i",
            test_path,
            "-m",
            model_path,
            "-o",
            output_path,
            "--quiet",
        ])

        # `ndmin=2` keeps a single-row output two-dimensional; without it
        # `loadtxt` returns a 1-D array and the column index below raises.
        output = np.loadtxt(output_path, ndmin=2)
        predictions = output[:, 1].astype(np.float32)

    to_return = pd.Series(predictions).astype(int)

    # Map the lower predicted value back onto the lower training label.
    if to_return.min() != to_return.max():
        to_return = to_return.replace(to_return.min(), min_class_label)
    return Prediction(hard=to_return)
async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
    """Run Algorithm on the given data asynchronously.

    Args:
        train: training data
        test: test data

    Returns:
        Predictions taken from the ``"preds"`` entry of the script's output
        file; every other entry is converted to ``float`` and stored in
        ``info`` under its key.
    """
    with TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir)
        train_paths, test_paths = write_as_feather(train, test, tmp_path)
        pred_path = tmp_path / "predictions.npz"
        cmd = self._script_command(train_paths, test_paths, pred_path)
        await self._call_script(cmd)  # wait for script to run

        # Load inside a `with` so the archive is closed before the temporary
        # directory is removed; `np.load` keeps an .npz file open otherwise.
        with np.load(pred_path) as results:
            info = {key: float(results[key]) for key in results if key != "preds"}
            hard = pd.Series(results["preds"])

    return Prediction(hard=hard, info=info)
def _predict(
    self, model_params: OptimizeResult, test_predictions: Prediction, test: TestTuple
) -> Prediction:
    """Randomly flip predictions per group according to the optimised rates.

    Args:
        model_params: optimisation result whose ``x`` unpacks into the four
            rates ``sp2p, sn2p, op2p, on2p``
        test_predictions: hard predictions to post-process
        test: test data; the first column of ``s`` defines the two groups

    Returns:
        The post-processed labels as a float64 series. Rows whose sensitive
        value is neither 0 nor 1 stay at 0.
    """
    sp2p, sn2p, op2p, on2p = model_params.x

    # Boolean masks for the two protected groups.
    sens_values = test.s[test.s.columns[0]].to_numpy()
    mask_s1 = sens_values == 1
    mask_s0 = sens_values == 0

    hard_preds: np.ndarray = test_predictions.hard.to_numpy()

    def _flip_group(group_mask: np.ndarray, p2p: float, n2p: float) -> np.ndarray:
        """Return the group's predictions with a random share of labels flipped."""
        group_preds = hard_preds[group_mask]
        fair_preds = group_preds.copy()
        positive_idx = (group_preds == self._favorable_label).nonzero()[0]
        negative_idx = (group_preds == self._unfavorable_label).nonzero()[0]
        # Shuffle order matters for reproducibility: positives, then negatives.
        self._random.shuffle(positive_idx)
        self._random.shuffle(negative_idx)

        # A share `n2p` of the negatives becomes favorable ...
        to_favorable = negative_idx[: int(len(negative_idx) * n2p)]
        fair_preds[to_favorable] = self._favorable_label
        # ... and a share `1 - p2p` of the positives becomes unfavorable.
        to_unfavorable = positive_idx[: int(len(positive_idx) * (1 - p2p))]
        fair_preds[to_unfavorable] = self._unfavorable_label
        return fair_preds

    # The s == 1 group is processed first, then the s == 0 group.
    self_fair_pred = _flip_group(mask_s1, sp2p, sn2p)
    othr_fair_pred = _flip_group(mask_s0, op2p, on2p)

    new_labels = np.zeros_like(hard_preds, dtype=np.float64)
    new_labels[mask_s1] = self_fair_pred
    new_labels[mask_s0] = othr_fair_pred
    return Prediction(hard=pd.Series(new_labels))
def predict(self, test: TestTuple) -> Prediction:
    """Run the prediction script of the already-fitted model on ``test``.

    Args:
        test: test data

    Returns:
        predictions
    """
    with TemporaryDirectory() as tmpdir:
        work_dir = Path(tmpdir)
        test_file = work_dir / "test.npz"
        pred_file = work_dir / "predictions.npz"
        test.to_npz(test_file)

        base_cmd = self._predict_script_command(self.model_path, test_file, pred_file)
        self._call_script(base_cmd + ["--mode", "predict"])  # wait for script to run

        # Read the predictions while the temporary directory still exists.
        return Prediction.from_npz(pred_file)
async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
    """Run Algorithm on the given data asynchronously.

    Both datasets are written as ``.npz`` files into a temporary directory,
    the script is awaited, and the predictions file it wrote is loaded.

    Args:
        train: training data
        test: test data

    Returns:
        predictions
    """
    with TemporaryDirectory() as tmpdir:
        work_dir = Path(tmpdir)
        train_file = work_dir / "train.npz"
        test_file = work_dir / "test.npz"
        pred_file = work_dir / "predictions.npz"

        train.to_npz(train_file)
        test.to_npz(test_file)

        command = self._script_command(train_file, test_file, pred_file)
        await self._call_script(command)  # wait for script to run

        # Read the predictions while the temporary directory still exists.
        return Prediction.from_npz(pred_file)
def _predict(self, test: TestTuple, fit_info: _FitInfo, tmp_path: Path) -> Prediction:
    """Run ``predict_lr.py`` on ``test`` with the previously fitted model.

    Args:
        test: test data, written to ``test.txt`` inside ``tmp_path``
        fit_info: result of the fitting step; provides the model path and the
            smallest training class label
        tmp_path: directory used for the exchanged files

    Returns:
        Column 1 of the script output cast to ``int``. When the predictions
        are not all equal, the smallest predicted value is replaced by
        ``fit_info.min_class_label``.
    """
    test_path = tmp_path / "test.txt"
    _create_file_in_kamishima_format(test, test_path)
    output_path = str(tmp_path / "output.txt")
    script = self._code_path / "predict_lr.py"
    cmd = [
        script,
        "-i",
        test_path,
        "-m",
        fit_info.model_path,
        "-o",
        output_path,
        "--quiet",
    ]
    self._call_script([str(e) for e in cmd])

    # `ndmin=2` keeps a single-row output two-dimensional; without it
    # `loadtxt` returns a 1-D array and the column index below raises.
    output = np.loadtxt(output_path, ndmin=2)
    predictions = output[:, 1].astype(np.float32)

    to_return = pd.Series(predictions).astype(int)

    # Map the lower predicted value back onto the lower training label.
    if to_return.min() != to_return.max():
        to_return = to_return.replace(to_return.min(), fit_info.min_class_label)
    return Prediction(hard=to_return)
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
    """Predict the most frequent training label for every test row.

    Args:
        train: training data; only ``y`` is used
        test: test data; only the number of rows of ``x`` is used

    Returns:
        The first row of ``train.y.mode()`` repeated once per test row.
    """
    majority = train.y.mode().iloc[0].to_numpy()  # type: ignore[attr-defined]
    num_test = len(test.x)
    return Prediction(hard=pd.Series(majority.repeat(num_test)))
def predict(self, test: TestTuple) -> Prediction:
    """Predict labels for ``test`` with the stored classifier.

    Args:
        test: test data; only ``x`` is used

    Returns:
        The classifier's predictions; ``info["C"]`` holds the first entry of
        the classifier's ``C_`` attribute.
    """
    predicted = pd.Series(self.clf.predict(test.x))
    return Prediction(hard=predicted, info={"C": self.clf.C_[0]})
def predict(self, test: TestTuple) -> Prediction:
    """Return the stored majority value ``self.maj`` once per test row.

    Args:
        test: test data; only the number of rows of ``x`` is used
    """
    num_test = len(test.x)
    repeated = self.maj.repeat(num_test)
    return Prediction(hard=pd.Series(repeated))
def predict(self, test: TestTuple) -> Prediction:
    """Predict labels for ``test.x`` with the stored classifier ``self.clf``.

    Args:
        test: test data; only ``x`` is used
    """
    predicted = self.clf.predict(test.x)
    return Prediction(hard=pd.Series(predicted))
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
    """Return the true test labels as predictions.

    Args:
        train: training data (unused)
        test: test data; must be a ``DataTuple`` so that ``y`` is available

    Returns:
        A copy of the first label column of ``test.y``.
    """
    assert isinstance(test, DataTuple), "test must be a DataTuple."
    label_column = test.y.columns[0]
    true_labels = test.y[label_column].copy()
    return Prediction(hard=true_labels)
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
    """Return the true test labels after post-processing them with ``DPFlip``.

    Args:
        train: training data (unused)
        test: test data; must be a ``DataTuple`` so that ``y`` is available

    Returns:
        The result of ``DPFlip.run``, called with the true labels and the
        test set in both its train and test positions.
    """
    assert isinstance(test, DataTuple), "test must be a DataTuple."
    flipper = DPFlip(seed=self.seed)
    label_column = test.y.columns[0]
    oracle_preds = Prediction(test.y[label_column].copy())
    return flipper.run(oracle_preds, test, oracle_preds, test)
def predict(self, test: TestTuple) -> Prediction:
    """Draw one random label per test row from the stored values ``self.vals``.

    Args:
        test: test data; only the number of rows of ``x`` is used

    Returns:
        Labels drawn with a generator freshly seeded by ``self.seed``.
    """
    rng = np.random.RandomState(self.seed)
    # First row of the transposed stored values.
    candidate_labels = self.vals.T.to_numpy()[0]
    num_test = test.x.shape[0]
    return Prediction(hard=pd.Series(rng.choice(candidate_labels, num_test)))
def _predict(model: sklearn.linear_model._base.LinearModel, test: TestTuple) -> Prediction:
    """Predict labels for ``test.x`` with the given fitted model.

    Args:
        model: model exposing a ``predict`` method
        test: test data; only ``x`` is used
    """
    predicted = model.predict(test.x)
    return Prediction(hard=pd.Series(predicted))
async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
    """Train the external GP model with target rates and predict on ``test``.

    When ``self.odds`` is ``None`` the model is first run on a train/dev
    split to estimate the odds, which are then adjusted and fed into the
    final run. Otherwise ``self.odds`` is used directly.

    Args:
        train: training data (features ``x``, sensitive attribute ``s``, labels ``y``)
        test: test data (features ``x`` and sensitive attribute ``s``)

    Returns:
        Predictions obtained by thresholding the predicted mean at 0.5 and
        passing the result through the converter returned by ``_fix_labels``.
    """
    (train_labels,), label_converter = _fix_labels([train.y.to_numpy()])
    raw_data = {
        "xtrain": train.x.to_numpy(),
        "strain": train.s.to_numpy(),
        "ytrain": train_labels,
        "xtest": test.x.to_numpy(),
        "stest": test.s.to_numpy(),
    }
    parameters = self._additional_parameters(raw_data)

    with TemporaryDirectory() as tmpdir:
        work_dir = Path(tmpdir)
        data_path = work_dir / Path("data.npz")
        model_name = "local"
        flags = _flags(
            parameters,
            str(data_path),
            tmpdir,
            self.s_as_input,
            model_name,
            len(raw_data["ytrain"]),
        )

        if self.odds is not None:
            flags.update(self.odds)
        else:
            # Split the training data into train and dev and save it to `data.npz`.
            train_dev_data = split_train_dev(
                raw_data["xtrain"], raw_data["ytrain"], raw_data["strain"]
            )
            np.savez(data_path, **train_dev_data)

            # First run: predictions on the dev part of the split.
            await self._run_gpyt(flags)
            with (work_dir / model_name / PRED_FNAME).open("rb") as pred_file:
                dev_pred_mean = np.load(pred_file)["pred_mean"]
            dev_preds = (dev_pred_mean > 0.5).astype(np.int32)
            odds = compute_odds(
                train_dev_data["ytest"], dev_preds, train_dev_data["stest"]
            )

            # Enforce equality of opportunity: both groups get the smaller rate.
            opportunity = min(odds["p_ybary1_s0"], odds["p_ybary1_s1"])
            odds["p_ybary1_s0"] = opportunity
            odds["p_ybary1_s1"] = opportunity

            # The second run uses twice as many epochs.
            flags.update({"epochs": 2 * flags["epochs"], **odds})

        # Save with real test data.
        np.savez(data_path, **raw_data)

        # Second run.
        await self._run_gpyt(flags)
        with (work_dir / model_name / PRED_FNAME).open("rb") as pred_file:
            pred_mean = np.load(pred_file)["pred_mean"]

    # Convert the result to the expected format.
    thresholded = (pred_mean > 0.5).astype(raw_data["ytrain"].dtype)[:, 0]
    return Prediction(hard=pd.Series(label_converter(thresholded)))
def run(self, train: DataTuple, test: TestTuple) -> Prediction:
    """Fit an MLP on ``train`` and predict labels for ``test``.

    Args:
        train: training data; ``x`` are the features, ``y`` the labels
        test: test data; only ``x`` is used

    Returns:
        The classifier's predictions for ``test.x``.
    """
    model = select_mlp(self.hidden_layer_sizes, self.activation)
    targets = train.y.to_numpy().ravel()
    model.fit(train.x, targets)
    predicted = model.predict(test.x)
    return Prediction(hard=pd.Series(predicted))
def run(self, train: DataTuple, test: Union[DataTuple, TestTuple]) -> Prediction:
    """Fit an SVM on ``train`` and predict labels for ``test``.

    Args:
        train: training data; ``x`` are the features, ``y`` the labels
        test: test data; only ``x`` is used

    Returns:
        The classifier's predictions for ``test.x``.
    """
    model = select_svm(self.C, self.kernel)
    targets = train.y.to_numpy().ravel()
    model.fit(train.x, targets)
    predicted = model.predict(test.x)
    return Prediction(hard=pd.Series(predicted))