示例#1
0
    async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
        """Run the GPyT code on ``train``/``test`` and return hard predictions.

        The arrays are bundled into ``data.npz`` inside a temporary directory,
        ``self._run_gpyt`` is awaited with the assembled flags, and the
        ``pred_mean`` array read back afterwards is thresholded at 0.5.

        Args:
            train: training data
            test: test data

        Returns:
            hard predictions, passed through the converter from ``_fix_labels``
        """
        fixed_labels, label_converter = _fix_labels([train.y.to_numpy()])
        (ytrain,) = fixed_labels
        raw_data = {
            "xtrain": train.x.to_numpy(),
            "strain": train.s.to_numpy(),
            "ytrain": ytrain,
            "xtest": test.x.to_numpy(),
            "stest": test.s.to_numpy(),
        }
        parameters = self._additional_parameters(raw_data)
        model_name = "local"

        with TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            npz_file = workdir / "data.npz"
            np.savez(npz_file, **raw_data)
            num_train = raw_data["ytrain"].shape[0]
            flags = _flags(
                parameters, str(npz_file), tmpdir, self.s_as_input, model_name, num_train
            )
            await self._run_gpyt(flags)

            # Results are read from <tmpdir>/<model_name>/PRED_FNAME.
            with (workdir / model_name / PRED_FNAME).open("rb") as pred_file:
                pred_mean = np.load(pred_file)["pred_mean"]

        # Threshold, cast to the training-label dtype and keep the first column.
        hard = (pred_mean > 0.5).astype(raw_data["ytrain"].dtype)[:, 0]
        return Prediction(hard=pd.Series(label_converter(hard)))
示例#2
0
def main():
    """Entry point for running the Agarwal model as a standalone program.

    Parses the arguments with ``AgarwalArgs``, loads the train and test sets
    from the ``.npz`` files they name, and saves the ``"preds"`` entry returned
    by ``train_and_predict`` to ``args.predictions``.
    """
    args: AgarwalArgs = AgarwalArgs().parse_args()
    train = DataTuple.from_npz(Path(args.train))
    test = TestTuple.from_npz(Path(args.test))
    result = train_and_predict(train, test, args)
    predictions = Prediction(hard=result["preds"])
    predictions.to_npz(Path(args.predictions))
示例#3
0
    def run(self, train: DataTuple, test: TestTuple) -> Prediction:
        """Predict by drawing randomly from the distinct training labels.

        One value is drawn per row of ``test.x`` using a ``RandomState`` seeded
        with ``self.seed``.
        """
        distinct_labels = train.y.drop_duplicates()
        rng = np.random.RandomState(self.seed)

        # Transposing puts the first label column into row 0.
        candidates = distinct_labels.T.to_numpy()[0]
        draws = rng.choice(candidates, test.x.shape[0])
        return Prediction(hard=pd.Series(draws))
示例#4
0
 def run(self, train: DataTuple, test: TestTuple) -> Prediction:
     """Fit a liblinear logistic regression on ``train`` and predict ``test.x``.

     The regularisation strength is ``self.C``; the random state is seeded
     with ``self.seed``.
     """
     rng = np.random.RandomState(seed=self.seed)
     model = LogisticRegression(
         solver="liblinear",
         random_state=rng,
         C=self.C,
         multi_class="auto",
     )
     targets = train.y.to_numpy().ravel()
     model.fit(train.x, targets)
     predicted = model.predict(test.x)
     return Prediction(hard=pd.Series(predicted))
示例#5
0
 def run(self, train: DataTuple, test: TestTuple) -> Prediction:
     """Fit a cross-validated liblinear logistic regression and predict ``test.x``.

     The same seeded ``RandomState`` drives both the shuffled ``KFold`` split
     and the classifier. The first entry of the fitted ``C_`` is returned in
     the prediction's ``info`` under the key ``"C"``.
     """
     rng = np.random.RandomState(seed=self.seed)
     splitter = KFold(n_splits=self.n_splits, shuffle=True, random_state=rng)
     model = LogisticRegressionCV(
         cv=splitter,
         n_jobs=-1,
         random_state=rng,
         solver="liblinear",
         multi_class="auto",
     )
     targets = train.y.to_numpy().ravel()
     model.fit(train.x, targets)
     hard = pd.Series(model.predict(test.x))
     return Prediction(hard=hard, info={"C": model.C_[0]})
示例#6
0
def main() -> None:
    """Run the Agarwal model as a standalone program.

    Seeds ``random`` and ``numpy.random`` with ``args.seed`` and then dispatches
    on ``args.mode``:

    * ``"run"``: train on ``args.train``, predict on ``args.test`` and save the
      predictions to ``args.predictions``.
    * ``"fit"``: train on ``args.train``, serialize the model with
      ``cloudpickle`` and hand the bytes to ``dump`` with ``args.model``.
    * ``"predict"``: ``load`` the bytes from ``args.model``, deserialize them,
      predict on ``args.test`` and save the predictions to
      ``args.predictions``.

    Raises:
        RuntimeError: if ``cloudpickle`` cannot be imported or the mode is
            not one of the three above.
        AssertionError: if a path argument required by the mode is ``None``.
    """
    args: AgarwalArgs = AgarwalArgs().parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    try:
        # cloudpickle needs to be installed for now.
        # See https://github.com/fairlearn/fairlearn/issues/569
        import cloudpickle
    except ImportError as e:
        raise RuntimeError(
            "In order to use Agarwal, install fairlearn and cloudpickle."
        ) from e

    mode = args.mode

    if mode == "run":
        assert args.train is not None
        assert args.test is not None
        assert args.predictions is not None
        train = DataTuple.from_npz(Path(args.train))
        test = TestTuple.from_npz(Path(args.test))
        preds = train_and_predict(train, test, args)["preds"]
        Prediction(hard=preds).to_npz(Path(args.predictions))
        return

    if mode == "fit":
        assert args.train is not None
        assert args.model is not None
        train_data = DataTuple.from_npz(Path(args.train))
        model = fit(train_data, args)
        # Serialization happens inside the working_dir context, the dump outside it.
        with working_dir(Path(args.model)):
            model_file = cloudpickle.dumps(model)
        dump(model_file, Path(args.model))
        return

    if mode == "predict":
        assert args.model is not None
        assert args.predictions is not None
        assert args.test is not None
        test_data = TestTuple.from_npz(Path(args.test))
        model_file = load(Path(args.model))
        with working_dir(Path(args.model)):
            model = cloudpickle.loads(model_file)
        preds = predict(model, test_data)["preds"]
        Prediction(hard=preds).to_npz(Path(args.predictions))
        return

    raise RuntimeError(f"Unknown mode: {args.mode}")
 def run(self, _: DataTuple, test: TestTuple) -> Prediction:
     """Apply a fixed rule list to ``test``; the training data is ignored.

     A row is predicted 1 if any of the following holds, otherwise 0:

     * age between 18 and 20 (inclusive) and ``sex == 1``
     * age between 21 and 23 (inclusive) and 2 to 3 priors
     * more than 3 priors

     Raises:
         RuntimeError: if ``test.name`` is missing or lacks ``"Compas"``, or
             ``test.s`` has no ``"sex"`` column.
     """
     is_compas = (
         test.name is not None
         and "Compas" in test.name
         and "sex" in test.s.columns
     )
     if not is_compas:
         raise RuntimeError(
             "The Corels algorithm only works on the COMPAS dataset")

     age = test.x["age-num"].to_numpy()
     priors = test.x["priors-count"].to_numpy()
     sex = test.s["sex"].to_numpy()

     young_male = (age >= 18) & (age <= 20) & (sex == 1)
     few_priors = (age >= 21) & (age <= 23) & (priors >= 2) & (priors <= 3)
     many_priors = priors > 3

     flagged = young_male | few_priors | many_priors
     pred = np.where(flagged, np.ones_like(age), np.zeros_like(age))
     return Prediction(hard=pd.Series(pred))
示例#8
0
def metric_per_sensitive_attribute(
        prediction: Prediction,
        actual: DataTuple,
        metric: Metric,
        use_sens_name: bool = True) -> Dict[str, float]:
    """Score ``metric`` separately on each subgroup sharing a sensitive-attribute value.

    For every column of ``actual.s`` and every unique value in it, the rows with
    that value are selected from both ``actual`` and ``prediction`` and scored.

    Args:
        prediction: predictions aligned row-by-row with ``actual``
        actual: data tuple holding the true labels and sensitive attributes
        metric: metric to evaluate; must have ``apply_per_sensitive`` set
        use_sens_name: if True, result keys start with the sensitive column
            name, otherwise with ``"S"``

    Returns:
        mapping from ``"<prefix>_<value>"`` to the metric score on that subgroup

    Raises:
        MetricNotApplicable: if the metric cannot be applied per sensitive attribute
    """
    if not metric.apply_per_sensitive:
        raise MetricNotApplicable(
            f"Metric {metric.name} is not applicable per sensitive "
            f"attribute, apply to whole dataset instead")

    num_rows = actual.s.shape[0]
    assert num_rows == actual.x.shape[0]
    assert num_rows == actual.y.shape[0]
    assert prediction.hard.shape[0] == actual.y.shape[0]

    s_columns: List[str] = list(actual.s.columns)
    y_columns: List[str] = list(actual.y.columns)
    assert len(y_columns) == 1

    scores: Dict[str, float] = {}
    for y_col in y_columns:
        for s_col in s_columns:
            for s_value in actual.s[s_col].unique():
                in_group: pd.Series = actual.s[s_col] == s_value

                # Build the subgroup's data with a fresh 0..n-1 index.
                group_x = pd.DataFrame(
                    actual.x.loc[in_group][actual.x.columns],
                    columns=actual.x.columns,
                ).reset_index(drop=True)
                group_s = pd.DataFrame(
                    actual.s.loc[in_group][s_col], columns=[s_col]
                ).reset_index(drop=True)
                group_y = pd.DataFrame(
                    actual.y.loc[in_group][y_col], columns=[y_col]
                ).reset_index(drop=True)
                subset = DataTuple(x=group_x, s=group_s, y=group_y, name=actual.name)

                # Keep the prediction flavour (soft vs. hard) of the input.
                pred_y: Prediction
                if isinstance(prediction, SoftPrediction):
                    group_soft = prediction.soft.loc[in_group].reset_index(drop=True)
                    pred_y = SoftPrediction(soft=group_soft, info=prediction.info)
                else:
                    group_hard = prediction.hard.loc[in_group].reset_index(drop=True)
                    pred_y = Prediction(hard=group_hard, info=prediction.info)

                prefix = s_col if use_sens_name else "S"
                scores[prefix + "_" + str(s_value)] = metric.score(pred_y, subset)

    return scores
示例#9
0
def _train_and_predict(
    train: DataTuple, test: TestTuple, classifier: ClassifierType, C: float, kernel: str, seed: int
) -> Prediction:
    """Fit an instance-weighted classifier on ``train`` and predict labels for ``test.x``.

    ``classifier == "SVM"`` uses the model from ``select_svm``; any other value
    uses a liblinear ``LogisticRegression``. The sample weights are the
    ``"instance weights"`` entry of ``compute_instance_weights(train)``.
    """
    if classifier != "SVM":
        model = LogisticRegression(
            solver="liblinear",
            random_state=np.random.RandomState(seed=seed),
            max_iter=5000,
            C=C,
        )
    else:
        model = select_svm(C=C, kernel=kernel, seed=seed)

    targets = train.y.to_numpy().ravel()
    weights = compute_instance_weights(train)["instance weights"]
    model.fit(train.x, targets, sample_weight=weights)

    predicted = model.predict(test.x)
    return Prediction(hard=pd.Series(predicted))
示例#10
0
    def _predict(self, test: TestTuple, tmp_path: Path,
                 fit_params: FitParams) -> Prediction:
        """Run the prediction script on ``test`` and return the converted predictions.

        Args:
            test: test data, written to ``test.json`` inside ``tmp_path``
            tmp_path: directory for the input and output files
            fit_params: supplies ``model_path`` and the ``label_converter``

        Returns:
            predictions; a raw ``-1`` becomes ``0`` and anything else ``1``
            before ``label_converter.post_only_labels`` is applied
        """
        test_path = tmp_path / "test.json"
        self._create_file_in_zafar_format(test, test_path,
                                          fit_params.label_converter)
        predictions_path = tmp_path / "predictions.json"
        cmd = self._get_predict_cmd(str(test_path), str(fit_params.model_path),
                                    str(predictions_path))
        working_dir = self._code_path.resolve() / self._sub_dir
        self._call_script(cmd, cwd=working_dir)

        # Read through a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with predictions_path.open() as predictions_file:
            raw_predictions = json.load(predictions_file)

        predictions_correct = pd.Series(
            [0 if x == -1 else 1 for x in raw_predictions])
        return Prediction(hard=fit_params.label_converter.post_only_labels(
            predictions_correct))
示例#11
0
    async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
        """Run the external script on ``train``/``test`` and return its predictions.

        Both data sets are written as JSON into a temporary directory, the
        command from ``self._create_command_line`` is awaited, and the
        predictions file it names is read back.

        Args:
            train: training data
            test: test data

        Returns:
            predictions; a raw ``-1`` becomes ``0`` and anything else ``1``
            before ``label_converter.post_only_labels`` is applied
        """
        label_converter = LabelBinarizer()
        with TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            train_path = tmp_path / "train.json"
            test_path = tmp_path / "test.json"
            self._create_file_in_zafar_format(train, train_path, label_converter)
            self._create_file_in_zafar_format(test, test_path, label_converter)
            predictions_path = tmp_path / "predictions.json"

            cmd = self._create_command_line(str(train_path), str(test_path), str(predictions_path))
            working_dir = self._code_path.resolve() / self._sub_dir
            await self._call_script(cmd, cwd=working_dir)

            # Close the handle explicitly; an open file can also block removal
            # of the temporary directory on some platforms.
            with predictions_path.open() as predictions_file:
                raw_predictions = json.load(predictions_file)

        predictions_correct = pd.Series([0 if x == -1 else 1 for x in raw_predictions])
        return Prediction(hard=label_converter.post_only_labels(predictions_correct))
    async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
        """Train with ``train_pr.py``, predict with ``predict_lr.py`` and return the labels.

        Both data sets are written to a temporary directory in Kamishima's
        format. Column 1 of the prediction script's output is used as the
        predictions, cast to ``int``. If the predictions contain more than one
        distinct value, the smallest is replaced by the smallest label found
        in the first column of ``train.y``.

        Args:
            train: training data
            test: test data

        Returns:
            hard predictions
        """
        with TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            train_path = str(workdir / "train.txt")
            test_path = str(workdir / "test.txt")
            self.create_file_in_kamishima_format(train, train_path)
            self.create_file_in_kamishima_format(test, test_path)
            min_class_label = train.y[train.y.columns[0]].min()
            model_path = str(workdir / "model")
            output_path = str(workdir / "output.txt")

            train_cmd = [
                str(self._code_path / "train_pr.py"),
                "-e", str(self.eta),
                "-i", train_path,
                "-o", model_path,
                "--quiet",
            ]
            await self._call_script(train_cmd)

            predict_cmd = [
                str(self._code_path / "predict_lr.py"),
                "-i", test_path,
                "-m", model_path,
                "-o", output_path,
                "--quiet",
            ]
            await self._call_script(predict_cmd)

            predictions = np.loadtxt(output_path)[:, 1].astype(np.float32)

        to_return = pd.Series(predictions).astype(int)

        lowest = to_return.min()
        if lowest != to_return.max():
            to_return = to_return.replace(lowest, min_class_label)
        return Prediction(hard=to_return)
    async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
        """Run Algorithm on the given data asynchronously.

        Args:
            train: training data
            test: test data

        Returns:
            predictions taken from the ``preds`` array of the results file;
            every other array in the file is converted to ``float`` and
            returned in ``info`` under its key
        """
        with TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            train_paths, test_paths = write_as_feather(train, test, tmp_path)
            pred_path = tmp_path / "predictions.npz"
            cmd = self._script_command(train_paths, test_paths, pred_path)
            await self._call_script(cmd)  # wait for script to run

            # np.load on an .npz returns an NpzFile that keeps the archive open
            # and reads arrays lazily. Close it via the context manager before
            # the temporary directory is cleaned up.
            with np.load(pred_path) as results:
                info = {key: float(results[key]) for key in results if key != "preds"}
                hard = pd.Series(results["preds"])
            return Prediction(hard=hard, info=info)
    def _predict(self, model_params: OptimizeResult,
                 test_predictions: Prediction, test: TestTuple) -> Prediction:
        """Randomly flip the given predictions per group, using the rates in ``model_params.x``.

        ``model_params.x`` unpacks to ``(sp2p, sn2p, op2p, on2p)``. Rows whose
        first sensitive column equals 1 use ``sp2p``/``sn2p``; rows where it
        equals 0 use ``op2p``/``on2p``. Within a group, a fraction ``*n2p`` of
        the unfavorable predictions becomes favorable and a fraction
        ``1 - *p2p`` of the favorable predictions becomes unfavorable. The
        affected rows are picked by shuffling with ``self._random``.

        Args:
            model_params: optimisation result holding the four rates in ``x``
            test_predictions: predictions to post-process
            test: test data providing the sensitive attribute

        Returns:
            the adjusted predictions as float64; rows whose sensitive value is
            neither 0 nor 1 are left at 0.0
        """
        sp2p, sn2p, op2p, on2p = model_params.x

        # Boolean masks selecting the two protected groups.
        sens_values = test.s[test.s.columns[0]].to_numpy()
        mask_s1 = sens_values == 1
        mask_s0 = sens_values == 0

        preds: np.ndarray = test_predictions.hard.to_numpy()

        def flip_group(mask, n2p_rate, p2p_rate):
            """Return a flipped copy of the predictions selected by ``mask``."""
            group = preds[mask]
            fair = group.copy()
            positive_idx = (group == self._favorable_label).nonzero()[0]
            negative_idx = (group == self._unfavorable_label).nonzero()[0]
            # Shuffle order (positives first, then negatives) fixes the RNG stream.
            self._random.shuffle(positive_idx)
            self._random.shuffle(negative_idx)

            to_positive = negative_idx[:int(len(negative_idx) * n2p_rate)]
            fair[to_positive] = self._favorable_label
            to_negative = positive_idx[:int(len(positive_idx) * (1 - p2p_rate))]
            fair[to_negative] = self._unfavorable_label
            return fair

        # The s == 1 group must be processed before the s == 0 group.
        self_fair_pred = flip_group(mask_s1, sn2p, sp2p)
        othr_fair_pred = flip_group(mask_s0, on2p, op2p)

        new_labels = np.zeros_like(preds, dtype=np.float64)
        new_labels[mask_s1] = self_fair_pred
        new_labels[mask_s0] = othr_fair_pred

        return Prediction(hard=pd.Series(new_labels))
示例#15
0
    def predict(self, test: TestTuple) -> Prediction:
        """Predict labels for ``test`` by calling the script with ``--mode predict``.

        The test data is written to a temporary ``test.npz``; the predictions
        are loaded from ``predictions.npz`` in the same directory.

        Args:
            test: test data

        Returns:
            predictions
        """
        with TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            test_file = workdir / "test.npz"
            pred_file = workdir / "predictions.npz"
            test.to_npz(test_file)

            base_cmd = self._predict_script_command(self.model_path, test_file, pred_file)
            full_cmd = base_cmd + ["--mode", "predict"]
            self._call_script(full_cmd)  # wait for script to run
            return Prediction.from_npz(pred_file)
    async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
        """Run the algorithm's script on the given data asynchronously.

        Train and test data are saved as ``.npz`` files in a temporary
        directory and the script's ``predictions.npz`` is loaded afterwards.

        Args:
            train: training data
            test: test data

        Returns:
            predictions
        """
        with TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            train_file = workdir / "train.npz"
            test_file = workdir / "test.npz"
            pred_file = workdir / "predictions.npz"

            train.to_npz(train_file)
            test.to_npz(test_file)

            command = self._script_command(train_file, test_file, pred_file)
            await self._call_script(command)  # wait for script to run
            return Prediction.from_npz(pred_file)
示例#17
0
    def _predict(self, test: TestTuple, fit_info: _FitInfo,
                 tmp_path: Path) -> Prediction:
        """Run ``predict_lr.py`` on ``test`` with the model from ``fit_info``.

        Column 1 of the script's output file is used as the predictions, cast
        to ``int``. If the predictions contain more than one distinct value,
        the smallest is replaced by ``fit_info.min_class_label``.

        Args:
            test: test data, written to ``test.txt`` inside ``tmp_path``
            fit_info: supplies ``model_path`` and ``min_class_label``
            tmp_path: directory for the input and output files

        Returns:
            hard predictions
        """
        test_path = tmp_path / "test.txt"
        _create_file_in_kamishima_format(test, test_path)
        output_path = str(tmp_path / "output.txt")

        cmd = [
            str(self._code_path / "predict_lr.py"),
            "-i", str(test_path),
            "-m", str(fit_info.model_path),
            "-o", output_path,
            "--quiet",
        ]
        self._call_script(cmd)

        scores = np.loadtxt(output_path)
        to_return = pd.Series(scores[:, 1].astype(np.float32)).astype(int)

        lowest = to_return.min()
        if lowest != to_return.max():
            to_return = to_return.replace(lowest, fit_info.min_class_label)
        return Prediction(hard=to_return)
 def run(self, train: DataTuple, test: TestTuple) -> Prediction:
     """Predict the mode of ``train.y`` (first row of ``mode()``) for every row of ``test.x``."""
     majority = train.y.mode().iloc[0].to_numpy()  # type: ignore[attr-defined]
     repeated = majority.repeat(len(test.x))
     return Prediction(hard=pd.Series(repeated))
示例#19
0
 def predict(self, test: TestTuple) -> Prediction:
     """Predict ``test.x`` with ``self.clf`` and report its first ``C_`` entry under ``"C"``."""
     hard = pd.Series(self.clf.predict(test.x))
     return Prediction(hard=hard, info={"C": self.clf.C_[0]})
 def predict(self, test: TestTuple) -> Prediction:
     """Repeat ``self.maj`` once per row of ``test.x`` and return it as the prediction."""
     constant = self.maj.repeat(len(test.x))
     return Prediction(hard=pd.Series(constant))
示例#21
0
 def predict(self, test: TestTuple) -> Prediction:
     """Return the predictions of ``self.clf`` on ``test.x``."""
     predicted = self.clf.predict(test.x)
     return Prediction(hard=pd.Series(predicted))
示例#22
0
 def run(self, train: DataTuple, test: TestTuple) -> Prediction:
     """Return a copy of the first label column of ``test`` as the prediction.

     ``test`` must be a ``DataTuple`` so that ``test.y`` is available;
     ``train`` is not used.
     """
     assert isinstance(test, DataTuple), "test must be a DataTuple."
     label_column = test.y.columns[0]
     true_labels = test.y[label_column].copy()
     return Prediction(hard=true_labels)
示例#23
0
 def run(self, train: DataTuple, test: TestTuple) -> Prediction:
     """Pass the true test labels through ``DPFlip`` and return its result.

     The labels serve as both prediction arguments of ``DPFlip.run``, with
     ``test`` as the data for each. ``test`` must be a ``DataTuple``;
     ``train`` is not used.
     """
     assert isinstance(test, DataTuple), "test must be a DataTuple."
     true_labels = test.y[test.y.columns[0]].copy()
     test_preds = Prediction(true_labels)
     flipper = DPFlip(seed=self.seed)
     return flipper.run(test_preds, test, test_preds, test)
示例#24
0
    def predict(self, test: TestTuple) -> Prediction:
        """Draw one value per row of ``test.x`` at random from ``self.vals``.

        A new ``RandomState`` seeded with ``self.seed`` is created on every call.
        """
        rng = np.random.RandomState(self.seed)

        # Transposing puts the first column of self.vals into row 0.
        candidates = self.vals.T.to_numpy()[0]
        draws = rng.choice(candidates, test.x.shape[0])
        return Prediction(hard=pd.Series(draws))
示例#25
0
def _predict(model: sklearn.linear_model._base.LinearModel, test: TestTuple) -> Prediction:
    """Return the predictions of ``model`` on ``test.x`` wrapped in a ``Prediction``."""
    predicted = model.predict(test.x)
    return Prediction(hard=pd.Series(predicted))
示例#26
0
    async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
        """Run the GPyT code with target odds and return hard predictions.

        If ``self.odds`` is ``None``, the script is run twice. The first run
        uses a train/dev split of the training data; its predictions feed
        ``compute_odds``, and both ``p_ybary1_*`` entries are set to their
        minimum. The second run uses the real train/test data with those odds
        and twice the number of epochs. Otherwise ``self.odds`` is merged
        into the flags and only the final run happens.

        Args:
            train: training data
            test: test data

        Returns:
            hard predictions (``pred_mean`` thresholded at 0.5), passed
            through the converter from ``_fix_labels``
        """
        (ytrain, ), label_converter = _fix_labels([train.y.to_numpy()])
        raw_data = dict(
            xtrain=train.x.to_numpy(),
            strain=train.s.to_numpy(),
            ytrain=ytrain,
            xtest=test.x.to_numpy(),
            stest=test.s.to_numpy(),
            # ytest=ytest,
        )
        parameters = self._additional_parameters(raw_data)

        with TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            # Both runs read their input from this same file; it is rewritten in between.
            data_path = tmp_path / Path("data.npz")
            model_name = "local"  # f"run{self.counter}_s_as_input_{self.s_as_input}"
            flags = _flags(
                parameters,
                str(data_path),
                tmpdir,
                self.s_as_input,
                model_name,
                len(raw_data["ytrain"]),
            )

            if self.odds is None:
                # Split the training data into train and dev and save it to `data.npz`
                train_dev_data = split_train_dev(raw_data["xtrain"],
                                                 raw_data["ytrain"],
                                                 raw_data["strain"])
                np.savez(data_path, **train_dev_data)

                # First run
                await self._run_gpyt(flags)

                # Read the results from the numpy file 'predictions.npz'
                with (tmp_path / model_name /
                      PRED_FNAME).open("rb") as file_obj:
                    output = np.load(file_obj)
                    prediction_on_train = output["pred_mean"]
                preds = (prediction_on_train > 0.5).astype(np.int32)
                # The "test" entries of the split are the dev part of the training data.
                odds = compute_odds(train_dev_data["ytest"], preds,
                                    train_dev_data["stest"])

                # Enforce equality of opportunity
                opportunity = min(odds["p_ybary1_s0"], odds["p_ybary1_s1"])
                odds["p_ybary1_s0"] = opportunity
                odds["p_ybary1_s1"] = opportunity
                # Double the epochs for the second run and add the computed odds to the flags.
                flags.update({"epochs": 2 * flags["epochs"], **odds})
            else:
                # Odds were supplied by the caller; no calibration run is needed.
                flags.update(self.odds)

            # Save with real test data
            np.savez(data_path, **raw_data)

            # Second run
            await self._run_gpyt(flags)

            # Read the results from the numpy file 'predictions.npz'
            with (tmp_path / model_name / PRED_FNAME).open("rb") as file_obj:
                output = np.load(file_obj)
                pred_mean = output["pred_mean"]

        # Convert the result to the expected format
        predictions = label_converter(
            (pred_mean > 0.5).astype(raw_data["ytrain"].dtype)[:, 0])
        return Prediction(hard=pd.Series(predictions))
 def run(self, train: DataTuple, test: TestTuple) -> Prediction:
     """Fit the model from ``select_mlp`` on ``train`` and predict ``test.x``."""
     model = select_mlp(self.hidden_layer_sizes, self.activation)
     targets = train.y.to_numpy().ravel()
     model.fit(train.x, targets)
     predicted = model.predict(test.x)
     return Prediction(hard=pd.Series(predicted))
 def run(self, train: DataTuple, test: Union[DataTuple, TestTuple]) -> Prediction:
     """Fit the model from ``select_svm`` on ``train`` and predict ``test.x``."""
     model = select_svm(self.C, self.kernel)
     targets = train.y.to_numpy().ravel()
     model.fit(train.x, targets)
     predicted = model.predict(test.x)
     return Prediction(hard=pd.Series(predicted))