示例#1
0
def test_subset(dataset_df):
    """Check ``Dataset.subset`` for a boolean mask, index labels and positions.

    The index is shifted first so that selecting by label and selecting by
    position pick different rows, which lets the test tell the two apart.
    """
    # Shift the index so that labels don't correspond to positions.
    shifted = dataset_df.set_index(dataset_df.index + 51)
    shifted["avg_col"] = (shifted["a"] + shifted["b"]) / 2
    ds = Dataset(shifted, "label", feature_cols=["a", "b", "c", "d", "e"])

    expected_features = ["a", "b", "c", "d", "e"]
    expected_columns = ["a", "b", "c", "d", "e", "avg_col"]

    def check_subset(subset, expected_size, expected_df):
        # Assertions shared by every selection mode, compared against the
        # equivalent selection made directly on the underlying DataFrame.
        assert isinstance(subset, Dataset)
        assert subset.size == expected_size
        assert subset.feature_names == expected_features
        assert subset.column_names == expected_columns
        assert_series_equal(subset.labels, expected_df["label"])
        assert list(subset.other_cols.columns) == ["avg_col"]
        assert_frame_equal(subset.df, expected_df)

    # Selection with a boolean mask.
    check_subset(ds.subset(ds.df["c"] == "C"), 15, shifted[shifted["c"] == "C"])

    # Selection by index label: labels 51..60 are the first 10 rows.
    ii = list(range(51, 61))
    check_subset(ds.subset(ii), 10, shifted.iloc[range(10)])

    # Selection by position: the same integers now address rows 51..60.
    check_subset(ds.subset(ii, by_position=True), 10, shifted.iloc[ii])
示例#2
0
def test_dataset_other_cols(dataset_df):
    """Check that non-feature, non-label columns are exposed as ``other_cols``.

    Also checks that a column added to the underlying DataFrame after
    construction is picked up as an extra "other" column.
    """
    expected_features = ["a", "b", "d"]
    wrapped = Dataset(dataset_df, "label", feature_cols=["a", "b", "d"])

    assert wrapped.size == 100
    assert wrapped.feature_names == expected_features
    assert wrapped.column_names == ["a", "b", "d", "c", "e"]
    assert_frame_equal(wrapped.features, dataset_df[["a", "b", "d"]])
    assert_series_equal(wrapped.labels, dataset_df["label"])
    assert_frame_equal(wrapped.other_cols, dataset_df[["c", "e"]])
    # The wrapper holds the very same DataFrame object, not a copy.
    assert wrapped.df is dataset_df

    # Adding a column through the wrapper's frame must not change the
    # features, only extend the "other" columns.
    average = (dataset_df["a"] + dataset_df["b"]) / 2
    wrapped.df["avg_col"] = average
    assert_frame_equal(wrapped.other_cols, dataset_df[["c", "e", "avg_col"]])
    assert wrapped.feature_names == expected_features
    assert wrapped.column_names == ["a", "b", "d", "c", "e", "avg_col"]
示例#3
0
文件: sampling.py 项目: mozilla/PRESC
def labeling(X, original_classifier, label_col="class"):
    """Label the samples of a dataset with the predictions of a classifier.

    Parameters
    ----------
    X : pandas DataFrame
        Samples to label; contains the features but not the labels. It is
        copied, so the caller's DataFrame is left unmodified.
    original_classifier : sklearn-type classifier
        Classifier whose ``predict`` method provides the labels.
    label_col : str
        Name of the column in which the predicted labels are stored.

    Returns
    -------
    presc.dataset.Dataset
        PRESC Dataset wrapping the samples together with their labels.
    """
    # Work on a copy so the label column is not added to the caller's data.
    labeled_samples = X.copy()

    # Predict before the label column exists, so only features are passed in.
    predictions = original_classifier.predict(labeled_samples)
    labeled_samples[label_col] = predictions

    return Dataset(labeled_samples, label_col=label_col)
示例#4
0
def test_summary_metrics():
    """Check ``summary_metrics`` against known values for a tiny 2D problem.

    A linear SVC is trained on five points, copied into a depth-2 decision
    tree through ``ClassifierCopy`` with grid sampling, and the metrics
    computed on a small test set and on generated synthetic data are compared
    with precomputed expected values.
    """
    seed = 42
    columns = ["x", "y", "label"]

    # Original data
    train_df = pd.DataFrame(
        {"x": [0, 1, 0, 2, 1], "y": [1, 0, 2, 0, 1], "label": [0, 0, 1, 1, 1]},
        columns=columns,
    )
    test_df = pd.DataFrame(
        {"x": [2, 0, 0, 1, 2], "y": [1, 0, 2, 0, 2], "label": [0, 0, 1, 0, 1]},
        columns=columns,
    )
    test_dataset = Dataset(test_df, label_col="label")

    # Original classifier
    svc = SVC(kernel="linear", random_state=seed)
    svc.fit(train_df[["x", "y"]], train_df["label"])

    # Copy classifier
    feature_ranges = {"x": {"min": 0, "max": 2}, "y": {"min": 0, "max": 2}}
    tree = DecisionTreeClassifier(max_depth=2, random_state=seed)
    copier = ClassifierCopy(
        svc,
        tree,
        grid_sampling,
        nsamples=20,
        label_col="label",
        feature_parameters=feature_ranges,
    )
    copier.copy_classifier()

    # Generated data
    synthetic_dataset = copier.generate_synthetic_data(
        generated_nsamples=5, random_state=seed, label_col="label"
    )

    computed = summary_metrics(
        original_model=svc,
        copy_model=copier,
        test_data=test_dataset,
        synthetic_data=synthetic_dataset,
        show_results=True,
    )

    expected = {
        "Original Model Accuracy (test)": 0.6,
        "Copy Model Accuracy (test)": 0.8,
        "Empirical Fidelity Error (synthetic)": 0.0625,
        "Empirical Fidelity Error (test)": 0.2,
        "Replacement Capability (synthetic)": 0.9375,
        "Replacement Capability (test)": 1.33333333,
    }

    # Every metric that was returned must match its expected value.
    for name, value in computed.items():
        np.testing.assert_almost_equal(value, expected[name], decimal=6)
示例#5
0
def test_dataset(dataset_df):
    """Check the basic accessors of a Dataset built with default features.

    With no explicit ``feature_cols``, every column except the label is
    expected to be a feature and there are no "other" columns.
    """
    all_features = ["a", "b", "c", "d", "e"]
    wrapped = Dataset(dataset_df, "label")

    assert wrapped.size == 100
    assert wrapped.feature_names == all_features
    assert wrapped.column_names == all_features

    features_only = dataset_df.drop(columns=["label"])
    assert_frame_equal(wrapped.features, features_only)
    assert_series_equal(wrapped.labels, dataset_df["label"])

    assert wrapped.other_cols.size == 0
    # The wrapper holds the very same DataFrame object, not a copy.
    assert wrapped.df is dataset_df
示例#6
0
    def generate_synthetic_data(self, **k_mod_sampling_parameters):
        """Generate a labeled synthetic dataset using the original model.

        Samples are drawn with the sampling function and sampling parameters
        stored on the instance, and are then labeled with the original model
        unless the sampler already provides the labels. To regenerate the
        same data, use the same random seed.

        Parameters
        ----------
        **k_mod_sampling_parameters :
            Overrides for the stored sampling parameters. Only "nsamples"
            and "random_state" are taken into account; any other keyword is
            ignored.

        Returns
        -------
        presc.dataset.Dataset
            PRESC Dataset with the generated samples and their labels.
        """
        # Copy the stored parameters so the overrides only affect this call.
        sampling_kwargs = self.k_sampling_parameters.copy()
        for overridable in ("nsamples", "random_state"):
            if overridable in k_mod_sampling_parameters:
                sampling_kwargs[overridable] = k_mod_sampling_parameters[
                    overridable]

        X_generated = mixed_data_sampling(
            numerical_sampling=self.sampling_function, **sampling_kwargs)

        # A non-balancing sampler returns only the features, so the labels
        # have to be obtained from the original model.
        if not self.balancing_sampler:
            return labeling(X_generated, self.original,
                            label_col=self.label_col)

        # A balancing sampler returns the features AND the labels, so the
        # data only needs to be wrapped.
        return Dataset(X_generated, label_col=self.label_col)
示例#7
0
文件: conftest.py 项目: mozilla/PRESC
def test_dataset(dataset_df, in_test_set):
    """Wrap the rows of ``dataset_df`` selected by ``in_test_set`` in a Dataset."""
    test_rows = dataset_df[in_test_set]
    return Dataset(test_rows, label_col="label")
示例#8
0
def multiclass_gaussians(
    nsamples=3000,
    nfeatures=30,
    nclasses=15,
    center_low=2,
    center_high=10,
    scale_low=1,
    scale_high=1,
):
    """Generates a multidimensional gaussian dataset with multiple classes.

    Class zero is a multidimensional normal distribution centered at the
    origin. Every additional class is another gaussian distribution centered
    at a random distance between `center_low` and `center_high` from the
    origin, in a random direction. The standard deviation of every class,
    including class zero, is drawn uniformly between `scale_low` and
    `scale_high`.

    The samples are drawn from numpy's global random state (`np.random`), so
    seed it beforehand to obtain reproducible data.

    Parameters
    ----------
    nsamples : int
        Maximum number of samples to generate. Actual number of samples depends
        on the number of classes, because the function yields a balanced
        dataset with the same number of samples per class.
    nfeatures : int
        Number of features of the generated samples.
    nclasses : int
        Number of classes in the generated dataset.
    center_low : float
        Minimum translation from the origin of the center of the gaussian
        distributions corresponding to additional classes.
    center_high : float
        Maximum translation from the origin of the center of the gaussian
        distributions corresponding to additional classes.
    scale_low : float
        Minimum value for the standard deviation of the gaussian
        distributions.
    scale_high : float
        Maximum value for the standard deviation of the gaussian
        distributions.

    Returns
    -------
    presc.dataset.Dataset
        Outputs a PRESC Dataset with the generated samples and their labels.
        The features are in integer-named columns and the labels in the
        "class" column.
    """
    class_samples = int(nsamples / nclasses)

    # Create class zero drawing samples from a `nfeatures`-dimensional normal
    # distribution centered at the origin and with a standard deviation between
    # `scale_low` and `scale_high`.
    scale = np.random.uniform(low=scale_low, high=scale_high)
    class_zero = pd.DataFrame(
        scale * np.random.normal(0, 1, (class_samples, nfeatures))
    )
    class_zero["class"] = 0

    # Collect one frame per class and concatenate them once at the end;
    # concatenating inside the loop would copy the accumulated data again on
    # every iteration.
    class_frames = [class_zero]

    # Create additional classes centered at `center` with standard deviation
    # `scale`. The order of the random draws is kept fixed so that a given
    # seed always yields the same dataset.
    for label in range(1, nclasses):
        # Generate a normalized vector in a random direction
        direction = np.random.normal(0, 1, nfeatures)
        direction = direction / np.linalg.norm(direction)

        # Generate a random distance from the origin to define the center of
        # each gaussian
        distance = np.random.uniform(low=center_low, high=center_high)
        center = distance * direction

        # Generate a random scaling for each gaussian
        scale = np.random.uniform(low=scale_low, high=scale_high)

        # Generate normally distributed random samples for this class
        samples = center + scale * np.random.normal(
            0, 1, (class_samples, nfeatures)
        )
        class_frame = pd.DataFrame(samples)
        class_frame["class"] = label
        class_frames.append(class_frame)

    df_pred = pd.concat(class_frames, ignore_index=True)

    # Convert into PRESC Dataset
    return Dataset(df_pred, label_col="class")
def vehicles_dataset_wrapper():
    """Load the vehicles CSV into a Dataset and split it into test/train.

    Returns
    -------
    Dataset
        Wrapper around the vehicles data on which ``split_test_train`` has
        already been called with a test size of 0.4.
    """
    vehicles_df = pd.read_csv(VEHICLES_DATA_PATH)
    wrapper = Dataset(vehicles_df, VEHICLES_LABEL_COL)
    # `random_state` is not a parameter: it is looked up in the enclosing
    # module scope.
    wrapper.split_test_train(test_size=0.4, random_state=random_state)
    return wrapper
示例#10
0
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import ShuffleSplit

from pathlib import Path

# Resolve the data file relative to this script rather than to the current
# working directory, so the script can be launched from anywhere.
THIS_DIR = Path(__file__).parent
# NOTE: despite its name, this is the path of the CSV file itself, not of a
# directory.
DATASET_DIR = THIS_DIR / ".." / ".." / "datasets" / "winequality.csv"

# Load the dataset.

df = pd.read_csv(DATASET_DIR)
# "recommend" is used as the label below; "quality" is removed so it is not
# treated as a feature (presumably because the label is derived from it --
# TODO confirm against the dataset description).
df = df.drop(columns=["quality"])
dataset = Dataset(df, label_col="recommend")

# Single shuffled 70/30 train/test split with a fixed seed for
# reproducibility.
splitter = ShuffleSplit(n_splits=1, test_size=0.3, random_state=543)
# `split` yields arrays of positional row indices, hence `by_position=True`
# when subsetting.
train_ind, test_ind = next(splitter.split(dataset.features))
train_dataset = dataset.subset(train_ind, by_position=True)
test_dataset = dataset.subset(test_ind, by_position=True)

# Set up the model

# Standardize the features, then fit an SVC with balanced class weights.
model = Pipeline([("scaler", StandardScaler()),
                  ("clf", SVC(class_weight="balanced"))])
# Wrap the pipeline in a ClassificationModel and train it on the training
# subset only.
cm = ClassificationModel(model)
cm.train(train_dataset)

presc_report = ReportRunner()
presc_report.run(model=cm,
示例#11
0
import pandas as pd

from presc.dataset import Dataset
from presc.model import ClassificationModel

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Better quality plots
from IPython.display import set_matplotlib_formats

set_matplotlib_formats("svg")

# Load the dataset.

# NOTE: this relative path is resolved against the current working directory,
# so the script must be run from the directory it expects.
df = pd.read_csv("../../datasets/winequality.csv")
# "recommend" is used as the label below; "quality" is removed so it is not
# treated as a feature (presumably because the label is derived from it --
# TODO confirm against the dataset description).
df = df.drop(columns=["quality"])

# NOTE(review): this call uses the `label=` keyword; confirm that it matches
# the signature of the installed `presc.dataset.Dataset`.
dataset = Dataset(df, label="recommend")
# Presumably 0.3 is the fraction of rows held out for testing -- TODO confirm
# against `Dataset.split_test_train`.
dataset.split_test_train(0.3)

# Set up the model

# Standardize the features, then fit an SVC with balanced class weights.
model = Pipeline([("scaler", StandardScaler()), ("clf", SVC(class_weight="balanced"))])
# The dataset is handed to the wrapper together with `should_train=True`.
cm = ClassificationModel(model, dataset, should_train=True)

# Config options (TODO: read from file)
# Nested as {<evaluation name>: {<option>: <value>}}.
config = {"misclass_rate": {"num_bins": 20}}
示例#12
0
def wine_dataset_wrapper(expected_wine_dataset):
    """Wrap the expected wine data in a Dataset labeled by ``WINE_LABEL_COL``."""
    return Dataset(expected_wine_dataset, WINE_LABEL_COL)
示例#13
0
def test_label_not_in_dataset(expected_wine_dataset):
    """Check that a label column missing from the data raises ``KeyError``."""
    missing_label = "wrong_label"
    with pytest.raises(KeyError):
        Dataset(expected_wine_dataset, missing_label)