def test_invalid_outcome_name(self, data_type):
    """Check that dice_ml.Data raises ValueError for a bad or missing outcome_name.

    ``data_type`` selects the scenario:

    * ``DataTypeCombinations.Incorrect`` -- ``outcome_name`` is the integer ``1``.
    * ``DataTypeCombinations.AsNone`` -- ``outcome_name`` is ``None``.
    * anything else -- ``outcome_name`` is not passed at all.

    The message is checked on the raised exception (``ve.value``) rather
    than on the ``ExceptionInfo`` wrapper returned by ``pytest.raises``.
    """
    iris = load_iris(as_frame=True)
    feature_names = iris.feature_names
    dataset = iris.frame

    if data_type == DataTypeCombinations.Incorrect:
        with pytest.raises(ValueError) as ve:
            dice_ml.Data(dataframe=dataset, continuous_features=feature_names, outcome_name=1)

        assert "should provide the name of outcome feature as a string" in str(ve.value)
    elif data_type == DataTypeCombinations.AsNone:
        with pytest.raises(ValueError) as ve:
            dice_ml.Data(dataframe=dataset, continuous_features=feature_names, outcome_name=None)

        assert "should provide the name of outcome feature as a string" in str(ve.value)
    else:
        with pytest.raises(ValueError) as ve:
            dice_ml.Data(dataframe=dataset, continuous_features=feature_names)

        assert "should provide the name of outcome feature" in str(ve.value)
def _get_exp(self, backend, method="random", is_public_data_interface=True):
    """Build a dice_ml.Dice explainer for the adult income model.

    :param backend: backend identifier, used both to locate the saved model
        and to construct the dice_ml.Model wrapper.
    :param method: explanation method forwarded to dice_ml.Dice.
    :param is_public_data_interface: when true the data interface is built
        from the loaded dataframe; otherwise it is built from a dictionary
        of feature ranges / categories only.
    :return: the constructed dice_ml.Dice object.
    """
    if not is_public_data_interface:
        # Data interface described only by feature metadata (no dataframe).
        feature_metadata = {
            'age': [17, 90],
            'workclass': ['Government', 'Other/Unknown', 'Private', 'Self-Employed'],
            'education': [
                'Assoc', 'Bachelors', 'Doctorate', 'HS-grad',
                'Masters', 'Prof-school', 'School', 'Some-college',
            ],
            'marital_status': ['Divorced', 'Married', 'Separated', 'Single', 'Widowed'],
            'occupation': [
                'Blue-Collar', 'Other/Unknown', 'Professional',
                'Sales', 'Service', 'White-Collar',
            ],
            'race': ['Other', 'White'],
            'gender': ['Female', 'Male'],
            'hours_per_week': [1, 99],
        }
        data_interface = dice_ml.Data(features=feature_metadata, outcome_name='income')
    else:
        adult_income = helpers.load_adult_income_dataset()
        data_interface = dice_ml.Data(
            dataframe=adult_income,
            continuous_features=['age', 'hours_per_week'],
            outcome_name='income',
        )

    model_path = helpers.get_adult_income_modelpath(backend=backend)
    model_interface = dice_ml.Model(model_path=model_path, backend=backend)
    return dice_ml.Dice(data_interface, model_interface, method=method)
def test_invalid_continuous_features(self, data_type):
    """Check that dice_ml.Data raises ValueError for bad or missing continuous_features.

    ``data_type`` selects the scenario:

    * ``DataTypeCombinations.Incorrect`` -- a numpy array is passed instead of a list.
    * ``DataTypeCombinations.AsNone`` -- ``continuous_features`` is ``None``.
    * anything else -- ``continuous_features`` is not passed at all.

    The message is checked on the raised exception (``ve.value``) rather
    than on the ``ExceptionInfo`` wrapper returned by ``pytest.raises``.
    """
    iris = load_iris(as_frame=True)
    dataset = iris.frame

    if data_type == DataTypeCombinations.Incorrect:
        with pytest.raises(ValueError) as ve:
            dice_ml.Data(dataframe=dataset, continuous_features=np.array(iris.feature_names),
                         outcome_name='target')

        assert "should provide the name(s) of continuous features in the data as a list" in str(ve.value)
    elif data_type == DataTypeCombinations.AsNone:
        with pytest.raises(ValueError) as ve:
            dice_ml.Data(dataframe=dataset, continuous_features=None, outcome_name='target')

        assert "should provide the name(s) of continuous features in the data as a list" in str(ve.value)
    else:
        with pytest.raises(ValueError) as ve:
            dice_ml.Data(dataframe=dataset, outcome_name='target')

        assert 'continuous_features should be provided' in str(ve.value)
def test_invalid_dataframe(self, data_type):
    """Check that dice_ml.Data rejects a dataframe argument that is not a pandas dataframe.

    For ``DataTypeCombinations.Incorrect`` the raw ``.values`` of the iris
    frame are passed; for every other ``data_type`` ``None`` is passed. Both
    cases expect the same ValueError message.
    """
    iris = load_iris(as_frame=True)
    feature_names = iris.feature_names
    dataset = iris.frame

    if data_type == DataTypeCombinations.Incorrect:
        bad_dataframe = dataset.values
    else:
        bad_dataframe = None

    with pytest.raises(ValueError, match="should provide a pandas dataframe"):
        dice_ml.Data(
            dataframe=bad_dataframe,
            continuous_features=feature_names,
            outcome_name='target',
        )
def test_numeric_categories(self, desired_range, method, create_boston_data):
    """Generate regression counterfactuals when 'CHAS' is not listed as continuous.

    A RandomForestRegressor is fitted on the training split provided by
    ``create_boston_data``; every feature except ``'CHAS'`` is declared
    continuous, and counterfactuals are requested for the first test row.

    :param desired_range: forwarded to ``generate_counterfactuals``.
    :param method: explanation method forwarded to ``dice_ml.Dice``.
    :param create_boston_data: tuple of
        ``(x_train, x_test, y_train, y_test, feature_names)``.
    """
    x_train, x_test, y_train, y_test, feature_names = create_boston_data

    regressor = RandomForestRegressor(n_estimators=10, max_depth=4, random_state=777)
    model = regressor.fit(x_train, y_train)

    dataset_train = x_train.copy()
    dataset_train['Outcome'] = y_train

    # Build a filtered copy instead of calling feature_names.remove('CHAS'):
    # removing in place mutates the list object handed over by the fixture,
    # so any other consumer of that same list would see it without 'CHAS'
    # (and a second removal would raise ValueError).
    continuous_features = [name for name in feature_names if name != 'CHAS']

    d = dice_ml.Data(dataframe=dataset_train, continuous_features=continuous_features,
                     outcome_name='Outcome')
    m = dice_ml.Model(model=model, backend='sklearn', model_type='regressor')
    exp = dice_ml.Dice(d, m, method=method)

    cf_explanation = exp.generate_counterfactuals(
        query_instances=x_test.iloc[0:1],
        total_CFs=10,
        desired_range=desired_range)

    assert cf_explanation is not None
def on_button_clicked(b):
    """Button callback: build a DiCE explainer and generate counterfactuals.

    Progress text is printed inside the ``button_output`` context and the
    resulting explanation is stored on ``explore.dice_exp``. The argument
    ``b`` is not used in the body.
    """
    with button_output:
        print("Generating explanations may take a few minutes...")
        print()

        # Set up the data interface, the model wrapper and the explainer.
        data_interface = dice_ml.Data(
            dataframe=dataname,
            continuous_features=cont_feat,
            outcome_name=outcome_name,
        )
        # Backend name is 'TF' plus the first character of the TF version (e.g. TF2).
        tf_backend = 'TF' + tf.__version__[0]
        model_interface = dice_ml.Model(model=modelname, backend=tf_backend)
        explainer = dice_ml.Dice(data_interface, model_interface)

        # Generate the counterfactuals for the selected query values.
        query_instance = dict(zip(feature_names, explore.queryvaluestouse))

        if f.weightdropdown.value == 'Use Default Weights':
            dice_exp = explainer.generate_counterfactuals(
                query_instance,
                total_CFs=num_exp.value,
                desired_class="opposite",
                features_to_vary=f.useusing,
                proximity_weight=prox.value,
                diversity_weight=div.value,
            )
        elif f.weightdropdown.value == 'Choose Your Own Weights':
            # Pair each varied feature with the weight chosen for it.
            custom_weights = dict(zip(f.useusing, f.weightvaluestouse))
            dice_exp = explainer.generate_counterfactuals(
                query_instance,
                total_CFs=num_exp.value,
                desired_class="opposite",
                features_to_vary=f.useusing,
                feature_weights=custom_weights,
                proximity_weight=prox.value,
                diversity_weight=div.value,
            )

        # NOTE(review): if neither dropdown option matched, dice_exp was never
        # bound and this assignment raises.
        explore.dice_exp = dice_exp
def __init__(self, mlmodel: MLModel, hyperparams: Optional[Dict] = None) -> None:
    """Wrap ``mlmodel`` in the dice_ml data/model/explainer objects.

    :param mlmodel: model wrapper; its ``backend`` must be ``"tensorflow"``
        or ``"pytorch"`` and its ``data`` attribute supplies the dataframe,
        continuous/categorical feature names and target name.
    :param hyperparams: optional overrides merged with
        ``self._DEFAULT_HYPERPARAMS``.
    :raises ValueError: if ``mlmodel.backend`` is not a supported backend.
    """
    supported_backends = ["tensorflow", "pytorch"]
    backend_is_supported = mlmodel.backend in supported_backends
    if not backend_is_supported:
        raise ValueError(
            f"{mlmodel.backend} is not in supported backends {supported_backends}"
        )

    super().__init__(mlmodel)

    data = mlmodel.data
    self._continuous = data.continuous
    self._categorical = data.categorical
    self._target = data.target
    self._model = mlmodel

    params = merge_default_parameters(hyperparams, self._DEFAULT_HYPERPARAMS)

    # Build the dice_ml data structure, model wrapper and explainer.
    self._dice_data = dice_ml.Data(
        dataframe=mlmodel.data.df,
        continuous_features=self._continuous,
        outcome_name=self._target,
    )
    # The dice_ml model wrapper is always created with the "sklearn" backend here.
    self._dice_model = dice_ml.Model(model=mlmodel, backend="sklearn")
    self._dice = dice_ml.Dice(self._dice_data, self._dice_model, method="random")

    self._num = params["num"]
    self._desired_class = params["desired_class"]
    self._post_hoc_sparsity_param = params["posthoc_sparsity_param"]
def test_user_data_corruption(self):
    """Verify that constructing dice_ml.Data leaves the caller's dataframe unchanged."""
    original_frame = helpers.load_adult_income_dataset()
    snapshot = original_frame.copy()

    # Only the construction matters here; the resulting object is discarded.
    dice_ml.Data(
        dataframe=original_frame,
        continuous_features=['age', 'hours_per_week'],
        outcome_name='income',
        permitted_range={'age': [45, 60]},
        continuous_features_precision={'hours_per_week': 2},
    )

    pd.testing.assert_frame_equal(original_frame, snapshot)
def data_object():
    """Return a dice_ml.Data object for the adult income dataset.

    The object is created with a permitted range for 'age' and a precision
    setting for 'hours_per_week'.
    """
    adult_income = helpers.load_adult_income_dataset()
    continuous = ['age', 'hours_per_week']
    age_range = {'age': [45, 60]}
    precision = {'hours_per_week': 2}
    return dice_ml.Data(
        dataframe=adult_income,
        continuous_features=continuous,
        outcome_name='income',
        permitted_range=age_range,
        continuous_features_precision=precision,
    )
def _get_exp(self, backend, method="random"):
    """Build a dice_ml.Dice explainer for the adult income model.

    :param backend: backend identifier, used both to locate the saved model
        and to construct the dice_ml.Model wrapper.
    :param method: explanation method forwarded to dice_ml.Dice.
    :return: the constructed dice_ml.Dice object.
    """
    adult_income = helpers.load_adult_income_dataset()
    data_interface = dice_ml.Data(
        dataframe=adult_income,
        continuous_features=['age', 'hours_per_week'],
        outcome_name='income',
    )

    model_path = helpers.get_adult_income_modelpath(backend=backend)
    model_interface = dice_ml.Model(model_path=model_path, backend=backend)

    return dice_ml.Dice(data_interface, model_interface, method=method)
def test_min_max_equal(self):
    """Check that normalising the min-max-equal dataset gives 0 for every 'Numerical' value."""
    frame = helpers.load_min_max_equal_dataset()
    data_interface = dice_ml.Data(
        dataframe=frame,
        continuous_features=['Numerical'],
        outcome_name='Outcome',
    )

    normalized = data_interface.normalize_data(data_interface.data_df)
    is_zero = normalized['Numerical'] == 0
    assert all(is_zero)
def regression_exp_object(method="random"):
    """Return a dice_ml.Dice explainer for the custom regression test dataset.

    :param method: explanation method forwarded to dice_ml.Dice.
    """
    regression_frame = helpers.load_custom_testing_dataset_regression()
    data_interface = dice_ml.Data(
        dataframe=regression_frame,
        continuous_features=['Numerical'],
        outcome_name='Outcome',
    )

    pipeline_path = helpers.get_custom_dataset_modelpath_pipeline_regression()
    model_interface = dice_ml.Model(
        model_path=pipeline_path,
        backend='sklearn',
        model_type='regressor',
    )

    return dice_ml.Dice(data_interface, model_interface, method=method)
def random_binary_classification_exp_object():
    """Return a 'random'-method dice_ml.Dice explainer for the custom test dataset."""
    custom_frame = helpers.load_custom_testing_dataset()
    data_interface = dice_ml.Data(
        dataframe=custom_frame,
        continuous_features=['Numerical'],
        outcome_name='Outcome',
    )

    pipeline_path = helpers.get_custom_dataset_modelpath_pipeline()
    model_interface = dice_ml.Model(model_path=pipeline_path, backend='sklearn')

    return dice_ml.Dice(data_interface, model_interface, method='random')
def pyt_exp_object():
    """Return a dice_ml.Dice explainer for the adult income model with the 'PYT' backend."""
    pyt_backend = 'PYT'

    adult_income = helpers.load_adult_income_dataset()
    data_interface = dice_ml.Data(
        dataframe=adult_income,
        continuous_features=['age', 'hours_per_week'],
        outcome_name='income',
    )

    model_path = helpers.get_adult_income_modelpath(backend=pyt_backend)
    model_interface = dice_ml.Model(model_path=model_path, backend=pyt_backend)

    # No method is passed, so dice_ml.Dice uses its own default.
    return dice_ml.Dice(data_interface, model_interface)
def public_data_object():
    """Build the dataframe-backed dice_ml.Data object for the adult income dataset."""
    adult_income = helpers.load_adult_income_dataset()
    data_interface = dice_ml.Data(
        dataframe=adult_income,
        continuous_features=['age', 'hours_per_week'],
        outcome_name='income',
    )
    return data_interface
def binary_classification_exp_object_out_of_order(method="random"):
    """Return a dice_ml.Dice explainer built on the outcome-not-last-column dataset.

    :param method: explanation method forwarded to dice_ml.Dice.
    """
    out_of_order_frame = helpers.load_outcome_not_last_column_dataset()
    data_interface = dice_ml.Data(
        dataframe=out_of_order_frame,
        continuous_features=['Numerical'],
        outcome_name='Outcome',
    )

    pipeline_path = helpers.get_custom_dataset_modelpath_pipeline_binary()
    model_interface = dice_ml.Model(model_path=pipeline_path, backend='sklearn')

    return dice_ml.Dice(data_interface, model_interface, method=method)
def test_not_found_outcome_name(self):
    """Check that an outcome_name absent from the dataframe raises UserConfigValidationException."""
    iris = load_iris(as_frame=True)
    continuous = iris.feature_names
    frame = iris.frame
    expected_message = "outcome_name invalid not found in"

    with pytest.raises(UserConfigValidationException, match=expected_message):
        dice_ml.Data(
            dataframe=frame,
            continuous_features=continuous,
            outcome_name='invalid',
        )
def test_unseen_permitted_range(self):
    """Check that a permitted_range naming a column not in the dataframe is rejected."""
    iris = load_iris(as_frame=True)
    continuous = iris.feature_names
    unknown_range = {'age': [45, 60]}  # 'age' is the feature expected to be rejected
    frame = iris.frame
    expected_message = (
        "permitted_range contains some feature names which are not part of columns in dataframe"
    )

    with pytest.raises(UserConfigValidationException, match=expected_message):
        dice_ml.Data(
            dataframe=frame,
            continuous_features=continuous,
            outcome_name='target',
            permitted_range=unknown_range,
        )
def test_invalid_dataframe(self):
    """Check that passing a non-dataframe (``dataset.values``) raises ValueError.

    The message is checked on the raised exception (``ve.value``) rather
    than on the ``ExceptionInfo`` wrapper returned by ``pytest.raises``.
    """
    iris = load_iris(as_frame=True)
    feature_names = iris.feature_names
    dataset = iris.frame

    with pytest.raises(ValueError) as ve:
        dice_ml.Data(dataframe=dataset.values, continuous_features=feature_names,
                     outcome_name='target')

    assert "should provide a pandas dataframe" in str(ve.value)
def test_unseen_continuous_feature_names(self):
    """Check that a continuous feature name not present in the dataframe is rejected."""
    iris = load_iris(as_frame=True)
    continuous = iris.feature_names
    # Add a name that is not a column of the iris frame.
    continuous.append("new feature")
    frame = iris.frame
    expected_message = (
        "continuous_features contains some feature names which are not part of columns in dataframe"
    )

    with pytest.raises(UserConfigValidationException, match=expected_message):
        dice_ml.Data(
            dataframe=frame,
            continuous_features=continuous,
            outcome_name='target',
        )
def test_invalid_outcome_name(self):
    """Check that a non-string outcome_name (the integer ``1``) raises ValueError.

    The message is checked on the raised exception (``ve.value``) rather
    than on the ``ExceptionInfo`` wrapper returned by ``pytest.raises``.
    """
    iris = load_iris(as_frame=True)
    feature_names = iris.feature_names
    dataset = iris.frame

    with pytest.raises(ValueError) as ve:
        dice_ml.Data(dataframe=dataset, continuous_features=feature_names, outcome_name=1)

    assert "should provide the name of outcome feature as a string" in str(ve.value)
def test_invalid_continuous_features(self):
    """Check that passing continuous_features as a numpy array raises ValueError.

    The message is checked on the raised exception (``ve.value``) rather
    than on the ``ExceptionInfo`` wrapper returned by ``pytest.raises``.
    """
    iris = load_iris(as_frame=True)
    # A numpy array instead of a list is the invalid input under test.
    feature_names = np.array(iris.feature_names)
    dataset = iris.frame

    with pytest.raises(ValueError) as ve:
        dice_ml.Data(dataframe=dataset, continuous_features=feature_names,
                     outcome_name='target')

    assert "should provide the name(s) of continuous features in the data as a list" in str(ve.value)
def test_unseen_continuous_features_precision(self):
    """Check that a precision entry for a column not in the dataframe is rejected."""
    iris = load_iris(as_frame=True)
    continuous = iris.feature_names
    unknown_precision = {'hours_per_week': 2}  # not a column of the iris frame
    frame = iris.frame
    expected_message = ("continuous_features_precision contains some feature names which"
                        " are not part of columns in dataframe")

    with pytest.raises(UserConfigValidationException, match=expected_message):
        dice_ml.Data(
            dataframe=frame,
            continuous_features=continuous,
            outcome_name='target',
            continuous_features_precision=unknown_precision,
        )
def data_object():
    """Return a dice_ml.Data object built from feature metadata only.

    Features are given as ranges (numeric) or category lists, together with
    a type/precision entry for 'hours_per_week' and a ``mad`` entry for 'age'.
    """
    # An OrderedDict is used so the feature order also holds on Python<3.6.
    features_dict = OrderedDict()
    features_dict['age'] = [17, 90]
    features_dict['workclass'] = ['Government', 'Other/Unknown', 'Private', 'Self-Employed']
    features_dict['education'] = ['Assoc', 'Bachelors', 'Doctorate', 'HS-grad',
                                  'Masters', 'Prof-school', 'School', 'Some-college']
    features_dict['marital_status'] = ['Divorced', 'Married', 'Separated', 'Single', 'Widowed']
    features_dict['occupation'] = ['Blue-Collar', 'Other/Unknown', 'Professional',
                                   'Sales', 'Service', 'White-Collar']
    features_dict['race'] = ['Other', 'White']
    features_dict['gender'] = ['Female', 'Male']
    features_dict['hours_per_week'] = [1, 99]

    return dice_ml.Data(
        features=features_dict,
        outcome_name='income',
        type_and_precision={'hours_per_week': ['float', 2]},
        mad={'age': 10},
    )
def test_invalid_dataframe(self, data_type):
    """Check that dice_ml.Data raises ValueError for a bad or missing dataframe.

    ``data_type`` selects the scenario:

    * ``DataTypeCombinations.Incorrect`` -- ``dataset.values`` is passed instead of a dataframe.
    * ``DataTypeCombinations.AsNone`` -- ``dataframe`` is ``None``.
    * anything else -- ``dataframe`` is not passed at all.

    The message is checked on the raised exception (``ve.value``) rather
    than on the ``ExceptionInfo`` wrapper returned by ``pytest.raises``.
    """
    iris = load_iris(as_frame=True)
    feature_names = iris.feature_names
    dataset = iris.frame

    if data_type == DataTypeCombinations.Incorrect:
        with pytest.raises(ValueError) as ve:
            dice_ml.Data(dataframe=dataset.values, continuous_features=feature_names,
                         outcome_name='target')

        assert "should provide a pandas dataframe" in str(ve.value)
    elif data_type == DataTypeCombinations.AsNone:
        with pytest.raises(ValueError) as ve:
            dice_ml.Data(dataframe=None, continuous_features=feature_names,
                         outcome_name='target')

        assert "should provide a pandas dataframe" in str(ve.value)
    else:
        with pytest.raises(ValueError) as ve:
            dice_ml.Data(continuous_features=feature_names, outcome_name='target')

        assert "dataframe not found in params" in str(ve.value)
def private_data_object():
    """Build a dice_ml.Data object from adult income feature metadata only (no dataframe)."""
    feature_specs = (
        ('age', [17, 90]),
        ('workclass', ['Government', 'Other/Unknown', 'Private', 'Self-Employed']),
        ('education', ['Assoc', 'Bachelors', 'Doctorate', 'HS-grad',
                       'Masters', 'Prof-school', 'School', 'Some-college']),
        ('marital_status', ['Divorced', 'Married', 'Separated', 'Single', 'Widowed']),
        ('occupation', ['Blue-Collar', 'Other/Unknown', 'Professional',
                        'Sales', 'Service', 'White-Collar']),
        ('race', ['Other', 'White']),
        ('gender', ['Female', 'Male']),
        ('hours_per_week', [1, 99]),
    )
    # An OrderedDict is used so the feature order also holds on Python<=3.6.
    features_dict = OrderedDict(feature_specs)
    return dice_ml.Data(features=features_dict, outcome_name='income')
def test_check_features_to_vary(self, features_to_vary):
    """Exercise Data.check_features_to_vary for valid and invalid inputs.

    ``None`` and ``'all'`` must be accepted without error; any other value
    is expected to raise UserConfigValidationException naming
    ``'not_a_feature'``.
    """
    iris = load_iris(as_frame=True)
    continuous = iris.feature_names
    frame = iris.frame
    data_interface = dice_ml.Data(
        dataframe=frame,
        continuous_features=continuous,
        outcome_name='target',
    )

    # Accepted inputs: the call must simply succeed.
    if features_to_vary is None or features_to_vary == 'all':
        data_interface.check_features_to_vary(features_to_vary=features_to_vary)
        return

    expected_message = "Got features {" + "'not_a_feature'" + "} which are not present in training data"
    with pytest.raises(UserConfigValidationException, match=expected_message):
        data_interface.check_features_to_vary(features_to_vary=features_to_vary)
def _create_diceml_explainer(self, method, continuous_features):
    """Create a Dice explainer from this object's training data and model.

    :param method: explanation method forwarded to ``Dice``.
    :param continuous_features: continuous feature names forwarded to
        ``dice_ml.Data``.
    :return: the constructed ``Dice`` explainer.
    """
    data_interface = dice_ml.Data(
        dataframe=self._train,
        continuous_features=continuous_features,
        outcome_name=self._target_column,
    )

    # Classification tasks map to the classifier model type, everything else
    # to the regressor model type.
    if self._task_type == ModelTask.CLASSIFICATION:
        model_type = CounterfactualConstants.CLASSIFIER
    else:
        model_type = CounterfactualConstants.REGRESSOR

    model_interface = dice_ml.Model(
        model=self._model,
        backend=CounterfactualConstants.SKLEARN,
        model_type=model_type,
    )

    return Dice(data_interface, model_interface, method=method)
def test_check_permitted_range_with_unknown_feature(self, permitted_range):
    """Exercise Data.check_permitted_range with ``None`` and with an unknown feature.

    ``None`` must be accepted without error; any other value is expected to
    raise UserConfigValidationException naming ``'not_a_feature'``.
    """
    iris = load_iris(as_frame=True)
    continuous = iris.feature_names
    frame = iris.frame
    data_interface = dice_ml.Data(
        dataframe=frame,
        continuous_features=continuous,
        outcome_name='target',
    )

    # No range given: the call must simply succeed.
    if permitted_range is None:
        data_interface.check_permitted_range(permitted_range=permitted_range)
        return

    expected_message = "Got features {" + "'not_a_feature'" + "} which are not present in training data"
    with pytest.raises(UserConfigValidationException, match=expected_message):
        data_interface.check_permitted_range(permitted_range=permitted_range)
def test_check_permitted_range_with_unknown_categorical_value(self):
    """Check that a permitted_range category absent from the data is rejected.

    A constant column ``'new_feature'`` (always ``'known_category'``) is
    added to the iris frame, then ``check_permitted_range`` is called with
    ``'unknown_category'`` for that column.

    The message is checked on the raised exception (``ucve.value``) rather
    than on the ``ExceptionInfo`` wrapper returned by ``pytest.raises``.
    """
    iris = load_iris(as_frame=True)
    permitted_range = {'new_feature': ['unknown_category']}
    feature_names = iris.feature_names
    dataset = iris.frame
    # 'new_feature' is not listed in continuous_features below.
    dataset['new_feature'] = np.repeat(['known_category'], dataset.shape[0])

    dice_data = dice_ml.Data(dataframe=dataset, continuous_features=feature_names,
                             outcome_name='target')

    with pytest.raises(UserConfigValidationException) as ucve:
        dice_data.check_permitted_range(permitted_range=permitted_range)

    assert 'The category {0} does not occur in the training data for feature {1}. Allowed categories are {2}'.format(
        'unknown_category', 'new_feature', ['known_category']) in str(ucve.value)