def test_model_backcompat_local(mimic_explainer):
    """Explain with a mimic explainer restored from an old serialized model.

    The explainer is rebuilt through ``mimic_explainer._load`` from the
    properties stored in ``old_mimic_model2.json``.  Local and global
    explanations are then checked to have one importance value per feature
    of the restored surrogate model.
    """
    class DummyModel:
        """Stand-in model whose prediction is the ``TotalBalance`` column."""

        def predict(self, X):
            return X['TotalBalance']

    serialized_name = 'old_mimic_model2.json'
    # Fall back to the ``test`` directory when the file is not found as-is.
    serialized_path = (serialized_name if path.exists(serialized_name)
                       else path.join('test', serialized_name))
    with open(serialized_path, 'r') as handle:
        properties = json.load(handle)
    explainer = mimic_explainer._load(DummyModel(), properties)

    backcompat = retrieve_dataset('backcompat_data.csv')
    row_count = backcompat.shape[0]
    # Pad with 674 - 5 columns of random integers ahead of the last five
    # dataset columns (presumably 674 is the width the serialized surrogate
    # expects -- TODO confirm).
    filler = pd.DataFrame(
        np.random.randint(0, row_count, size=(row_count, 674 - 5)))
    trailing = backcompat[backcompat.columns[-5:]]
    eval_data = pd.concat([filler, trailing], axis=1)

    local_explanation = explainer.explain_local(eval_data)
    local_width = local_explanation._local_importance_values.shape[1]
    assert local_width == explainer.surrogate_model.model._n_features

    global_explanation = explainer.explain_global(eval_data)
    global_width = len(global_explanation.global_importance_values)
    assert global_width == explainer.surrogate_model.model._n_features
def test_get_local_raw_explanations_sparse_regression(self, mimic_explainer):
    """Validate raw global explanations for a linear regressor on a1a.svmlight.

    A linear surrogate (``sparse_data=True``) explains a linear regression
    model; the engineered explanation is mapped back through a
    ``5 x n_engineered`` feature map and handed to the regression validator.
    """
    features, targets = retrieve_dataset('a1a.svmlight')
    x_train, x_test, y_train, _ = train_test_split(
        features, targets, test_size=0.2, random_state=7)

    # Model under explanation: a linear regression fit on the training split.
    regressor = create_sklearn_linear_regressor(x_train, y_train)
    explainer = mimic_explainer(
        regressor,
        x_train,
        LinearExplainableModel,
        explainable_model_args={'sparse_data': True},
    )

    global_explanation = explainer.explain_global(x_test)
    assert global_explanation.method == LINEAR_METHOD

    # Ones on the main diagonal: one row per raw feature (5), one column
    # per engineered feature.
    engineered_count = x_train.shape[1]
    feature_map = np.eye(5, engineered_count)
    raw_explanation = global_explanation.get_raw_explanation([feature_map])
    self.validate_global_raw_explanation_regression(
        global_explanation, raw_explanation, feature_map)
def create_msx_data(self, test_size):
    """Load ``msx_transformed_2226.npz`` and split it into train/test sets.

    Features are every column except the last two; the target is the
    second-to-last column, kept as a single-column slice.  The last column
    is not used.

    :param test_size: forwarded to ``train_test_split``.
    :return: the result of ``train_test_split`` with ``random_state=7``.
    """
    matrix = retrieve_dataset('msx_transformed_2226.npz')
    column_count = matrix.shape[1]
    features = matrix[:, :column_count - 2]
    target = matrix[:, (column_count - 2):(column_count - 1)]
    return train_test_split(features, target,
                            test_size=test_size, random_state=7)
def create_reviews_data(test_size):
    """Build a text / evaluation train-test split from ``reviews.json``.

    Only the first review of each paper is used; papers whose ``review``
    entry is ``None`` or empty are skipped.

    :param test_size: forwarded to ``train_test_split``.
    :return: the result of ``train_test_split`` with ``random_state=7``.
    """
    papers = retrieve_dataset('reviews.json')['paper']
    # A falsy ``review`` covers both ``None`` and an empty collection.
    first_reviews = [paper['review'][0] for paper in papers if paper['review']]
    reviews = [entry['text'] for entry in first_reviews]
    evaluation = [entry['evaluation'] for entry in first_reviews]
    return train_test_split(reviews, evaluation,
                            test_size=test_size, random_state=7)
def create_cancer_data():
    """Load the breast-cancer training CSV and split it 80/20.

    ``'?'`` is passed as ``na_values`` (presumably forwarded to the CSV
    reader -- TODO confirm); the frame is then interpolated and cast to
    ``int64``.  The first column is the target, the remaining columns are
    the features.

    :return: ``(x_train, x_test, y_train, y_validation, feature_names,
        target_names)``.
    """
    raw = retrieve_dataset('breast-cancer.train.csv', na_values='?')
    cancer = raw.interpolate().astype('int64')

    cancer_data = cancer.iloc[:, 1:]
    cancer_target = cancer.iloc[:, 0]
    feature_names = cancer_data.columns.values
    target_names = ['no_cancer', 'cancer']

    splits = train_test_split(
        cancer_data, cancer_target, test_size=0.2, random_state=0)
    x_train, x_test, y_train, y_validation = splits
    return x_train, x_test, y_train, y_validation, feature_names, target_names
def create_energy_data():
    """Load the energy-efficiency training CSV and split it 80/20.

    The target is the second-to-last column (labelled Y1 by the original
    comment); the features are every column except the last three.

    :return: ``(x_train, x_test, y_train, y_validation, feature_names)``.
    """
    frame = retrieve_dataset('energyefficiency2012_data.train.csv')
    column_count = len(frame.columns)

    target = frame.iloc[:, column_count - 2]
    # NOTE(review): this drops the last three columns, i.e. one column more
    # than the two trailing ones around the target -- confirm it is intended.
    features = frame.iloc[:, :column_count - 3]
    feature_names = features.columns.values

    splits = train_test_split(features, target, test_size=0.2, random_state=0)
    x_train, x_test, y_train, y_validation = splits
    return x_train, x_test, y_train, y_validation, feature_names
def test_raw_timestamp_explanation(self, mimic_explainer):
    """Explain a LightGBM classifier on raw data that contains datetime columns.

    ``insurance_claims.csv`` is loaded with two parsed date columns.  Every
    column gets its own transformer inside a ``ColumnTransformer``, the
    classifier is trained on the transformed data, and the raw-feature
    global explanation is passed to ``ExplanationDashboard`` together with a
    preprocess + model pipeline.
    """
    df = retrieve_dataset(
        'insurance_claims.csv',
        na_values='?',
        parse_dates=['policy_bind_date', 'incident_date'],
    )
    label = 'fraud_reported'
    df_y = df[label]
    df_X = df.drop(columns=label)
    x_train, _, y_train, _ = train_test_split(
        df_X, df_y, test_size=0.2, random_state=7)

    def columns_matching(**selector):
        """Return the names of the ``df_X`` columns picked by ``select_dtypes``."""
        return df_X.select_dtypes(**selector).columns.tolist()

    str_cols = columns_matching(exclude=[np.number, np.datetime64])
    dt_cols = columns_matching(include=[np.datetime64])
    numeric_cols = columns_matching(include=[np.number])

    def string_pipeline():
        # NOTE(review): ``sparse=`` was renamed ``sparse_output=`` in
        # scikit-learn 1.2 -- confirm against the pinned version.
        return Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ohe', OneHotEncoder(sparse=False)),
        ])

    def numeric_pipeline():
        return Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ])

    def datetime_pipeline():
        return Pipeline(steps=[('scaler', StandardScaler())])

    # One (name, transformer, [column]) entry per column, in the order
    # string, numeric, datetime.
    transforms_list = [(col, string_pipeline(), [col]) for col in str_cols]
    transforms_list += [(col, numeric_pipeline(), [col])
                        for col in numeric_cols]
    transforms_list += [(col, datetime_pipeline(), [col]) for col in dt_cols]
    transformations = ColumnTransformer(transforms_list)

    x_train_transformed = transformations.fit_transform(x_train)
    model = create_lightgbm_classifier(x_train_transformed, y_train)

    explainer = mimic_explainer(
        model,
        x_train,
        LGBMExplainableModel,
        transformations=transformations,
        features=df_X.columns.tolist(),
        model_task=ModelTask.Classification,
    )
    explanation = explainer.explain_global(x_train)

    dashboard_pipeline = Pipeline(
        steps=[('preprocess', transformations), ('model', model)])
    ExplanationDashboard(explanation, dashboard_pipeline,
                         datasetX=x_train, trueY=y_train)
def test_explain_model_sparse_tree(self, tabular_explainer):
    """Run a global explanation of a random forest regressor on sparse data.

    The explainer is initialised with a single-row empty CSR matrix as
    background, and evaluation sampling is allowed through the sampling
    policy.  The test only checks that ``explain_global`` completes.
    """
    features, targets = retrieve_dataset('a1a.svmlight')
    x_train, x_test, y_train, _ = train_test_split(
        features, targets, test_size=0.002, random_state=7)

    forest = create_sklearn_random_forest_regressor(x_train, y_train)

    # Background with one row and as many columns as the training data.
    _, column_count = x_train.shape
    background = csr_matrix((1, column_count), dtype=x_train.dtype)
    exp = tabular_explainer(forest, background)

    test_logger.info('Running explain global for test_explain_model_sparse_tree')
    exp.explain_global(
        x_test, sampling_policy=SamplingPolicy(allow_eval_sampling=True))
def test_explain_model_string_classes(self, mimic_explainer):
    """Explain an SGD classifier whose class labels are strings.

    The ``income`` column of ``AdultCensusIncome.csv`` is the label.
    ``int64`` columns go through median imputation and scaling, ``object``
    columns through constant imputation and one-hot encoding.  A linear
    surrogate with data augmentation explains the first 1000 raw rows, and
    the predictions / replication metric are then verified.
    """
    census = retrieve_dataset('AdultCensusIncome.csv', skipinitialspace=True)
    X = census.drop(['income'], axis=1)
    y = census[['income']]
    features = X.columns.values.tolist()
    classes = y['income'].unique().tolist()

    column_dtypes = X.dtypes
    pipe_cfg = {
        'num_cols': column_dtypes[column_dtypes == 'int64'].index.values.tolist(),
        'cat_cols': column_dtypes[column_dtypes == 'object'].index.values.tolist(),
    }
    num_pipe = Pipeline([
        ('num_imputer', SimpleImputer(strategy='median')),
        ('num_scaler', StandardScaler()),
    ])
    # NOTE(review): ``sparse=`` was renamed ``sparse_output=`` in
    # scikit-learn 1.2 -- confirm against the pinned version.
    cat_pipe = Pipeline([
        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='?')),
        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ])
    feat_pipe = ColumnTransformer([
        ('num_pipe', num_pipe, pipe_cfg['num_cols']),
        ('cat_pipe', cat_pipe, pipe_cfg['cat_cols']),
    ])

    # Train on index-reset copies so the source frames stay untouched.
    X_train = X.copy().reset_index(drop=True)
    y_train = y.copy().reset_index(drop=True)
    X_train = feat_pipe.fit_transform(X_train)
    model = SGDClassifier().fit(X_train, y_train['income'])

    explainer = mimic_explainer(
        model,
        X.iloc[:1000],
        LinearExplainableModel,
        augment_data=True,
        max_num_of_augmentations=10,
        features=features,
        classes=classes,
        model_task=ModelTask.Classification,
        transformations=feat_pipe,
    )
    global_explanation = explainer.explain_global(X.iloc[:1000])
    assert global_explanation.method == LINEAR_METHOD
    self._verify_predictions_and_replication_metric(explainer, X.iloc[:1000])
def test_explain_model_imbalanced_classes(self, mimic_explainer):
    """Check the surrogate reproduces a model that predicts only two classes.

    The stored model yields two distinct predictions on the stored dataset,
    while the explanation is expected to carry local importances for three
    classes and 1585 global importance values.  The surrogate's predictions
    must match the original model's.
    """
    def distinct_count(values):
        """Return how many distinct values ``values`` contains."""
        return len(np.unique(values))

    model = retrieve_model('unbalanced_model.pkl')
    x_train = retrieve_dataset('unbalanced_dataset.npz')

    # The model's predictions are skewed: only two distinct values occur.
    model_predictions = model.predict(x_train)
    assert distinct_count(model_predictions) == 2

    explainer = mimic_explainer(
        model, x_train, LGBMExplainableModel, max_num_of_augmentations=10)
    global_explanation = explainer.explain_global(x_train, include_local=True)

    # One global importance value per feature.
    assert len(global_explanation.global_importance_values) == 1585
    # One block of local importance values per class.
    assert len(global_explanation.local_importance_values) == 3

    # Predictions of the underlying surrogate model.
    surrogate_predictions = explainer.surrogate_model.model.predict(x_train)
    assert distinct_count(surrogate_predictions) == 2
    assert distinct_count(model_predictions) == 2
    assert np.isclose(surrogate_predictions, model_predictions).all()
def load_msx():
    """Load ``msx_transformed_2226.npz`` for a linear-regression run.

    :return: ``(features, target, "msx", LinearRegression())`` where
        ``features`` is every column except the last two and ``target`` is
        the second-to-last column converted to a flat dense array.
    """
    data = retrieve_dataset('msx_transformed_2226.npz')
    features = data[:, :-2]
    target = data[:, -2].toarray().flatten()
    return features, target, "msx", LinearRegression()
experiment = f'{args["dataset"]}_test' if args['name'] == '' else args['name'] # Define the compute device (either GPU or CPU) compute_device = torch.device(args['gpu'] if torch.cuda.is_available() else 'cpu') # Set up a parameters object for saving hyperparameters, etc. parameters = parameters.Parameters(experiment, 'test', **args) with open(os.path.abspath(f'{args["network_dir"]}{experiment}_parameters.pkl'), 'rb') as f: parameters = pickle.load(f) # Create the data transforms for each respective set transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])]) # Retrieve the datasets _, val_dataset, test_dataset = retrieve_dataset(args['dataset'], args['image_dir'], transform, transform, test_equals_val=True) val_dataloader = DataLoader(val_dataset, batch_size=args['batch_size'], shuffle=False) test_dataloader = DataLoader(test_dataset, batch_size=args['batch_size'], shuffle=False) # Create the network, (potentially) load network state dictionary, and send the network to the compute device num_classes = val_dataset.num_classes() loader = retrieve_network(args['dataset'], args['network']) network = loader(num_classes=num_classes) network.load_state_dict(torch.load(os.path.abspath(f'{args["network_dir"]}{experiment}/{experiment}.pth'), map_location='cpu')) network.eval() # Send to GPU network = network.to(compute_device) # Get the batch size