def test_performance_not_deteriorate():
    '''Compare the model performance to baselines.

    It's a bit unclear what to compare against, since performance varies
    widely across models (MSE; vanilla settings):
        decision tree regressor:  6946
        random forest regressor:  2970
        linear model:             2820
    '''
    clf = SkopeRules(regression=True,
                     max_depth_duplication=None,
                     max_depth=X.shape[1] // 3,
                     n_estimators=850,
                     precision_min=0.,
                     recall_min=0.,
                     feature_names=feature_names)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    # comparing to a baseline from linear regression:
    assert mse < 2820
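# For context, a minimal sketch of how the baseline MSE figures quoted in the
# docstring could be reproduced, assuming the same module-level X, y, and
# split; the exact numbers depend on the dataset and the random state.
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
for name, est in [("decision tree", DecisionTreeRegressor()),
                  ("random forest", RandomForestRegressor()),
                  ("linear model", LinearRegression())]:
    est.fit(X_train, y_train)
    print(name, mean_squared_error(y_test, est.predict(X_test)))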
def test_performances():
    X, y = make_blobs(n_samples=1000, random_state=0, centers=2)

    # make the labels imbalanced by removing all but 100 instances from class 1
    indexes = np.ones(X.shape[0]).astype(bool)
    ind = np.array([False] * 100 + list((y == 1)[100:]))
    indexes[ind] = 0
    X = X[indexes]
    y = y[indexes]
    n_samples, n_features = X.shape

    clf = SkopeRules()
    # fit
    clf.fit(X, y)
    # with lists
    clf.fit(X.tolist(), y.tolist())
    y_pred = clf.predict(X)
    assert_equal(y_pred.shape, (n_samples,))
    # training set performance
    assert_greater(accuracy_score(y, y_pred), 0.83)

    # decision_function agrees with predict
    decision = -clf.decision_function(X)
    assert_equal(decision.shape, (n_samples,))
    dec_pred = (decision.ravel() < 0).astype(int)  # np.int is removed in NumPy >= 1.24
    assert_array_equal(dec_pred, y_pred)
def test_can_predict():
    clf = SkopeRules(regression=True,
                     max_depth_duplication=2,
                     n_estimators=30,
                     precision_min=0.20,
                     recall_min=0.20,
                     feature_names=feature_names)
    clf.fit(X, y)
    clf.predict(X)
def test_creates_rules():
    clf = SkopeRules(regression=True,
                     max_depth_duplication=2,
                     n_estimators=30,
                     precision_min=0.0,
                     recall_min=0.0,
                     feature_names=feature_names)
    clf.fit(X, y)
    rules = clf.rules_
    assert len(rules) > 0
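# In skope-rules, each entry of clf.rules_ is a
# (rule_string, (precision, recall, nb)) tuple, the same layout the later
# snippets (_getSkopeRules, the Driverless AI recipe) unpack. A short
# inspection sketch, assuming a fitted clf as in test_creates_rules:
for rule, (precision, recall, nb) in clf.rules_[:5]:
    print("%s  ->  precision=%.2f, recall=%.2f, nb=%d" %
          (rule, precision, recall, nb))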
def KfoldAcc(X, y, multiclass=False, k=10):
    kf = KFold(n_splits=k, shuffle=True)  # was hard-coded to 10, ignoring k
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        neigh = KNeighborsClassifier()
        neigh.fit(X_train, y_train)
        neigh_y_pred = neigh.predict(X_test)
        accuracy['neigh'].append(
            accuracy_score(y_test, neigh_y_pred, normalize=True,
                           sample_weight=None))
        print('---------')

        tree = DecisionTreeClassifier()
        tree.fit(X_train, y_train)
        tree_y_pred = tree.predict(X_test)
        accuracy['tree'].append(
            accuracy_score(y_test, tree_y_pred, normalize=True,
                           sample_weight=None))
        print('---------')

        naive = GaussianNB()
        naive.fit(X_train, y_train)
        naive_y_pred = naive.predict(X_test)
        accuracy['naive'].append(
            accuracy_score(y_test, naive_y_pred, normalize=True,
                           sample_weight=None))
        print('---------')

        rule = SkopeRules()
        if multiclass is True:
            rule = OneVsRestClassifier(rule)
        rule.fit(X_train, y_train)
        rules_y_pred = rule.predict(X_test)
        accuracy['rule'].append(
            accuracy_score(y_test, rules_y_pred, normalize=True,
                           sample_weight=None))
    return accuracy
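# A possible way to summarize the per-fold accuracies returned by KfoldAcc
# (a usage sketch; X and y are assumed to be numeric arrays).
import numpy as np

acc = KfoldAcc(X, y, k=10)
for model_name, folds in acc.items():
    print("%s: mean accuracy %.3f (+/- %.3f)" %
          (model_name, np.mean(folds), np.std(folds)))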
def test_deduplication_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3],
         [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
              [10, 5], [5, -7]]

    # smoke test: fitting and scoring with deduplication enabled must not raise
    clf = SkopeRules(random_state=rng, max_samples=1.,
                     max_depth_duplication=3)
    clf.fit(X, y)
    decision_func = clf.decision_function(X_test)
    rules_vote = clf.rules_vote(X_test)
    score_top_rules = clf.score_top_rules(X_test)
    pred = clf.predict(X_test)
    pred_score_top_rules = clf.predict_top_rules(X_test, 1)
def test_skope_rules_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3],
         [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
              [10, 5], [5, -7]]

    clf = SkopeRules(random_state=rng, max_samples=1.)
    clf.fit(X, y)
    decision_func = clf.decision_function(X_test)
    rules_vote = clf.rules_vote(X_test)
    separate_rules_score = clf.separate_rules_score(X_test)
    pred = clf.predict(X_test)

    # assert that the outliers are detected:
    assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
    assert_greater(np.min(rules_vote[-2:]), np.max(rules_vote[:-2]))
    assert_greater(np.min(separate_rules_score[-2:]),
                   np.max(separate_rules_score[:-2]))
    assert_array_equal(pred, 6 * [0] + 2 * [1])
def _getSkopeRules(X_train, y_train, model_params):
    # Rules
    print("Obtaining Rules using SkopeRules...")
    clf = SkopeRules(**model_params)
    clf.fit(X_train, y_train)
    rules = clf.rules_

    if len(rules) > 0:
        print("Checking inliers inside hypercubes...")
        df_rules = pd.DataFrame({
            "rule": [v[0].replace(" and ", " & ") for v in rules],
            "precision": [v[1][0] for v in rules],
            "recall": [v[1][1] for v in rules],
            "n_points_correct": [v[1][2] for v in rules],
        })
        if not df_rules.empty:
            df_rules["size_rules"] = df_rules.apply(
                lambda x: len(x["rule"].split("&")), axis=1)
        else:
            df_rules["size_rules"] = 0
        rules = [v[0].replace(" and ", " & ") for v in rules]

    # Obtain rules in df format
    if len(rules) > 0:
        print("Turning rules to hypercubes...")
        df_rules_results = turn_rules_to_df(list_rules=rules,
                                            list_cols=feature_cols)
        df_rules_pruned = simplifyRules(df_rules_results, categorical_cols)
        df_rules_pruned = df_rules_pruned.reset_index().merge(
            df_rules.reset_index()[["index", "size_rules"]], how="left")
        df_rules_pruned.index = df_rules_pruned["index"]
        df_rules_pruned = df_rules_pruned.drop(columns=["index"],
                                               errors="ignore")
        df_rules_results = df_rules_pruned
    else:
        df_rules_results = pd.DataFrame()
    return df_rules_results
def main():
    mail = get_feat_scores()  # pandas table
    train, test = train_test_split(mail, test_size=0.3)  # split up data
    x_train = train.drop(columns=['label'])  # remove labels from train x
    y_train = train.drop(columns=['message', 'sf', 'hf'])
    cv = CountVectorizer(input='content',
                         stop_words=stp.words('english'),
                         ngram_range=(1, 2))
    x_tr = cv.fit_transform(x_train.message)  # vectorize x_train text for the algorithm
    # note: feature_names has 2 entries while x_tr has one column per n-gram
    skr = SkopeRules(n_estimators=30, feature_names=['sf', 'hf'])  # algorithm
    y_train = y_train.to_numpy().ravel()  # turn y_train into a 1d array for the algorithm
    y_train = y_train.astype('int')
    skr.fit(x_tr.toarray(), y_train)

    # test data (was drawn from `train` by mistake; use `test`)
    x_test = test.drop(columns=['label'])
    y_test = test.drop(columns=['message', 'sf', 'hf'])
    x_tst = cv.transform(x_test.message)
    y_test = y_test.to_numpy().ravel()
    y_test = y_test.astype('int')
    y_score = skr.score_top_rules(x_tst.toarray())

    # metrics
    recall_scr = recall_score(y_test, y_score, average='micro')
    f1_scr = f1_score(y_test, y_score, average='micro')
    pr_score = precision_score(y_test, y_score, average='micro')
    print("recall: " + str(recall_scr))
    print("f1: " + str(f1_scr))
    print("precision: " + str(pr_score))

    # plot
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    plt.plot(recall, precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall curve')
    plt.show()
def OverSampleKfold(data, k=10):
    data_np = data.to_numpy()
    X = data_np[:, 0:18]
    y = data_np[:, 18]
    sm = SMOTE()
    kf = KFold(n_splits=k, shuffle=True)
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    recall = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    auc = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]  # test set
        print('num of pos:', np.sum(y_train),
              ', num of neg:', y_train.size - np.sum(y_train))
        X_over, y_over = sm.fit_resample(X_train, y_train)  # oversampled train set
        print('oversample:', X_over.shape, y_over.shape)
        print('---------------------------------------')

        neigh = KNeighborsClassifier()
        neigh.fit(X_over, y_over)
        neigh_y_pred = neigh.predict(X_test)
        neigh_y_score = neigh.predict_proba(X_test)[:, 1]  # kNN positive-class scores
        recall['neigh'].append(
            recall_score(y_test, neigh_y_pred, labels=None, pos_label=1))
        accuracy['neigh'].append(
            accuracy_score(y_test, neigh_y_pred, normalize=True,
                           sample_weight=None))
        auc['neigh'].append(roc_auc_score(y_test, neigh_y_score))
        print('---------')

        tree = DecisionTreeClassifier()
        tree.fit(X_over, y_over)
        tree_y_pred = tree.predict(X_test)
        tree_y_score = tree.predict_proba(X_test)[:, 1]  # tree positive-class scores
        recall['tree'].append(
            recall_score(y_test, tree_y_pred, labels=None, pos_label=1))
        accuracy['tree'].append(
            accuracy_score(y_test, tree_y_pred, normalize=True,
                           sample_weight=None))
        auc['tree'].append(roc_auc_score(y_test, tree_y_score))
        print('---------')

        naive = GaussianNB()
        naive.fit(X_over, y_over)
        naive_y_pred = naive.predict(X_test)
        naive_y_score = naive.predict_proba(X_test)[:, 1]  # NB positive-class scores
        recall['naive'].append(
            recall_score(y_test, naive_y_pred, labels=None, pos_label=1))
        accuracy['naive'].append(
            accuracy_score(y_test, naive_y_pred, normalize=True,
                           sample_weight=None))
        auc['naive'].append(roc_auc_score(y_test, naive_y_score))
        print('---------')

        rule = SkopeRules(feature_names=data.columns.to_list()[0:18])
        rule.fit(X_over, y_over)
        rules_y_pred = rule.predict(X_test)
        rule_y_score = rule.rules_vote(X_test)
        recall['rule'].append(
            recall_score(y_test, rules_y_pred, labels=None, pos_label=1))
        accuracy['rule'].append(
            accuracy_score(y_test, rules_y_pred, normalize=True,
                           sample_weight=None))
        auc['rule'].append(roc_auc_score(y_test, rule_y_score))
    return accuracy, recall, auc
def BalanceSampleKfold(data, k=10):
    data_np = data.to_numpy()
    X = data_np[:, 0:18]
    y = data_np[:, 18]
    rus_balance = RandomUnderSampler(sampling_strategy=0.20)  # truncate neg to 5 * #pos
    sm_balance = SMOTE()  # then oversample pos
    kf = KFold(n_splits=k, shuffle=True)  # was hard-coded to 10, ignoring k
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    recall = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    auc = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]  # test set
        print('num of pos:', np.sum(y_train),
              ', num of neg:', y_train.size - np.sum(y_train))
        X_bal, y_bal = rus_balance.fit_resample(X_train, y_train)  # balanced sample
        print('1.under:')
        print('num of pos:', np.sum(y_bal),
              ', num of neg:', y_bal.size - np.sum(y_bal))
        X_bal, y_bal = sm_balance.fit_resample(X_bal, y_bal)
        print('2.over:')
        print('num of pos:', np.sum(y_bal),
              ', num of neg:', y_bal.size - np.sum(y_bal))
        print('---------------------------------------')

        neigh = KNeighborsClassifier()
        neigh.fit(X_bal, y_bal)
        neigh_y_pred = neigh.predict(X_test)
        neigh_y_score = neigh.predict_proba(X_test)[:, 1]  # kNN positive-class scores
        recall['neigh'].append(
            recall_score(y_test, neigh_y_pred, labels=None, pos_label=1))
        accuracy['neigh'].append(
            accuracy_score(y_test, neigh_y_pred, normalize=True,
                           sample_weight=None))
        auc['neigh'].append(roc_auc_score(y_test, neigh_y_score))
        print('---------')

        tree = DecisionTreeClassifier()
        tree.fit(X_bal, y_bal)
        tree_y_pred = tree.predict(X_test)
        tree_y_score = tree.predict_proba(X_test)[:, 1]  # tree positive-class scores
        recall['tree'].append(
            recall_score(y_test, tree_y_pred, labels=None, pos_label=1))
        accuracy['tree'].append(
            accuracy_score(y_test, tree_y_pred, normalize=True,
                           sample_weight=None))
        auc['tree'].append(roc_auc_score(y_test, tree_y_score))
        print('---------')

        naive = GaussianNB()
        naive.fit(X_bal, y_bal)
        naive_y_pred = naive.predict(X_test)
        naive_y_score = naive.predict_proba(X_test)[:, 1]  # NB positive-class scores
        recall['naive'].append(
            recall_score(y_test, naive_y_pred, labels=None, pos_label=1))
        accuracy['naive'].append(
            accuracy_score(y_test, naive_y_pred, normalize=True,
                           sample_weight=None))
        auc['naive'].append(roc_auc_score(y_test, naive_y_score))
        print('---------')

        rule = SkopeRules(feature_names=data.columns.to_list()[0:18])
        rule.fit(X_bal, y_bal)
        rules_y_pred = rule.predict(X_test)
        rule_y_score = rule.rules_vote(X_test)
        recall['rule'].append(
            recall_score(y_test, rules_y_pred, labels=None, pos_label=1))
        accuracy['rule'].append(
            accuracy_score(y_test, rules_y_pred, normalize=True,
                           sample_weight=None))
        auc['rule'].append(roc_auc_score(y_test, rule_y_score))
    return accuracy, recall, auc
from sklearn.datasets import load_iris
from skrules import SkopeRules
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd

dataset = load_iris()
print(dataset)
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
clf = SkopeRules(max_depth_duplication=2,
                 n_estimators=30,
                 precision_min=0.3,
                 recall_min=0.1,
                 feature_names=feature_names)
for idx, species in enumerate(dataset.target_names):
    X, y = dataset.data, dataset.target
    clf.fit(X, y == idx)
    rules = clf.rules_[0:3]
    print("Rules for iris", species)
    for rule in rules:
        print(rule)
    print()
    print(20 * '=')
    print()
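# A possible extension of the loop above: keep one fitted model per class and
# combine their rule votes into a one-vs-rest prediction. Using rules_vote as
# the per-class score is an assumption of this sketch, not part of the
# original example.
import numpy as np

X, y = dataset.data, dataset.target
per_class_scores = []
for idx in range(len(dataset.target_names)):
    clf_k = SkopeRules(max_depth_duplication=2, n_estimators=30,
                       precision_min=0.3, recall_min=0.1,
                       feature_names=feature_names)
    clf_k.fit(X, y == idx)
    per_class_scores.append(clf_k.rules_vote(X))  # rule votes for class idx
y_pred = np.argmax(np.column_stack(per_class_scores), axis=1)
print("training accuracy:", np.mean(y_pred == y))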
class DecisionListClassifier(ClassifierMixin, ExplainerMixin):
    """ Decision List Classifier

    Currently a slight variant of SkopeRules from skope-rules.
    https://github.com/scikit-learn-contrib/skope-rules
    """

    available_explanations = ["global", "local"]
    explainer_type = "model"

    def __init__(self, feature_names=None, feature_types=None, **kwargs):
        """ Initializes skope rules.

        Args:
            **kwargs: Keyword arguments to be passed to SkopeRules in skope-rules.
        """
        self.feature_names = feature_names
        self.feature_types = feature_types
        self.kwargs = kwargs

    def fit(self, X, y):
        """ Fits model to provided instances.

        Args:
            X: Numpy array for training instances.
            y: Numpy array as training labels.

        Returns:
            Itself.
        """
        X, y, self.feature_names, self.feature_types = unify_data(
            X, y, self.feature_names, self.feature_types
        )
        self.feature_index_ = [
            "feature_" + str(i) for i, v in enumerate(self.feature_names)
        ]
        self.feature_map_ = {
            v: self.feature_names[i] for i, v in enumerate(self.feature_index_)
        }
        self.sk_model_ = SR(feature_names=self.feature_index_, **self.kwargs)

        self.classes_, y = np.unique(y, return_inverse=True)
        self.sk_model_.fit(X, y)
        self.pos_ratio_ = np.mean(y)

        # Extract rules
        (
            self.internal_rules_,
            self.rules_,
            self.prec_,
            self.recall_,
            self.feat_rule_map_,
        ) = self._extract_rules(self.sk_model_.rules_)

        self.global_selector = gen_global_selector(
            X, self.feature_names, self.feature_types, None
        )

        return self

    def predict(self, X):
        """ Predicts on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Predicted class label per instance.
        """
        X, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types)
        scores = self.predict_proba(X)
        return self.classes_[np.argmax(scores, axis=1)]

    def _scores(self, X):
        df = pd.DataFrame(X, columns=self.feature_index_)
        selected_rules = self.internal_rules_

        scores = np.ones(X.shape[0]) * np.inf
        for k, r in enumerate(selected_rules):
            matched_idx = list(df.query(r[0]).index)
            scores[matched_idx] = np.minimum(k, scores[matched_idx])
        scores[np.isinf(scores)] = len(selected_rules)
        scores = scores.astype("int64")

        return scores

    def predict_proba(self, X):
        """ Provides probability estimates on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Probability estimate of instance for each class.
        """
        X, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types)
        scores = self._scores(X)
        prec_ar = np.array(self.prec_)
        return np.c_[1.0 - prec_ar[scores], prec_ar[scores]]

    def _extract_rules(self, rules):
        rules = deepcopy(rules)
        rules = list(sorted(rules, key=lambda x: x[1][0], reverse=True))

        rule_li = []
        prec_li = []
        recall_li = []
        predict_li = []
        features_dict = {feat: [] for feat in self.feature_names}

        def extract_orig_features(pattern, rule):
            feature_set = set()
            for m in re.finditer(pattern, rule):
                orig_feature = self.feature_map_[m.group(1)]
                feature_set.add(orig_feature)
            return feature_set

        for indx, rule_rec in enumerate(rules):
            rule = rule_rec[0]
            rule_round = " ".join(
                [
                    "{0:.2f}".format(float(x))
                    if x.replace(".", "", 1).isdigit()
                    else x
                    for x in rule.split(" ")
                ]
            )
            pattern = r"(feature_[0-9]+)"
            feature_set = extract_orig_features(pattern, rule_round)
            rule_fix = re.sub(
                pattern, lambda m: self.feature_map_[m.group(1)], rule_round
            )

            rule_li.append(rule_fix)
            prec_li.append(rule_rec[1][0])
            recall_li.append(rule_rec[1][1])
            predict_li.append(1.0)
            for feat in feature_set:
                features_dict[feat].append(indx)

        # Provide default rule
        rule_li.append("No Rules Triggered")
        prec_li.append(self.pos_ratio_)
        recall_li.append(1.0)
        predict_li.append(0.0)

        return rules, rule_li, prec_li, recall_li, features_dict

    def explain_local(self, X, y=None, name=None):
        if name is None:
            name = gen_name_from_class(self)
        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        scores = self._scores(X)
        outcomes = self.predict(X)
        prob_scores = self.predict_proba(X)

        data_dicts = []
        for idx, score in enumerate(scores):
            data_dict = {
                "type": "rule",
                "rule": [self.rules_[score]],
                "precision": [self.prec_[score]],
                "recall": [self.recall_[score]],
                "outcome": [outcomes[idx]],
            }
            data_dicts.append(data_dict)

        internal_obj = {"overall": None, "specific": data_dicts}
        selector = gen_local_selector(X, y, prob_scores[:, 1])

        return RulesExplanation(
            "local",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=selector,
        )

    def explain_global(self, name=None):
        """ Provides global explanation for model.

        Args:
            name: User-defined explanation name.

        Returns:
            An explanation object,
            visualizing feature-value pairs as horizontal bar chart.
        """
        if name is None:
            name = gen_name_from_class(self)

        # Extract rules
        rules, prec, recall, feat_rule_map = (
            self.rules_,
            self.prec_,
            self.recall_,
            self.feat_rule_map_,
        )
        outcomes = [self.classes_[1]] * (len(self.rules_) - 1)
        # Add the zero case for the default rule
        outcomes.append(self.classes_[0])

        overall_data_dict = {
            "type": "rule",
            "rule": rules,
            "precision": prec,
            "recall": recall,
            "outcome": outcomes,
        }
        data_dicts = [
            {
                "type": "rule",
                "rule": [rules[i] for i in feat_rule_map[feature]],
                "precision": [prec[i] for i in feat_rule_map[feature]],
                "recall": [recall[i] for i in feat_rule_map[feature]],
                "outcome": [outcomes[i] for i in feat_rule_map[feature]],
            }
            for feature in self.feature_names
        ]
        internal_obj = {"overall": overall_data_dict, "specific": data_dicts}

        return RulesExplanation(
            "global",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=self.global_selector,
        )
naive = GaussianNB()
rule = SkopeRules(feature_names=data.columns.to_list()[0:18])

neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)
accuracy_no['neigh'].append(
    accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))

tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
accuracy_no['tree'].append(
    accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))

naive.fit(X_train, y_train)
y_pred = naive.predict(X_test)
accuracy_no['naive'].append(
    accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))

rule.fit(X_train, y_train)
y_pred = rule.predict(X_test)
accuracy_no['rule'].append(
    accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))

# feature selection with Boruta
for train_index, test_index in kf.split(X1):
    X_train, X_test = X1[train_index], X1[test_index]
    y_train, y_test = y_bal[train_index], y_bal[test_index]  # test set

    neigh = KNeighborsClassifier()
    tree = DecisionTreeClassifier()
    naive = GaussianNB()
    rule = SkopeRules()
    neigh.fit(X_train, y_train)
###############################################################################
# Getting rules with skrules
# ..........................
#
# This part shows how SkopeRules can be fitted to detect credit defaults.
# Performances are compared with the random forest model previously trained.

# fit the model
clf = SkopeRules(
    similarity_thres=.9, max_depth=3, max_features=0.5,
    max_samples_features=0.5, random_state=rng, n_estimators=30,
    feature_names=feature_names, recall_min=0.02, precision_min=0.6
)
clf.fit(X_train, y_train)

# in the separate_rules_score method, a score of k means that rule number k
# votes positively, but not rules 1, ..., k-1. This allows us to plot the
# performance of each rule separately on the ROC and PR plots.
scoring = clf.separate_rules_score(X_test)

print(str(len(clf.rules_)) + ' rules have been built.')
print('The most precise rules are the following:')
print(clf.rules_[:5])

curves = [roc_curve, precision_recall_curve]
xlabels = ['False Positive Rate', 'Recall (True Positive Rate)']
ylabels = ['True Positive Rate (Recall)', 'Precision']
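# The snippet ends just before the plotting code; a plausible continuation
# using the curves, xlabels, and ylabels lists above might look like this
# (a sketch, not the original script).
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, curve, xlabel, ylabel in zip(axes, curves, xlabels, ylabels):
    x_curve, y_curve = curve(y_test, scoring)[:2]
    # precision_recall_curve returns (precision, recall), i.e. (y, x)
    if curve is precision_recall_curve:
        x_curve, y_curve = y_curve, x_curve
    ax.plot(x_curve, y_curve)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
plt.show()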
def fit(self, X, y, sample_weight=None, eval_set=None,
        sample_weight_eval_set=None, **kwargs):
    orig_cols = list(X.names)

    import pandas as pd
    import numpy as np
    from skrules import SkopeRules
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder  # LabelEncoder was used below but not imported
    from collections import Counter

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir)

    # Set up temp folder
    tmp_folder = self._create_tmp_folder(logger)

    # Set up model
    if self.num_classes >= 2:
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)

        model = SkopeRules(
            max_depth_duplication=self.params["max_depth_duplication"],
            n_estimators=self.params["n_estimators"],
            precision_min=self.params["precision_min"],
            recall_min=self.params["recall_min"],
            max_samples=self.params["max_samples"],
            max_samples_features=self.params["max_samples_features"],
            max_depth=self.params["max_depth"],
            max_features=self.params["max_features"],
            min_samples_split=self.params["min_samples_split"],
            bootstrap=self.params["bootstrap"],
            bootstrap_features=self.params["bootstrap_features"],
            random_state=self.params["random_state"],
            feature_names=orig_cols)
    else:
        # SkopeRules doesn't work for regression
        loggerinfo(logger, "PASS, no skopes model")
        pass

    # Find the datatypes
    X = X.to_pandas()
    X.columns = orig_cols
    X_datatypes = [str(item) for item in list(X.dtypes)]

    # Change all float32 values to float64
    for ii in range(len(X_datatypes)):
        if X_datatypes[ii] == 'float32':
            X = X.astype({orig_cols[ii]: np.float64})
    X_datatypes = [str(item) for item in list(X.dtypes)]

    # List the categorical and numerical features
    self.X_categorical = [
        orig_cols[col_count] for col_count in range(len(orig_cols))
        if (X_datatypes[col_count] == 'category')
        or (X_datatypes[col_count] == 'object')
    ]
    self.X_numeric = [
        item for item in orig_cols if item not in self.X_categorical
    ]

    # Find the levels and mode for each categorical feature
    # for use in the test set
    self.train_levels = {}
    self.train_mode = {}  # was referenced below without being initialized
    for item in self.X_categorical:
        self.train_levels[item] = list(set(X[item]))
        self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

    # One hot encode the categorical features
    # and replace missing values with a "Missing" category
    if len(self.X_categorical) > 0:
        loggerinfo(logger, "Categorical encode")
        for colname in self.X_categorical:
            X[colname] = list(X[colname].fillna("Missing"))
        self.enc = OneHotEncoder(handle_unknown='ignore')
        self.enc.fit(X[self.X_categorical])
        self.encoded_categories = list(
            self.enc.get_feature_names(input_features=self.X_categorical))
        X_enc = self.enc.transform(X[self.X_categorical]).toarray()
        X = pd.concat(
            [X[self.X_numeric],
             pd.DataFrame(X_enc, columns=self.encoded_categories)],
            axis=1)

    # Replace missing values with a missing value code
    if len(self.X_numeric) > 0:
        for colname in self.X_numeric:
            X[colname] = list(X[colname].fillna(-999))

    model.fit(np.array(X), np.array(y))

    # Find the rule list
    self.rule_list = model.rules_

    # Calculate feature importances
    var_imp = []
    for var in orig_cols:
        var_imp.append(sum(int(var in item[0]) for item in self.rule_list))
    if max(var_imp) != 0:
        importances = list(np.array(var_imp) / max(var_imp))
    else:
        importances = [1] * len(var_imp)

    pd.DataFrame(model.rules_,
                 columns=['Rule', '(Precision, Recall, nb)']).to_csv(
                     os.path.join(tmp_folder, 'Skope_rules.csv'), index=False)

    self.mean_target = np.array(sum(y) / len(y))

    # Set model properties
    self.set_model_properties(model=model,
                              features=list(X.columns),
                              importances=importances,
                              iterations=self.params['n_estimators'])
if dat in ('http', 'smtp'):
    y = (y != b'normal.').astype(int)
print_outlier_ratio(y)

n_samples, n_features = X.shape
n_samples_train = n_samples // 2

X = X.astype(float)
X_train = X[:n_samples_train, :]
X_test = X[n_samples_train:, :]
y_train = y[:n_samples_train]
y_test = y[n_samples_train:]

print('--- Fitting the SkopeRules estimator...')
model = SkopeRules(n_estimators=5, max_depth=5, n_jobs=-1)
tstart = time()
model.fit(X_train, y_train)
fit_time = time() - tstart
tstart = time()

scoring = -model.decision_function(X_test)  # the lower, the more abnormal

print("--- Preparing the plot elements...")
if with_decision_function_histograms:
    fig, ax = plt.subplots(3, sharex=True, sharey=True)
    bins = np.linspace(-0.5, 0.5, 200)
    ax[0].hist(scoring, bins, color='black')
    ax[0].set_title('Decision function for %s dataset' % dat)
    ax[1].hist(scoring[y_test == 0], bins, color='b', label='normal data')
    ax[1].legend(loc="lower right")
    ax[2].hist(scoring[y_test == 1], bins, color='r', label='outliers')
    ax[2].legend(loc="lower right")
class DecisionListClassifier(ClassifierMixin, ExplainerMixin):  # pragma: no cover
    """ Decision List Classifier

    Currently a slight variant of SkopeRules from skope-rules.
    https://github.com/scikit-learn-contrib/skope-rules
    """

    available_explanations = ["global", "local"]
    explainer_type = "model"

    def __init__(self, feature_names=None, feature_types=None, **kwargs):
        """ Initializes class.

        Args:
            feature_names: List of feature names.
            feature_types: List of feature types.
            **kwargs: Kwargs passed to wrapped SkopeRules at initialization time.
        """
        self.feature_names = feature_names
        self.feature_types = feature_types
        self.kwargs = kwargs

    def fit(self, X, y):
        """ Fits model to provided instances.

        Args:
            X: Numpy array for training instances.
            y: Numpy array as training labels.

        Returns:
            Itself.
        """
        from skrules import SkopeRules as SR

        X, y, self.feature_names, self.feature_types = unify_data(
            X, y, self.feature_names, self.feature_types)
        self.feature_index_ = [
            "feature_" + str(i) for i, v in enumerate(self.feature_names)
        ]
        self.feature_map_ = {
            v: self.feature_names[i]
            for i, v in enumerate(self.feature_index_)
        }
        self.sk_model_ = SR(feature_names=self.feature_index_, **self.kwargs)

        self.classes_, y = np.unique(y, return_inverse=True)
        self.sk_model_.fit(X, y)
        self.pos_ratio_ = np.mean(y)

        # Extract rules
        (
            self.internal_rules_,
            self.rules_,
            self.prec_,
            self.recall_,
            self.feat_rule_map_,
        ) = self._extract_rules(self.sk_model_.rules_)

        self.global_selector = gen_global_selector(X, self.feature_names,
                                                   self.feature_types, None)
        return self

    def predict(self, X):
        """ Predicts on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Predicted class label per instance.
        """
        X, _, _, _ = unify_data(X, None, self.feature_names,
                                self.feature_types)
        scores = self.predict_proba(X)
        return self.classes_[np.argmax(scores, axis=1)]

    def _scores(self, X):
        df = pd.DataFrame(X, columns=self.feature_index_)
        selected_rules = self.internal_rules_

        scores = np.ones(X.shape[0]) * np.inf
        for k, r in enumerate(selected_rules):
            matched_idx = list(df.query(r[0]).index)
            scores[matched_idx] = np.minimum(k, scores[matched_idx])
        scores[np.isinf(scores)] = len(selected_rules)
        scores = scores.astype("int64")

        return scores

    def predict_proba(self, X):
        """ Provides probability estimates on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Probability estimate of instance for each class.
        """
        X, _, _, _ = unify_data(X, None, self.feature_names,
                                self.feature_types)
        scores = self._scores(X)
        prec_ar = np.array(self.prec_)
        return np.c_[1.0 - prec_ar[scores], prec_ar[scores]]

    def _extract_rules(self, rules):
        rules = deepcopy(rules)
        rules = list(sorted(rules, key=lambda x: x[1][0], reverse=True))

        rule_li = []
        prec_li = []
        recall_li = []
        predict_li = []
        features_dict = {feat: [] for feat in self.feature_names}

        def extract_orig_features(pattern, rule):
            feature_set = set()
            for m in re.finditer(pattern, rule):
                orig_feature = self.feature_map_[m.group(1)]
                feature_set.add(orig_feature)
            return feature_set

        for indx, rule_rec in enumerate(rules):
            rule = rule_rec[0]
            rule_round = " ".join([
                "{0:.2f}".format(float(x))
                if x.replace(".", "", 1).isdigit() else x
                for x in rule.split(" ")
            ])
            pattern = r"(feature_[0-9]+)"
            feature_set = extract_orig_features(pattern, rule_round)
            rule_fix = re.sub(pattern,
                              lambda m: self.feature_map_[m.group(1)],
                              rule_round)

            rule_li.append(rule_fix)
            prec_li.append(rule_rec[1][0])
            recall_li.append(rule_rec[1][1])
            predict_li.append(1.0)
            for feat in feature_set:
                features_dict[feat].append(indx)

        # Provide default rule
        rule_li.append("No Rules Triggered")
        prec_li.append(self.pos_ratio_)
        recall_li.append(1.0)
        predict_li.append(0.0)

        return rules, rule_li, prec_li, recall_li, features_dict

    def explain_local(self, X, y=None, name=None):
        """ Provides local explanations for provided instances.

        Args:
            X: Numpy array for X to explain.
            y: Numpy vector for y to explain.
            name: User-defined explanation name.

        Returns:
            An explanation object.
        """
        if name is None:
            name = gen_name_from_class(self)
        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        scores = self._scores(X)
        outcomes = self.predict(X)
        prob_scores = self.predict_proba(X)

        data_dicts = []
        for idx, score in enumerate(scores):
            data_dict = {
                "type": "rule",
                "rule": [self.rules_[score]],
                "precision": [self.prec_[score]],
                "recall": [self.recall_[score]],
                "outcome": [outcomes[idx]],
            }
            data_dicts.append(data_dict)

        internal_obj = {"overall": None, "specific": data_dicts}
        selector = gen_local_selector(data_dicts, is_classification=True)

        return RulesExplanation(
            "local",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=selector,
        )

    def explain_global(self, name=None):
        """ Provides global explanation for model.

        Args:
            name: User-defined explanation name.

        Returns:
            An explanation object.
        """
        if name is None:
            name = gen_name_from_class(self)

        # Extract rules
        rules, prec, recall, feat_rule_map = (
            self.rules_,
            self.prec_,
            self.recall_,
            self.feat_rule_map_,
        )
        outcomes = [self.classes_[1]] * (len(self.rules_) - 1)
        # Add the zero case for the default rule
        outcomes.append(self.classes_[0])

        overall_data_dict = {
            "type": "rule",
            "rule": rules,
            "precision": prec,
            "recall": recall,
            "outcome": outcomes,
        }
        data_dicts = [{
            "type": "rule",
            "rule": [rules[i] for i in feat_rule_map[feature]],
            "precision": [prec[i] for i in feat_rule_map[feature]],
            "recall": [recall[i] for i in feat_rule_map[feature]],
            "outcome": [outcomes[i] for i in feat_rule_map[feature]],
        } for feature in self.feature_names]
        internal_obj = {"overall": overall_data_dict, "specific": data_dicts}

        return RulesExplanation(
            "global",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=self.global_selector,
        )
# Train the network, then make predictions on the test set and print the results
neural_network = compute_semi_supervised_learning(neural_network, X_train,
                                                  Y_train)
neural_network_pred = np.array(
    neural_network.predict_classes(np.array(X_test)))
print_statistics("Semi-Supervised Neural Network", neural_network_pred,
                 Y_test)

# ************* Rule Model: ************************
# Here we compare three skope-rules models on the validation set

# First skope rules model
rule_clf1 = SkopeRules(n_estimators=50, precision_min=0.2, recall_min=0.2,
                       feature_names=feature_names)
rule_clf1.fit(X_val, Y_val)
rule_clf1_ypred = rule_clf1.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 1", rule_clf1_ypred,
                 Y_test_val)

# Second skope rules model
rule_clf2 = SkopeRules(n_estimators=50, precision_min=0.2, recall_min=0.2,
                       feature_names=feature_names)
rule_clf2.fit(X_val, Y_val)
rule_clf2_ypred = rule_clf2.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 2", rule_clf2_ypred,
                 Y_test_val)

# Third skope rules model
rule_clf3 = SkopeRules(n_estimators=25, precision_min=0.2,