def test_performance_not_deteriorate():
    """Compare the model performance to baselines.

    It's a bit unclear what to compare against, since performance varies
    widely across models (MSE, vanilla settings):
        decision tree regressor:  6946
        random forest regressor:  2970
        linear model:             2820
    """
    clf = SkopeRules(regression=True,
                     max_depth_duplication=None,
                     max_depth=X.shape[1] // 3,
                     n_estimators=850,
                     precision_min=0.,
                     recall_min=0.,
                     feature_names=feature_names)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    # compare to the linear-regression baseline:
    assert mse < 2820
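# The 2820 linear-model baseline quoted in the docstring above can be
# reproduced in a few lines. A minimal sketch, assuming the same X, y, and
# split parameters as the test (the exact figure depends on the dataset):
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, random_state=42)
baseline = LinearRegression().fit(X_tr, y_tr)
baseline_mse = mean_squared_error(y_te, baseline.predict(X_te))  # ~2820 per the docstring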
def test_similarity_tree():
    # Test that rules are well split into bags of similar rules
    rules = [
        ("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)),
        ("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)),
        ("a > 2 and b > 45", (0.5, 0.3, 0)),
        ("a > 2 and b > 40", (0.5, 0.2, 0)),
        ("a <= 2 and b <= 45", (1, 1, 0)),
        ("a > 2 and c <= 3", (1, 1, 0)),
        ("b > 45", (1, 1, 0)),
    ]

    sk = SkopeRules(max_depth_duplication=2)
    rulesets = sk._find_similar_rulesets(rules)

    # Assert that some pairs of rules end up in the same bag
    idx_bags_rules = []
    for idx_rule, r in enumerate(rules):
        idx_bags_for_rule = []
        for idx_bag, bag in enumerate(rulesets):
            if r in bag:
                idx_bags_for_rule.append(idx_bag)
        idx_bags_rules.append(idx_bags_for_rule)

    assert_equal(idx_bags_rules[0], idx_bags_rules[1])
    assert_not_equal(idx_bags_rules[0], idx_bags_rules[2])

    # Assert that the best rules are kept
    final_rules = sk.deduplicate(rules)
    assert_in(rules[0], final_rules)
    assert_in(rules[2], final_rules)
    assert_not_in(rules[3], final_rules)
def test_skope_rules():
    """Check various parameter settings."""
    X_train = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
               [6, 3], [-4, -7]]
    y_train = [0] * 6 + [1] * 2
    X_test = np.array([[2, 1], [1, 1]])

    grid = ParameterGrid({
        "feature_names": [None, ['a', 'b']],
        "precision_min": [0.],
        "recall_min": [0.],
        "n_estimators": [1],
        "max_samples": [0.5, 4],
        "max_samples_features": [0.5, 2],
        "bootstrap": [True, False],
        "bootstrap_features": [True, False],
        "max_depth": [2],
        "max_features": ["auto", 1, 0.1],
        "min_samples_split": [2, 0.1],
        "n_jobs": [-1, 2],
    })

    with ignore_warnings():
        for params in grid:
            SkopeRules(random_state=rng,
                       **params).fit(X_train, y_train).predict(X_test)

        # additional parameters:
        SkopeRules(n_estimators=50,
                   max_samples=1.,
                   recall_min=0.,
                   precision_min=0.).fit(X_train, y_train).predict(X_test)
def test_f1_score():
    clf = SkopeRules()
    rule0 = ('a > 0', (0, 0, 0))
    rule1 = ('a > 0', (0.5, 0.5, 0))
    rule2 = ('a > 0', (0.5, 0, 0))

    assert_equal(clf.f1_score(rule0), 0)
    assert_equal(clf.f1_score(rule1), 0.5)
    assert_equal(clf.f1_score(rule2), 0)
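# The expected values above are consistent with the standard F1 formula
# applied to the (precision, recall, n_points) tuple attached to each rule.
# A minimal sketch of that computation (inferred from the test, not the
# library's actual source):
def f1_from_rule(rule):
    precision, recall = rule[1][0], rule[1][1]
    if precision + recall == 0:
        return 0.0  # guard against division by zero, as rule0 implies
    return 2 * precision * recall / (precision + recall)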
def test_creates_rules():
    clf = SkopeRules(regression=True,
                     max_depth_duplication=2,
                     n_estimators=30,
                     precision_min=0.0,
                     recall_min=0.0,
                     feature_names=feature_names)
    clf.fit(X, y)
    rules = clf.rules_
    assert len(rules) > 0
def test_performances():
    X, y = make_blobs(n_samples=1000, random_state=0, centers=2)

    # make labels imbalanced by removing all but the first 100 class-1 instances
    indexes = np.ones(X.shape[0]).astype(bool)
    ind = np.array([False] * 100 + list((y == 1)[100:]))
    indexes[ind] = 0
    X = X[indexes]
    y = y[indexes]
    n_samples, n_features = X.shape

    clf = SkopeRules()
    # fit
    clf.fit(X, y)
    # with lists
    clf.fit(X.tolist(), y.tolist())
    y_pred = clf.predict(X)
    assert_equal(y_pred.shape, (n_samples,))

    # training set performance
    assert_greater(accuracy_score(y, y_pred), 0.83)

    # decision_function agrees with predict
    decision = -clf.decision_function(X)
    assert_equal(decision.shape, (n_samples,))
    dec_pred = (decision.ravel() < 0).astype(int)  # np.int is deprecated
    assert_array_equal(dec_pred, y_pred)
def ML_exp(X, y_true, feature_names):
    from sklearn import tree
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from skrules import SkopeRules
    from tensorflow.keras import optimizers

    clfs = {}
    clfs['KNN'] = KNeighborsClassifier(n_neighbors=3)
    clfs['DT'] = tree.DecisionTreeClassifier()
    clfs['NB'] = GaussianNB()
    clfs['RB'] = SkopeRules(max_depth_duplication=None,
                            n_estimators=30,
                            precision_min=0.6,
                            recall_min=0.01,
                            feature_names=feature_names)

    mlp = getMLP(X.shape[-1], num_class=2)
    mlp.compile(loss='categorical_crossentropy',
                optimizer=optimizers.Adam(learning_rate=0.001),  # `lr` is deprecated in newer Keras
                metrics=['accuracy'])
    clfs['MLP'] = mlp

    clfs['Voting'] = ensemble.get_VotingClassifier_ensemble_model(feature_names)
    boosting_clfs = ensemble.get_ada_boosting_clfs(feature_names)
    for key in boosting_clfs:
        clfs[key] = boosting_clfs[key]

    wrong_instances_clf, f1_records = cross_validation(clfs, X, y_true)
    return wrong_instances_clf, f1_records
def test_max_samples_attribute():
    X = iris.data
    y = iris.target
    y = (y != 0)

    clf = SkopeRules(max_samples=1.).fit(X, y)
    assert_equal(clf.max_samples_, X.shape[0])

    clf = SkopeRules(max_samples=500)
    assert_warns_message(
        UserWarning,
        "max_samples will be set to n_samples for estimation",
        clf.fit, X, y)
    assert_equal(clf.max_samples_, X.shape[0])

    clf = SkopeRules(max_samples=0.4).fit(X, y)
    assert_equal(clf.max_samples_, 0.4 * X.shape[0])
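# How max_samples is resolved into the fitted max_samples_ attribute, as
# inferred from the assertions above (a sketch, not the library's code):
import numbers
import warnings

def resolve_max_samples(max_samples, n_samples):
    if isinstance(max_samples, numbers.Integral):
        if max_samples > n_samples:
            warnings.warn("max_samples will be set to n_samples for estimation")
            return n_samples
        return max_samples
    return max_samples * n_samples  # a float is read as a fraction of n_samples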
def get_VotingClassifier_ensemble_model(feature_names):
    from sklearn.ensemble import VotingClassifier
    from sklearn import tree
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from skrules import SkopeRules

    KNN = KNeighborsClassifier(n_neighbors=3)
    decision_tree = tree.DecisionTreeClassifier()
    NB = GaussianNB()
    # built here but not included in the voting ensemble below
    RB = SkopeRules(max_depth_duplication=None,
                    n_estimators=30,
                    precision_min=0.6,
                    recall_min=0.01,
                    feature_names=feature_names)

    eclf1 = VotingClassifier(estimators=[('KNN', KNN),
                                         ('DT', decision_tree),
                                         ('NB', NB)],
                             voting='hard')
    return eclf1
def test_deduplication_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
              [10, 5], [5, -7]]

    # Smoke-test the full API with rule deduplication enabled
    clf = SkopeRules(random_state=rng, max_samples=1., max_depth_duplication=3)
    clf.fit(X, y)
    decision_func = clf.decision_function(X_test)
    rules_vote = clf.rules_vote(X_test)
    score_top_rules = clf.score_top_rules(X_test)
    pred = clf.predict(X_test)
    pred_score_top_rules = clf.predict_top_rules(X_test, 1)
def _getSkopeRules(X_train, y_train, model_params):
    # Rules
    print("Obtaining rules using SkopeRules...")
    clf = SkopeRules(**model_params)
    clf.fit(X_train, y_train)
    rules = clf.rules_

    if len(rules) > 0:
        print("Checking inliers inside hypercubes...")
        df_rules = pd.DataFrame({
            "rule": [v[0].replace(" and ", " & ") for v in rules],
            "precision": [v[1][0] for v in rules],
            "recall": [v[1][1] for v in rules],
            "n_points_correct": [v[1][2] for v in rules],
        })
        if not df_rules.empty:
            df_rules["size_rules"] = df_rules.apply(
                lambda x: len(x["rule"].split("&")), axis=1)
        else:
            df_rules["size_rules"] = 0

    rules = [v[0].replace(" and ", " & ") for v in rules]

    # Obtain rules in df format
    if len(rules) > 0:
        print("Turning rules into hypercubes...")
        df_rules_results = turn_rules_to_df(list_rules=rules,
                                            list_cols=feature_cols)
        df_rules_pruned = simplifyRules(df_rules_results, categorical_cols)
        df_rules_pruned = df_rules_pruned.reset_index().merge(
            df_rules.reset_index()[["index", "size_rules"]], how="left")
        df_rules_pruned.index = df_rules_pruned["index"]
        df_rules_pruned = df_rules_pruned.drop(columns=["index"],
                                               errors="ignore")
        df_rules_results = df_rules_pruned
    else:
        df_rules_results = pd.DataFrame()

    return df_rules_results
def test_can_predict():
    clf = SkopeRules(regression=True,
                     max_depth_duplication=2,
                     n_estimators=30,
                     precision_min=0.20,
                     recall_min=0.20,
                     feature_names=feature_names)
    clf.fit(X, y)
    clf.predict(X)
def test_skope_rules_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
              [10, 5], [5, -7]]

    # Fit and score with the full API
    clf = SkopeRules(random_state=rng, max_samples=1.)
    clf.fit(X, y)
    decision_func = clf.decision_function(X_test)
    rules_vote = clf.rules_vote(X_test)
    score_top_rules = clf.score_top_rules(X_test)
    pred = clf.predict(X_test)
    pred_score_top_rules = clf.predict_top_rules(X_test, 1)

    # assert that the outliers are detected:
    assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
    assert_greater(np.min(rules_vote[-2:]), np.max(rules_vote[:-2]))
    assert_greater(np.min(score_top_rules[-2:]), np.max(score_top_rules[:-2]))
    assert_array_equal(pred, 6 * [0] + 2 * [1])
    assert_array_equal(pred_score_top_rules, 6 * [0] + 2 * [1])
def main():
    mail = get_feat_scores()  # pandas DataFrame
    train, test = train_test_split(mail, test_size=0.3)  # split up data

    x_train = train.drop(columns=['label'])  # remove labels from train x
    y_train = train.drop(columns=['message', 'sf', 'hf'])
    cv = CountVectorizer(input='content',
                         stop_words=stp.words('english'),
                         ngram_range=(1, 2))
    x_tr = cv.fit_transform(x_train.message)  # vectorize x_train text

    skr = SkopeRules(n_estimators=30, feature_names=['sf', 'hf'])
    y_train = y_train.to_numpy().ravel()  # flatten y_train to a 1-D array
    y_train = y_train.astype('int')
    skr.fit(x_tr.toarray(), y_train)

    # held-out test data (fixed: the original reused the training split here)
    x_test = test.drop(columns=['label'])
    y_test = test.drop(columns=['message', 'sf', 'hf'])
    x_tst = cv.transform(x_test.message)
    y_test = y_test.to_numpy().ravel()
    y_test = y_test.astype('int')
    y_score = skr.score_top_rules(x_tst.toarray())

    # metrics
    recall_scr = recall_score(y_test, y_score, average='micro')
    f1_scr = f1_score(y_test, y_score, average='micro')
    pr_score = precision_score(y_test, y_score, average='micro')
    print("recall: " + str(recall_scr))
    print("f1: " + str(f1_scr))
    print("precision: " + str(pr_score))

    # plot
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    plt.plot(recall, precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve')
    plt.show()
def KfoldAcc(X, y, multiclass=False, k=10):
    kf = KFold(n_splits=k, shuffle=True)  # fixed: `k` was previously ignored
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        neigh = KNeighborsClassifier()
        neigh.fit(X_train, y_train)
        neigh_y_pred = neigh.predict(X_test)
        accuracy['neigh'].append(
            accuracy_score(y_test, neigh_y_pred, normalize=True,
                           sample_weight=None))
        print('---------')

        tree = DecisionTreeClassifier()
        tree.fit(X_train, y_train)
        tree_y_pred = tree.predict(X_test)
        accuracy['tree'].append(
            accuracy_score(y_test, tree_y_pred, normalize=True,
                           sample_weight=None))
        print('---------')

        naive = GaussianNB()
        naive.fit(X_train, y_train)
        naive_y_pred = naive.predict(X_test)
        accuracy['naive'].append(
            accuracy_score(y_test, naive_y_pred, normalize=True,
                           sample_weight=None))
        print('---------')

        rule = SkopeRules()
        if multiclass is True:
            rule = OneVsRestClassifier(rule)  # SkopeRules is binary; wrap for multiclass
        rule.fit(X_train, y_train)
        rules_y_pred = rule.predict(X_test)
        accuracy['rule'].append(
            accuracy_score(y_test, rules_y_pred, normalize=True,
                           sample_weight=None))

    return accuracy
def extract_rules(bin_label: pd.Interval, features_df: pd.DataFrame,
                  class_encodings: pd.DataFrame, objective_name: str) -> list:
    """
    Extract rules for the given data and bin label.
    :param bin_label: Bin (interval) whose membership defines the positive class.
    :param features_df: Feature values, one column per feature.
    :param class_encodings: Class encodings per objective.
    :param objective_name: Name of the objective column in class_encodings.
    :return: List of extracted rules:
        (rule, precision, recall, support, result from, result to).
    """
    rules_clf: SkopeRules = SkopeRules(
        max_depth_duplication=None,
        n_estimators=30,
        precision_min=0.2,
        recall_min=0.01,
        feature_names=features_df.columns.values,
        n_jobs=1)
    rules_clf.fit(features_df.values,
                  class_encodings[objective_name] == bin_label)

    return [(rule[0], rule[1][0], rule[1][1], rule[1][2],
             bin_label.left, bin_label.right)
            for rule in rules_clf.rules_]
def BalanceSampleKfold(data, k=10):
    data_np = data.to_numpy()
    X = data_np[:, 0:18]
    y = data_np[:, 18]
    rus_balance = RandomUnderSampler(sampling_strategy=0.20)  # truncate negatives to 5x the positives
    sm_balance = SMOTE()  # then oversample the positives
    kf = KFold(n_splits=k, shuffle=True)  # fixed: `k` was previously ignored
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    recall = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    auc = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print('num of pos:', np.sum(y_train),
              ', num of neg:', y_train.size - np.sum(y_train))

        # balanced sample: undersample, then oversample
        X_bal, y_bal = rus_balance.fit_resample(X_train, y_train)
        print('1. under:')
        print('num of pos:', np.sum(y_bal),
              ', num of neg:', y_bal.size - np.sum(y_bal))
        X_bal, y_bal = sm_balance.fit_resample(X_bal, y_bal)
        print('2. over:')
        print('num of pos:', np.sum(y_bal),
              ', num of neg:', y_bal.size - np.sum(y_bal))
        print('---------------------------------------')

        neigh = KNeighborsClassifier()
        neigh.fit(X_bal, y_bal)
        neigh_y_pred = neigh.predict(X_test)
        neigh_y_score = neigh.predict_proba(X_test)[:, 1]  # scores for AUC
        recall['neigh'].append(
            recall_score(y_test, neigh_y_pred, labels=None, pos_label=1))
        accuracy['neigh'].append(
            accuracy_score(y_test, neigh_y_pred, normalize=True,
                           sample_weight=None))
        auc['neigh'].append(roc_auc_score(y_test, neigh_y_score))
        print('---------')

        tree = DecisionTreeClassifier()
        tree.fit(X_bal, y_bal)
        tree_y_pred = tree.predict(X_test)
        tree_y_score = tree.predict_proba(X_test)[:, 1]  # scores for AUC
        recall['tree'].append(
            recall_score(y_test, tree_y_pred, labels=None, pos_label=1))
        accuracy['tree'].append(
            accuracy_score(y_test, tree_y_pred, normalize=True,
                           sample_weight=None))
        auc['tree'].append(roc_auc_score(y_test, tree_y_score))
        print('---------')

        naive = GaussianNB()
        naive.fit(X_bal, y_bal)
        naive_y_pred = naive.predict(X_test)
        naive_y_score = naive.predict_proba(X_test)[:, 1]  # scores for AUC
        recall['naive'].append(
            recall_score(y_test, naive_y_pred, labels=None, pos_label=1))
        accuracy['naive'].append(
            accuracy_score(y_test, naive_y_pred, normalize=True,
                           sample_weight=None))
        auc['naive'].append(roc_auc_score(y_test, naive_y_score))
        print('---------')

        rule = SkopeRules(feature_names=data.columns.to_list()[0:18])
        rule.fit(X_bal, y_bal)
        rules_y_pred = rule.predict(X_test)
        rule_y_score = rule.rules_vote(X_test)  # rule votes serve as AUC scores
        recall['rule'].append(
            recall_score(y_test, rules_y_pred, labels=None, pos_label=1))
        accuracy['rule'].append(
            accuracy_score(y_test, rules_y_pred, normalize=True,
                           sample_weight=None))
        auc['rule'].append(roc_auc_score(y_test, rule_y_score))

    return accuracy, recall, auc
def OverSampleKfold(data, k=10):
    data_np = data.to_numpy()
    X = data_np[:, 0:18]
    y = data_np[:, 18]
    sm = SMOTE()
    kf = KFold(n_splits=k, shuffle=True)
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    recall = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    auc = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print('num of pos:', np.sum(y_train),
              ', num of neg:', y_train.size - np.sum(y_train))

        X_over, y_over = sm.fit_resample(X_train, y_train)  # oversampled train set
        print('oversample:', X_over.shape, y_over.shape)
        print('---------------------------------------')

        neigh = KNeighborsClassifier()
        neigh.fit(X_over, y_over)
        neigh_y_pred = neigh.predict(X_test)
        neigh_y_score = neigh.predict_proba(X_test)[:, 1]  # scores for AUC
        recall['neigh'].append(
            recall_score(y_test, neigh_y_pred, labels=None, pos_label=1))
        accuracy['neigh'].append(
            accuracy_score(y_test, neigh_y_pred, normalize=True,
                           sample_weight=None))
        auc['neigh'].append(roc_auc_score(y_test, neigh_y_score))
        print('---------')

        tree = DecisionTreeClassifier()
        tree.fit(X_over, y_over)
        tree_y_pred = tree.predict(X_test)
        tree_y_score = tree.predict_proba(X_test)[:, 1]  # scores for AUC
        recall['tree'].append(
            recall_score(y_test, tree_y_pred, labels=None, pos_label=1))
        accuracy['tree'].append(
            accuracy_score(y_test, tree_y_pred, normalize=True,
                           sample_weight=None))
        auc['tree'].append(roc_auc_score(y_test, tree_y_score))
        print('---------')

        naive = GaussianNB()
        naive.fit(X_over, y_over)
        naive_y_pred = naive.predict(X_test)
        naive_y_score = naive.predict_proba(X_test)[:, 1]  # scores for AUC
        recall['naive'].append(
            recall_score(y_test, naive_y_pred, labels=None, pos_label=1))
        accuracy['naive'].append(
            accuracy_score(y_test, naive_y_pred, normalize=True,
                           sample_weight=None))
        auc['naive'].append(roc_auc_score(y_test, naive_y_score))
        print('---------')

        rule = SkopeRules(feature_names=data.columns.to_list()[0:18])
        rule.fit(X_over, y_over)
        rules_y_pred = rule.predict(X_test)
        rule_y_score = rule.rules_vote(X_test)  # rule votes serve as AUC scores
        recall['rule'].append(
            recall_score(y_test, rules_y_pred, labels=None, pos_label=1))
        accuracy['rule'].append(
            accuracy_score(y_test, rules_y_pred, normalize=True,
                           sample_weight=None))
        auc['rule'].append(roc_auc_score(y_test, rule_y_score))

    return accuracy, recall, auc
from sklearn.datasets import load_iris
from skrules import SkopeRules

dataset = load_iris()
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
clf = SkopeRules(max_depth_duplication=2,
                 n_estimators=30,
                 precision_min=0.3,
                 recall_min=0.1,
                 feature_names=feature_names)

# one-vs-rest: learn rules characterizing each species in turn
for idx, species in enumerate(dataset.target_names):
    X, y = dataset.data, dataset.target
    clf.fit(X, y == idx)
    rules = clf.rules_[0:3]
    print("Rules for iris", species)
    for rule in rules:
        print(rule)
    print()
    print(20 * '=')
    print()
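# Each element of clf.rules_ is a (rule_string, (precision, recall, n_points))
# tuple, the layout the other snippets in this section index into via
# v[1][0], v[1][1], and so on. A sketch of unpacking it for nicer output:
for rule, (precision, recall, n_points) in clf.rules_[0:3]:
    print(f"{rule}  (precision={precision:.2f}, recall={recall:.2f})")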
def fit(self, X, y, sample_weight=None, eval_set=None,
        sample_weight_eval_set=None, **kwargs):
    orig_cols = list(X.names)

    import pandas as pd
    import numpy as np
    from skrules import SkopeRules
    from sklearn.preprocessing import OneHotEncoder
    from collections import Counter

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir)

    # Set up temp folder
    tmp_folder = self._create_tmp_folder(logger)

    # Set up model
    if self.num_classes >= 2:
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)

        model = SkopeRules(
            max_depth_duplication=self.params["max_depth_duplication"],
            n_estimators=self.params["n_estimators"],
            precision_min=self.params["precision_min"],
            recall_min=self.params["recall_min"],
            max_samples=self.params["max_samples"],
            max_samples_features=self.params["max_samples_features"],
            max_depth=self.params["max_depth"],
            max_features=self.params["max_features"],
            min_samples_split=self.params["min_samples_split"],
            bootstrap=self.params["bootstrap"],
            bootstrap_features=self.params["bootstrap_features"],
            random_state=self.params["random_state"],
            feature_names=orig_cols)
    else:
        # SkopeRules doesn't work for regression
        loggerinfo(logger, "PASS, no skopes model")
        pass

    # Find the datatypes
    X = X.to_pandas()
    X.columns = orig_cols
    X_datatypes = [str(item) for item in list(X.dtypes)]

    # Cast all float32 columns to float64
    for ii in range(len(X_datatypes)):
        if X_datatypes[ii] == 'float32':
            X = X.astype({orig_cols[ii]: np.float64})
    X_datatypes = [str(item) for item in list(X.dtypes)]

    # List the categorical and numerical features
    self.X_categorical = [
        orig_cols[col_count] for col_count in range(len(orig_cols))
        if (X_datatypes[col_count] == 'category')
        or (X_datatypes[col_count] == 'object')]
    self.X_numeric = [item for item in orig_cols
                      if item not in self.X_categorical]

    # Find the levels and mode for each categorical feature,
    # for use on the test set
    self.train_levels = {}
    self.train_mode = {}  # fixed: was used below without being initialized here
    for item in self.X_categorical:
        self.train_levels[item] = list(set(X[item]))
        self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

    # One-hot encode the categorical features
    # and replace missing values with a "Missing" category
    if len(self.X_categorical) > 0:
        loggerinfo(logger, "Categorical encode")
        for colname in self.X_categorical:
            X[colname] = list(X[colname].fillna("Missing"))
        self.enc = OneHotEncoder(handle_unknown='ignore')
        self.enc.fit(X[self.X_categorical])
        self.encoded_categories = list(
            self.enc.get_feature_names(input_features=self.X_categorical))
        X_enc = self.enc.transform(X[self.X_categorical]).toarray()
        X = pd.concat(
            [X[self.X_numeric],
             pd.DataFrame(X_enc, columns=self.encoded_categories)], axis=1)

    # Replace missing numeric values with a missing-value code
    if len(self.X_numeric) > 0:
        for colname in self.X_numeric:
            X[colname] = list(X[colname].fillna(-999))

    model.fit(np.array(X), np.array(y))

    # Find the rule list
    self.rule_list = model.rules_

    # Calculate feature importances
    var_imp = []
    for var in orig_cols:
        var_imp.append(sum(int(var in item[0]) for item in self.rule_list))
    if max(var_imp) != 0:
        importances = list(np.array(var_imp) / max(var_imp))
    else:
        importances = [1] * len(var_imp)

    pd.DataFrame(model.rules_,
                 columns=['Rule', '(Precision, Recall, nb)']).to_csv(
        os.path.join(tmp_folder, 'Skope_rules.csv'), index=False)

    self.mean_target = np.array(sum(y) / len(y))

    # Set model properties
    self.set_model_properties(model=model,
                              features=list(X.columns),
                              importances=importances,
                              iterations=self.params['n_estimators'])
class DecisionListClassifier(ClassifierMixin, ExplainerMixin):
    """ Decision List Classifier

    Currently a slight variant of SkopeRules from skope-rules.
    https://github.com/scikit-learn-contrib/skope-rules
    """

    available_explanations = ["global", "local"]
    explainer_type = "model"

    def __init__(self, feature_names=None, feature_types=None, **kwargs):
        """ Initializes skope rules.

        Args:
            **kwargs: Keyword arguments to be passed to SkopeRules in skope-rules.
        """
        self.feature_names = feature_names
        self.feature_types = feature_types
        self.kwargs = kwargs

    def fit(self, X, y):
        """ Fits model to provided instances.

        Args:
            X: Numpy array for training instances.
            y: Numpy array as training labels.

        Returns:
            Itself.
        """
        X, y, self.feature_names, self.feature_types = unify_data(
            X, y, self.feature_names, self.feature_types
        )
        self.feature_index_ = [
            "feature_" + str(i) for i, v in enumerate(self.feature_names)
        ]
        self.feature_map_ = {
            v: self.feature_names[i] for i, v in enumerate(self.feature_index_)
        }
        self.sk_model_ = SR(feature_names=self.feature_index_, **self.kwargs)

        self.classes_, y = np.unique(y, return_inverse=True)
        self.sk_model_.fit(X, y)
        self.pos_ratio_ = np.mean(y)

        # Extract rules
        (
            self.internal_rules_,
            self.rules_,
            self.prec_,
            self.recall_,
            self.feat_rule_map_,
        ) = self._extract_rules(self.sk_model_.rules_)

        self.global_selector = gen_global_selector(
            X, self.feature_names, self.feature_types, None
        )
        return self

    def predict(self, X):
        """ Predicts on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Predicted class label per instance.
        """
        X, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types)
        scores = self.predict_proba(X)
        return self.classes_[np.argmax(scores, axis=1)]

    def _scores(self, X):
        df = pd.DataFrame(X, columns=self.feature_index_)
        selected_rules = self.internal_rules_

        # each instance gets the index of the first rule it matches;
        # unmatched instances fall through to the default rule
        scores = np.ones(X.shape[0]) * np.inf
        for k, r in enumerate(selected_rules):
            matched_idx = list(df.query(r[0]).index)
            scores[matched_idx] = np.minimum(k, scores[matched_idx])
        scores[np.isinf(scores)] = len(selected_rules)
        scores = scores.astype("int64")
        return scores

    def predict_proba(self, X):
        """ Provides probability estimates on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Probability estimate of instance for each class.
        """
        X, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types)
        scores = self._scores(X)
        prec_ar = np.array(self.prec_)
        return np.c_[1.0 - prec_ar[scores], prec_ar[scores]]

    def _extract_rules(self, rules):
        rules = deepcopy(rules)
        rules = list(sorted(rules, key=lambda x: x[1][0], reverse=True))

        rule_li = []
        prec_li = []
        recall_li = []
        predict_li = []
        features_dict = {feat: [] for feat in self.feature_names}

        def extract_orig_features(pattern, rule):
            feature_set = set()
            for m in re.finditer(pattern, rule):
                orig_feature = self.feature_map_[m.group(1)]
                feature_set.add(orig_feature)
            return feature_set

        for indx, rule_rec in enumerate(rules):
            rule = rule_rec[0]
            rule_round = " ".join(
                [
                    "{0:.2f}".format(float(x))
                    if x.replace(".", "", 1).isdigit()
                    else x
                    for x in rule.split(" ")
                ]
            )
            pattern = r"(feature_[0-9]+)"
            feature_set = extract_orig_features(pattern, rule_round)
            rule_fix = re.sub(
                pattern, lambda m: self.feature_map_[m.group(1)], rule_round
            )

            rule_li.append(rule_fix)
            prec_li.append(rule_rec[1][0])
            recall_li.append(rule_rec[1][1])
            predict_li.append(1.0)
            for feat in feature_set:
                features_dict[feat].append(indx)

        # Provide default rule
        rule_li.append("No Rules Triggered")
        prec_li.append(self.pos_ratio_)
        recall_li.append(1.0)
        predict_li.append(0.0)

        return rules, rule_li, prec_li, recall_li, features_dict

    def explain_local(self, X, y=None, name=None):
        if name is None:
            name = gen_name_from_class(self)
        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        scores = self._scores(X)
        outcomes = self.predict(X)
        prob_scores = self.predict_proba(X)

        data_dicts = []
        for idx, score in enumerate(scores):
            data_dict = {
                "type": "rule",
                "rule": [self.rules_[score]],
                "precision": [self.prec_[score]],
                "recall": [self.recall_[score]],
                "outcome": [outcomes[idx]],
            }
            data_dicts.append(data_dict)

        internal_obj = {"overall": None, "specific": data_dicts}
        selector = gen_local_selector(X, y, prob_scores[:, 1])

        return RulesExplanation(
            "local",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=selector,
        )

    def explain_global(self, name=None):
        """ Provides global explanation for model.

        Args:
            name: User-defined explanation name.

        Returns:
            An explanation object,
            visualizing feature-value pairs as horizontal bar chart.
        """
        if name is None:
            name = gen_name_from_class(self)

        # Extract rules
        rules, prec, recall, feat_rule_map = (
            self.rules_,
            self.prec_,
            self.recall_,
            self.feat_rule_map_,
        )
        outcomes = [self.classes_[1]] * (len(self.rules_) - 1)
        # Add the zero case for the default rule
        outcomes.append(self.classes_[0])

        overall_data_dict = {
            "type": "rule",
            "rule": rules,
            "precision": prec,
            "recall": recall,
            "outcome": outcomes,
        }
        data_dicts = [
            {
                "type": "rule",
                "rule": [rules[i] for i in feat_rule_map[feature]],
                "precision": [prec[i] for i in feat_rule_map[feature]],
                "recall": [recall[i] for i in feat_rule_map[feature]],
                "outcome": [outcomes[i] for i in feat_rule_map[feature]],
            }
            for feature in self.feature_names
        ]

        internal_obj = {"overall": overall_data_dict, "specific": data_dicts}

        return RulesExplanation(
            "global",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=self.global_selector,
        )
                                      batch_size=1000)
    return neural_net


# Train the network, then make predictions on the test set and print the results
neural_network = compute_semi_supervised_learning(neural_network, X_train,
                                                  Y_train)
neural_network_pred = np.array(neural_network.predict_classes(np.array(X_test)))
print_statistics("Semi-Supervised Neural Network", neural_network_pred, Y_test)

# ************* Rule Model: ************************
# Here we compare 3 skope-rules models on the validation set

# First skope rules model
rule_clf1 = SkopeRules(n_estimators=50, precision_min=0.2, recall_min=0.2,
                       feature_names=feature_names)
rule_clf1.fit(X_val, Y_val)
rule_clf1_ypred = rule_clf1.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 1", rule_clf1_ypred,
                 Y_test_val)

# Second skope rules model
rule_clf2 = SkopeRules(n_estimators=50, precision_min=0.2, recall_min=0.2,
                       feature_names=feature_names)
rule_clf2.fit(X_val, Y_val)
rule_clf2_ypred = rule_clf2.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 2", rule_clf2_ypred,
                 Y_test_val)

# Third skope rules model
def train(self, feature_names, symbol_vars):
    # model = xgboost.XGBClassifier(max_depth=7, n_estimators=10)
    # class_w = class_weight.compute_class_weight("balanced", np.unique(y), y)
    self.pddata['Y'] = (self.pddata['Y'] == self.args.label)
    self.pddata.to_csv(
        os.path.join(
            self.outdir,
            self.outname +
            datetime.datetime.now().strftime("%Y_%m_%d_%H_%M.csv")))

    traindata = self.pddata.sample(frac=0.8, replace=True)
    traindata = traindata.reset_index(drop=1)
    sample_weight = class_weight.compute_sample_weight("balanced",
                                                       traindata['Y'])
    X = traindata.iloc[:, 1:].to_numpy()
    y = traindata['Y']
    self.sample_weight = sample_weight
    data = xgboost.DMatrix(data=X, label=y, feature_names=feature_names,
                           feature_types=['int'] * X.shape[-1],
                           weight=sample_weight)
    self.feature_names = feature_names
    d = X.shape[-1]

    feature_combination = []
    for sym in symbol_vars:
        print(sym)
        if len(symbol_vars[sym]) > 0:
            feature_combination.append(symbol_vars[sym])

    import pickle
    if self.in_param_file:
        # pickle files must be opened in binary mode (fixed: was text mode)
        with open(self.in_param_file, 'rb') as f:
            params = pickle.load(f)
        model = xgboost.train(params=params, dtrain=data,
                              num_boost_round=self.args.ntrees)
    else:
        t_clf = self.tune()
        model = t_clf.best_estimator_._Booster
        params = t_clf.best_params_
        # params['rate_drop'] = 0.1
        # params['skip_drop'] = 0.5
        # params['normalize_type'] = 'tree'
        with open(self.param_file, 'wb') as f:
            pickle.dump(params, f)

    print(self.linear)
    if self.args.debug:
        embed()
    model.dump_model(self.modelfile, with_stats=True)

    clf = SkopeRules(max_depth_duplication=self.args.depth,
                     precision_min=0.6,
                     recall_min=0.005,
                     verbose=1,
                     feature_names=feature_names)
    evaldata = self.pddata.sample(frac=0.3, replace=True)
    evaldata = evaldata.reset_index(drop=1)
    eval_sample_weight = class_weight.compute_sample_weight(
        "balanced", evaldata['Y'])
    clf.fit_xgbmodel(evaldata, model, eval_sample_weight)
    print("end fit_xgbmodel")

    # sort rules by (precision, recall, nb), best first
    clf.rules_.sort(key=lambda x: x[1], reverse=True)
    rules = {}
    for i in range(len(clf.rules_)):
        r = trim_rule(clf.rules_[i], evaldata, eval_sample_weight)
        rules[r[0]] = r[1]
    rulelist = []
    for r in rules:
        rulelist.append([r, rules[r]])
    rulelist.sort(key=lambda x: x[1], reverse=True)

    usedLinear = {}
    toLatex(rulelist, self.rule_latex)
    for lname in self.linear:
        if any(lname in r[0] for r in rulelist):
            usedLinear[lname] = self.linear[lname]
            print("%s=%s" % (lname, usedLinear[lname][0]))

    sym_vars = symbol_vars
    var_sizes = [len(sym_vars['c']), len(sym_vars['I']),
                 len(sym_vars['Ialt']), len(sym_vars['s']),
                 len(sym_vars['salt'])]
    allr1, allr = simplify_rules(clf.rules_)
    # cnf = tocnffile(var_sizes, allr1, self.cnffile)
    allrscore = xgbtree_rule_perf(str(allr1), evaldata, evaldata['Y'],
                                  eval_sample_weight)
    print("all r =", simplify(~allr), allrscore)
    self.saverules(clf.rules_, [simplify(allr), allrscore], self.rulefile)
    if self.args.debug:
        embed()
# a number of rules, each seeking high precision on a potentially small
# area of detection (low recall).

###############################################################################
# Getting rules with skrules
# ..........................
#
# This part shows how SkopeRules can be fitted to detect credit defaults.
# Performances are compared with the random forest model previously trained.

# fit the model
# (note: similarity_thres comes from an early skope-rules API; the otherwise
# identical example below uses max_depth_duplication instead)
clf = SkopeRules(similarity_thres=.9,
                 max_depth=3,
                 max_features=0.5,
                 max_samples_features=0.5,
                 random_state=rng,
                 n_estimators=30,
                 feature_names=feature_names,
                 recall_min=0.02,
                 precision_min=0.6)
clf.fit(X_train, y_train)

# in the separate_rules_score method, a score of k means that rule number k
# votes positively, but not rules 1, ..., k-1. This lets us plot the
# performance of each rule separately on the ROC and PR plots.
scoring = clf.separate_rules_score(X_test)

print(str(len(clf.rules_)) + ' rules have been built.')
print('The most precise rules are the following:')
print(clf.rules_[:5])

curves = [roc_curve, precision_recall_curve]
xlabels = ['False Positive Rate', 'Recall (True Positive Rate)']
# area of detection (low recall).

###############################################################################
# Getting rules with skrules
# ..........................
#
# This part shows how SkopeRules can be fitted to detect credit defaults.
# Performances are compared with the random forest model previously trained.

# fit the model
clf = SkopeRules(max_depth_duplication=3,
                 max_depth=3,
                 max_features=0.5,
                 max_samples_features=0.5,
                 random_state=rng,
                 n_estimators=20,
                 feature_names=feature_names,
                 recall_min=0.04,
                 precision_min=0.6)
clf.fit(X_train, y_train)

# in the score_top_rules method, a score of k means that rule number k
# votes positively, but not rules 1, ..., k-1. This lets us plot the
# performance of each rule separately on the ROC and PR plots.
scoring = clf.score_top_rules(X_test)

print(str(len(clf.rules_)) + ' rules have been built.')
print('The 5 most precise rules are the following:')
for rule in clf.rules_[:5]:
    print(rule[0])
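# Since `scoring` is an ordinary score vector, sklearn's threshold curves
# apply to it directly; each threshold corresponds to activating only the
# top-k rules, so the curves trace per-rule operating points. A sketch using
# y_test from the surrounding example:
from sklearn.metrics import precision_recall_curve, roc_curve

fpr, tpr, _ = roc_curve(y_test, scoring)
precision, recall, _ = precision_recall_curve(y_test, scoring)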
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
X1 = feat_selector.fit_transform(X_bal, y_bal)

from sklearn.feature_selection import SelectKBest, f_classif
X2 = SelectKBest(f_classif, k=20).fit_transform(X_bal, y_bal)

# no feature selection
for train_index, test_index in kf.split(X_bal):
    X_train, X_test = X_bal[train_index], X_bal[test_index]
    y_train, y_test = y_bal[train_index], y_bal[test_index]

    neigh = KNeighborsClassifier()
    tree = DecisionTreeClassifier()
    naive = GaussianNB()
    rule = SkopeRules(feature_names=data.columns.to_list()[0:18])

    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    accuracy_no['neigh'].append(
        accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))

    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    accuracy_no['tree'].append(
        accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))

    naive.fit(X_train, y_train)
    y_pred = naive.predict(X_test)
    accuracy_no['naive'].append(
        accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))

    rule.fit(X_train, y_train)
    y_pred = rule.predict(X_test)
def Sample2c(data, model):
    # use sampling methods to rebalance the data and train the chosen model
    # model in ['SVC', 'tree', 'kNN', 'rule', 'RF', 'Adaboost']
    from sklearn.model_selection import KFold
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import precision_recall_fscore_support
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from skrules import SkopeRules
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.multiclass import OneVsRestClassifier

    if model not in ['SVC', 'tree', 'kNN', 'rule', 'RF', 'Adaboost']:
        print('model not supported')
        return

    X, y = npData2c(data)

    acc_5c = []  # store averaged results (10 folds)
    confusion_mat_5c = []
    precision_5c = []
    recall_5c = []
    fscore_5c = []
    precision_all = []  # store raw per-class scores (10 per model)
    recall_all = []
    fscore_all = []

    kf = KFold(n_splits=10, shuffle=True)

    # balanced sampling: undersample the majority, then oversample the minority
    a = 0
    p, r, f = np.zeros([2]), np.zeros([2]), np.zeros([2])
    c = np.zeros([2, 2])
    P = np.zeros([10, 2])
    R = np.zeros([10, 2])
    F = np.zeros([10, 2])
    rus_balance = RandomUnderSampler(sampling_strategy={0: 30000})
    sm_balance = SMOTE(sampling_strategy={1: 15000})
    i = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train, y_train = rus_balance.fit_resample(X_train, y_train)
        X_train, y_train = sm_balance.fit_resample(X_train, y_train)

        if model == 'SVC':
            clf = SVC(kernel='rbf', gamma='scale')
        elif model == 'tree':
            clf = DecisionTreeClassifier()
        elif model == 'kNN':
            clf = KNeighborsClassifier()
        elif model == 'rule':
            rule = SkopeRules()
            clf = OneVsRestClassifier(rule)  # SkopeRules is binary; wrap it
        elif model == 'RF':
            clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                                         random_state=0)
        elif model == 'Adaboost':
            clf = AdaBoostClassifier(n_estimators=100, random_state=0)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        a += accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
        c += confusion_matrix(y_test, y_pred)
        prf = precision_recall_fscore_support(y_test, y_pred)
        p += prf[0]
        r += prf[1]
        f += prf[2]
        P[i][0], P[i][1] = prf[0][0], prf[0][1]
        R[i][0], R[i][1] = prf[1][0], prf[1][1]
        F[i][0], F[i][1] = prf[2][0], prf[2][1]
        i += 1
        print(accuracy_score(y_test, y_pred, normalize=True,
                             sample_weight=None))
        print(confusion_matrix(y_test, y_pred))
        print(precision_recall_fscore_support(y_test, y_pred))

    acc_5c.append(a / 10)
    confusion_mat_5c.append(c / 10)
    precision_5c.append(p / 10)
    recall_5c.append(r / 10)
    fscore_5c.append(f / 10)
    precision_all.append(P)
    recall_all.append(R)
    fscore_all.append(F)

    return (acc_5c, confusion_mat_5c, precision_5c, recall_5c, fscore_5c,
            [precision_all, recall_all, fscore_all])
class DecisionListClassifier(ClassifierMixin, ExplainerMixin):  # pragma: no cover
    """ Decision List Classifier

    Currently a slight variant of SkopeRules from skope-rules.
    https://github.com/scikit-learn-contrib/skope-rules
    """

    available_explanations = ["global", "local"]
    explainer_type = "model"

    def __init__(self, feature_names=None, feature_types=None, **kwargs):
        """ Initializes class.

        Args:
            feature_names: List of feature names.
            feature_types: List of feature types.
            **kwargs: Kwargs passed to wrapped SkopeRules at initialization time.
        """
        self.feature_names = feature_names
        self.feature_types = feature_types
        self.kwargs = kwargs

    def fit(self, X, y):
        """ Fits model to provided instances.

        Args:
            X: Numpy array for training instances.
            y: Numpy array as training labels.

        Returns:
            Itself.
        """
        from skrules import SkopeRules as SR

        X, y, self.feature_names, self.feature_types = unify_data(
            X, y, self.feature_names, self.feature_types)
        self.feature_index_ = [
            "feature_" + str(i) for i, v in enumerate(self.feature_names)
        ]
        self.feature_map_ = {
            v: self.feature_names[i] for i, v in enumerate(self.feature_index_)
        }
        self.sk_model_ = SR(feature_names=self.feature_index_, **self.kwargs)

        self.classes_, y = np.unique(y, return_inverse=True)
        self.sk_model_.fit(X, y)
        self.pos_ratio_ = np.mean(y)

        # Extract rules
        (
            self.internal_rules_,
            self.rules_,
            self.prec_,
            self.recall_,
            self.feat_rule_map_,
        ) = self._extract_rules(self.sk_model_.rules_)

        self.global_selector = gen_global_selector(X, self.feature_names,
                                                   self.feature_types, None)
        return self

    def predict(self, X):
        """ Predicts on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Predicted class label per instance.
        """
        X, _, _, _ = unify_data(X, None, self.feature_names,
                                self.feature_types)
        scores = self.predict_proba(X)
        return self.classes_[np.argmax(scores, axis=1)]

    def _scores(self, X):
        df = pd.DataFrame(X, columns=self.feature_index_)
        selected_rules = self.internal_rules_

        # each instance gets the index of the first rule it matches;
        # unmatched instances fall through to the default rule
        scores = np.ones(X.shape[0]) * np.inf
        for k, r in enumerate(selected_rules):
            matched_idx = list(df.query(r[0]).index)
            scores[matched_idx] = np.minimum(k, scores[matched_idx])
        scores[np.isinf(scores)] = len(selected_rules)
        scores = scores.astype("int64")
        return scores

    def predict_proba(self, X):
        """ Provides probability estimates on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Probability estimate of instance for each class.
        """
        X, _, _, _ = unify_data(X, None, self.feature_names,
                                self.feature_types)
        scores = self._scores(X)
        prec_ar = np.array(self.prec_)
        return np.c_[1.0 - prec_ar[scores], prec_ar[scores]]

    def _extract_rules(self, rules):
        rules = deepcopy(rules)
        rules = list(sorted(rules, key=lambda x: x[1][0], reverse=True))

        rule_li = []
        prec_li = []
        recall_li = []
        predict_li = []
        features_dict = {feat: [] for feat in self.feature_names}

        def extract_orig_features(pattern, rule):
            feature_set = set()
            for m in re.finditer(pattern, rule):
                orig_feature = self.feature_map_[m.group(1)]
                feature_set.add(orig_feature)
            return feature_set

        for indx, rule_rec in enumerate(rules):
            rule = rule_rec[0]
            rule_round = " ".join([
                "{0:.2f}".format(float(x))
                if x.replace(".", "", 1).isdigit() else x
                for x in rule.split(" ")
            ])
            pattern = r"(feature_[0-9]+)"
            feature_set = extract_orig_features(pattern, rule_round)
            rule_fix = re.sub(pattern,
                              lambda m: self.feature_map_[m.group(1)],
                              rule_round)

            rule_li.append(rule_fix)
            prec_li.append(rule_rec[1][0])
            recall_li.append(rule_rec[1][1])
            predict_li.append(1.0)
            for feat in feature_set:
                features_dict[feat].append(indx)

        # Provide default rule
        rule_li.append("No Rules Triggered")
        prec_li.append(self.pos_ratio_)
        recall_li.append(1.0)
        predict_li.append(0.0)

        return rules, rule_li, prec_li, recall_li, features_dict

    def explain_local(self, X, y=None, name=None):
        """ Provides local explanations for provided instances.

        Args:
            X: Numpy array for X to explain.
            y: Numpy vector for y to explain.
            name: User-defined explanation name.

        Returns:
            An explanation object.
        """
        if name is None:
            name = gen_name_from_class(self)
        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        scores = self._scores(X)
        outcomes = self.predict(X)
        prob_scores = self.predict_proba(X)

        data_dicts = []
        for idx, score in enumerate(scores):
            data_dict = {
                "type": "rule",
                "rule": [self.rules_[score]],
                "precision": [self.prec_[score]],
                "recall": [self.recall_[score]],
                "outcome": [outcomes[idx]],
            }
            data_dicts.append(data_dict)

        internal_obj = {"overall": None, "specific": data_dicts}
        selector = gen_local_selector(data_dicts, is_classification=True)

        return RulesExplanation(
            "local",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=selector,
        )

    def explain_global(self, name=None):
        """ Provides global explanation for model.

        Args:
            name: User-defined explanation name.

        Returns:
            An explanation object.
        """
        if name is None:
            name = gen_name_from_class(self)

        # Extract rules
        rules, prec, recall, feat_rule_map = (
            self.rules_,
            self.prec_,
            self.recall_,
            self.feat_rule_map_,
        )
        outcomes = [self.classes_[1]] * (len(self.rules_) - 1)
        # Add the zero case for the default rule
        outcomes.append(self.classes_[0])

        overall_data_dict = {
            "type": "rule",
            "rule": rules,
            "precision": prec,
            "recall": recall,
            "outcome": outcomes,
        }
        data_dicts = [{
            "type": "rule",
            "rule": [rules[i] for i in feat_rule_map[feature]],
            "precision": [prec[i] for i in feat_rule_map[feature]],
            "recall": [recall[i] for i in feat_rule_map[feature]],
            "outcome": [outcomes[i] for i in feat_rule_map[feature]],
        } for feature in self.feature_names]

        internal_obj = {"overall": overall_data_dict, "specific": data_dicts}

        return RulesExplanation(
            "global",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=self.global_selector,
        )
def Sample5c(X, y, model):
    # use sampling methods to rebalance the data and train the chosen model
    # model in ['SVC', 'tree', 'kNN', 'rule', 'RF', 'Adaboost']
    if model not in ['SVC', 'tree', 'kNN', 'rule', 'RF', 'Adaboost']:
        print('model not supported')
        return

    acc_5c = []  # store averaged results (10 folds)
    confusion_mat_5c = []
    precision_5c = []
    recall_5c = []
    fscore_5c = []
    precision_all = []  # store raw scores for classes 4 and 5 (10 per model)
    recall_all = []
    fscore_all = []

    kf = KFold(n_splits=10, shuffle=True)

    # regime 1: only downsample the majority classes (1, 2, 3, 4)
    a = 0
    p, r, f = np.zeros([5]), np.zeros([5]), np.zeros([5])
    c = np.zeros([5, 5])
    P = np.zeros([10, 2])
    R = np.zeros([10, 2])
    F = np.zeros([10, 2])
    rus_balance = RandomUnderSampler(sampling_strategy={1: 2500, 2: 2500,
                                                        3: 2500, 4: 2500})
    i = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train, y_train = rus_balance.fit_resample(X_train, y_train)

        if model == 'SVC':
            clf = SVC(kernel='rbf', gamma='scale')
        elif model == 'tree':
            clf = DecisionTreeClassifier()
        elif model == 'kNN':
            clf = KNeighborsClassifier()
        elif model == 'rule':
            rule = SkopeRules()
            clf = OneVsRestClassifier(rule)  # SkopeRules is binary; wrap it
        elif model == 'RF':
            clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                                         random_state=0)
        elif model == 'Adaboost':
            clf = AdaBoostClassifier(n_estimators=100, random_state=0)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        a += accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
        c += confusion_matrix(y_test, y_pred)
        prf = precision_recall_fscore_support(y_test, y_pred)
        p += prf[0]
        r += prf[1]
        f += prf[2]
        P[i][0], P[i][1] = prf[0][3], prf[0][4]
        R[i][0], R[i][1] = prf[1][3], prf[1][4]
        F[i][0], F[i][1] = prf[2][3], prf[2][4]
        i += 1
        print(accuracy_score(y_test, y_pred, normalize=True,
                             sample_weight=None))
        print(confusion_matrix(y_test, y_pred))
        print(precision_recall_fscore_support(y_test, y_pred))

    acc_5c.append(a / 10)
    confusion_mat_5c.append(c / 10)
    precision_5c.append(p / 10)
    recall_5c.append(r / 10)
    fscore_5c.append(f / 10)
    precision_all.append(P)
    recall_all.append(R)
    fscore_all.append(F)

    # regime 2: balanced sampling, undersample 1-3 then oversample 4 and 5
    a = 0
    p, r, f = np.zeros([5]), np.zeros([5]), np.zeros([5])
    c = np.zeros([5, 5])
    P = np.zeros([10, 2])
    R = np.zeros([10, 2])
    F = np.zeros([10, 2])
    rus_balance = RandomUnderSampler(sampling_strategy={1: 10000, 2: 10000,
                                                        3: 10000})
    sm_balance = SMOTE(sampling_strategy={4: 10000, 5: 5000})
    i = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train, y_train = rus_balance.fit_resample(X_train, y_train)
        X_train, y_train = sm_balance.fit_resample(X_train, y_train)

        if model == 'SVC':
            clf = SVC(kernel='rbf', gamma='scale')
        elif model == 'tree':
            clf = DecisionTreeClassifier()
        elif model == 'kNN':
            clf = KNeighborsClassifier()
        elif model == 'rule':
            rule = SkopeRules()
            clf = OneVsRestClassifier(rule)
        elif model == 'RF':
            clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                                         random_state=0)
        elif model == 'Adaboost':
            clf = AdaBoostClassifier(n_estimators=100, random_state=0)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        a += accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
        c += confusion_matrix(y_test, y_pred)
        prf = precision_recall_fscore_support(y_test, y_pred)
        p += prf[0]
        r += prf[1]
        f += prf[2]
        P[i][0], P[i][1] = prf[0][3], prf[0][4]
        R[i][0], R[i][1] = prf[1][3], prf[1][4]
        F[i][0], F[i][1] = prf[2][3], prf[2][4]
        i += 1
        print(accuracy_score(y_test, y_pred, normalize=True,
                             sample_weight=None))
        print(confusion_matrix(y_test, y_pred))
        print(precision_recall_fscore_support(y_test, y_pred))

    acc_5c.append(a / 10)
    confusion_mat_5c.append(c / 10)
    precision_5c.append(p / 10)
    recall_5c.append(r / 10)
    fscore_5c.append(f / 10)
    precision_all.append(P)
    recall_all.append(R)
    fscore_all.append(F)

    return (acc_5c, confusion_mat_5c, precision_5c, recall_5c, fscore_5c,
            [precision_all, recall_all, fscore_all])