def test_f1_score():
    # Rule args follow the (precision, recall, nb) convention;
    # F1 is the harmonic mean 2*p*r / (p + r), taken as 0 when p + r == 0.
    rule0 = Rule('a > 0', (0, 0, 0))
    rule1 = Rule('a > 0', (0.5, 0.5, 0))
    rule2 = Rule('a > 0', (0.5, 0, 0))
    assert f1_score(rule0) == 0
    assert f1_score(rule1) == 0.5
    assert f1_score(rule2) == 0
def split(rule: Rule) -> List[Rule]:
    """Split a multi-term rule into a list of single-term rules.

    A rule that already contains a single term is returned unchanged.
    """
    if len(rule.agg_dict) == 1:
        return [rule]
    else:
        indv_rule_strs = list(map(lambda x: ' '.join(x), rule.terms))
        indv_rules = list(map(lambda x: Rule(x), indv_rule_strs))
        return indv_rules
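def _example_split_usage():
    # Illustrative sketch only, not part of the original module. Assumes the
    # surrounding Rule class, which parses terms joined with " and " and keeps
    # one agg_dict entry per term.
    multi_term = Rule('a > 0 and b <= 3')
    single_terms = split(multi_term)
    assert len(single_terms) == 2                           # one Rule per term
    assert all(len(r.agg_dict) == 1 for r in single_terms)  # each result is single-term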
def _add_OOB_scores_to_rules(self, X, y, rules_from_tree, in_bag_samples, features):
    # Create mask for OOB samples
    mask = ~in_bag_samples
    if sum(mask) == 0:
        warn(
            "OOB evaluation not possible: doing it in-bag. Performance evaluation is likely to be wrong"
            " (overfitting) and selected rules are likely to not perform well! Please use max_samples < 1."
        )
        mask = in_bag_samples

    # XXX todo: idem without dataframe
    X_oob = pandas.DataFrame(
        (X[mask, :])[:, features],
        columns=np.array(self.feature_names_)[features]
    )

    if X_oob.shape[1] <= 1:  # otherwise pandas bug (cf. issue #16363)
        return []

    y_oob = y[mask]
    y_oob = np.array((y_oob != 0))

    # Add OOB performances to rules:
    rules_from_tree = [
        Rule(r, args=self._eval_rule_perf(r, X_oob, y_oob))
        for r in set(rules_from_tree)
    ]
    return rules_from_tree
def test_similarity_tree():
    # Test that rules are split into the correct similarity bags
    rules = [Rule("a <= 2 and b > 45 and c <= 3 and a > 4", args=(1, 1, 0)),
             Rule("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)),
             Rule("a > 2 and b > 45", (0.5, 0.3, 0)),
             Rule("a > 2 and b > 40", (0.5, 0.2, 0)),
             Rule("a <= 2 and b <= 45", (1, 1, 0)),
             Rule("a > 2 and c <= 3", (1, 1, 0)),
             Rule("b > 45", (1, 1, 0))]
    sk = SkopeRulesClassifier(max_depth_duplication=2)
    rulesets = find_similar_rulesets(rules, max_depth_duplication=2)

    # Assert some couples of rules are in the same bag
    idx_bags_rules = []
    for idx_rule, r in enumerate(rules):
        idx_bags_for_rule = []
        for idx_bag, bag in enumerate(rulesets):
            if r in bag:
                idx_bags_for_rule.append(idx_bag)
        idx_bags_rules.append(idx_bags_for_rule)

    assert idx_bags_rules[0] == idx_bags_rules[1]
    assert not idx_bags_rules[0] == idx_bags_rules[2]

    # Assert the best rules are kept
    final_rules = deduplicate(rules, sk.max_depth_duplication)
    assert rules[0] in final_rules
    assert rules[2] in final_rules
    assert not rules[3] in final_rules
def score_lasso(X, y, rules: List[str], alphas=None, cv=3,
                prediction_task='regression',
                max_rules=2000, random_state=None) -> Tuple[List[Rule], List[float], float]:
    if alphas is None:
        if prediction_task == 'regression':
            alphas = _alpha_grid(X, y)
        elif prediction_task == 'classification':
            alphas = [1 / alpha for alpha in np.logspace(-4, 4, num=10, base=10)]

    coef_zero_threshold = 1e-6 / np.mean(np.abs(y))
    mse_cv_scores = []
    nonzero_rule_coefs_count = []
    kf = KFold(cv)

    # alphas are sorted from most regularized to least regularized
    for alpha in alphas:
        if prediction_task == 'regression':
            m = Lasso(alpha=alpha, random_state=random_state)
        else:
            m = LogisticRegression(penalty='l1', C=1 / alpha, solver='liblinear')
        mse_cv = 0
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            m.fit(X_train, y_train)
            mse_cv += np.mean((m.predict(X_test) - y_test) ** 2)
        m.fit(X, y)

        rule_count = np.sum(np.abs(m.coef_.flatten()) > coef_zero_threshold)
        if rule_count > max_rules:
            break
        nonzero_rule_coefs_count.append(rule_count)
        mse_cv_scores.append(mse_cv / cv)

    best_alpha = alphas[np.argmin(mse_cv_scores)]
    if prediction_task == 'regression':
        lscv = Lasso(alpha=best_alpha, random_state=random_state, max_iter=2000)
    else:
        lscv = LogisticRegression(penalty='l1', C=1 / best_alpha, solver='liblinear',
                                  random_state=random_state, max_iter=200)
    lscv.fit(X, y)

    coef_ = lscv.coef_.flatten()
    coefs = list(coef_[:-len(rules)])
    support = np.sum(X[:, -len(rules):], axis=0) / X.shape[0]

    nonzero_rules = []
    for r, w, s in zip(rules, coef_[-len(rules):], support):
        if abs(w) > coef_zero_threshold:
            nonzero_rules.append(Rule(r, args=[w], support=s))
            coefs.append(w)

    return nonzero_rules, coefs, lscv.intercept_
def score_linear(X, y, rules: List[str], penalty='l1', prediction_task='regression',
                 max_rules=30, alpha=None, random_state=None) -> Tuple[List[Rule], List[float], float]:
    if alpha is not None and max_rules is None:
        final_alpha = alpha
    elif max_rules is not None and alpha is None:
        final_alpha = get_best_alpha_under_max_rules(
            X, y, rules,
            penalty=penalty,
            prediction_task=prediction_task,
            max_rules=max_rules,
            random_state=random_state)
    else:
        raise ValueError("Exactly one of alpha and max_rules must be provided")

    if prediction_task == 'regression':
        lscv = Lasso(alpha=final_alpha, random_state=random_state, max_iter=2000)
    else:
        lscv = LogisticRegression(penalty=penalty, C=1 / final_alpha, solver='liblinear',
                                  random_state=random_state, max_iter=200)
    lscv.fit(X, y)

    coef_ = lscv.coef_.flatten()
    coefs = list(coef_[:coef_.shape[0] - len(rules)])
    support = np.sum(X[:, -len(rules):], axis=0) / X.shape[0]

    nonzero_rules = []
    coef_zero_threshold = 1e-6 / np.mean(np.abs(y))
    for r, w, s in zip(rules, coef_[-len(rules):], support):
        if abs(w) > coef_zero_threshold:
            nonzero_rules.append(Rule(r, args=[w], support=s))
            coefs.append(w)

    return nonzero_rules, coefs, lscv.intercept_
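def _example_score_linear_usage():
    # Illustrative sketch only, not part of the original module. score_linear
    # expects X to contain the raw features with the binary rule-activation
    # columns appended as the last len(rules) columns; the fitted weights of
    # those columns become the rule coefficients. All data below is made up.
    import numpy as np
    rng = np.random.default_rng(0)
    X_raw = rng.normal(size=(100, 2))                       # two raw features
    rule_strs = ['feature_0 > 0', 'feature_1 <= 0']         # two candidate rules
    rule_acts = np.column_stack([X_raw[:, 0] > 0,
                                 X_raw[:, 1] <= 0]).astype(float)
    X = np.hstack([X_raw, rule_acts])                       # rule columns go last
    y = X_raw[:, 0] + 2 * rule_acts[:, 1] + rng.normal(scale=0.1, size=100)
    rules, coefs, intercept = score_linear(
        X, y, rule_strs, prediction_task='regression', max_rules=10, random_state=0)
    # `rules` keeps only the rules whose lasso weight exceeds the zero threshold,
    # each carrying its weight (args) and empirical support.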
def score_lasso(X, y, rules: List[str], alphas=None, cv=3,
                max_rules=2000, random_state=None) -> Tuple[List[Rule], List[float], float]:
    if alphas is None:
        alphas = _alpha_grid(X, y)

    coef_zero_threshold = 1e-6 / np.mean(np.abs(y))
    mse_cv_scores = []
    nonzero_rule_coefs_count = []
    kf = KFold(cv)

    for alpha in alphas:  # alphas are sorted from largest to smallest
        m = Lasso(alpha=alpha, random_state=random_state)
        mse_cv = 0
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            m.fit(X_train, y_train)
            mse_cv += np.mean((m.predict(X_test) - y_test) ** 2)
        m.fit(X, y)

        rule_count = sum(np.abs(m.coef_) > coef_zero_threshold)
        if rule_count > max_rules:
            break
        nonzero_rule_coefs_count.append(rule_count)
        mse_cv_scores.append(mse_cv / cv)

    best_alpha = alphas[np.argmin(mse_cv_scores)]
    lscv = Lasso(alpha=best_alpha, random_state=random_state, max_iter=2000)
    lscv.fit(X, y)

    coefs = list(lscv.coef_[:-len(rules)])
    support = np.sum(X[:, -len(rules):], axis=0) / X.shape[0]

    nonzero_rules = []
    for r, w, s in zip(rules, lscv.coef_[-len(rules):], support):
        if abs(w) > coef_zero_threshold:
            nonzero_rules.append(Rule(r, args=[w], support=s))
            coefs.append(w)

    return nonzero_rules, coefs, lscv.intercept_
def score_precision_recall(X, y,
                           rules: List[List[str]],
                           samples: List[List[int]],
                           features: List[List[int]],
                           feature_names: List[str],
                           oob: bool = True) -> List[Rule]:
    scored_rules = []

    for curr_rules, curr_samples, curr_features in zip(rules, samples, features):

        # Create mask for OOB samples
        mask = ~indices_to_mask(curr_samples, X.shape[0])

        if sum(mask) == 0:
            if oob:
                warn(
                    "OOB evaluation not possible: doing it in-bag. Performance evaluation is likely to be wrong"
                    " (overfitting) and selected rules are likely to not perform well! Please use max_samples < 1."
                )
            mask = curr_samples

        # XXX todo: idem without dataframe
        X_oob = pd.DataFrame(
            (X[mask, :])[:, curr_features],
            columns=np.array(feature_names)[curr_features]
        )

        if X_oob.shape[1] <= 1:  # otherwise pandas bug (cf. issue #16363)
            return []

        y_oob = y[mask]
        y_oob = np.array((y_oob != 0))

        # Add OOB performances to rules:
        scored_rules += [
            Rule(r, args=_eval_rule_perf(r, X_oob, y_oob))
            for r in set(curr_rules)
        ]

    return scored_rules
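def _oob_mask_sketch():
    # Minimal standalone sketch, not part of the original module: the OOB mask used
    # above is the complement of the bootstrap (in-bag) sample indices. Toy data only;
    # the explicit boolean mask below mirrors what indices_to_mask computes.
    import numpy as np
    n_samples = 5
    in_bag = np.array([0, 0, 2, 3])            # bootstrap draws (with repetition)
    in_bag_mask = np.zeros(n_samples, dtype=bool)
    in_bag_mask[in_bag] = True
    oob_mask = ~in_bag_mask                    # samples never drawn in-bag
    assert list(np.where(oob_mask)[0]) == [1, 4]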
def score_lasso(X, y, rules: List[str], Cs, cv, random_state) -> Tuple[List[Rule], LassoCV]:
    if Cs is None:
        n_alphas = 100
        alphas = None
    elif hasattr(Cs, "__len__"):
        n_alphas = None
        alphas = 1. / Cs
    else:
        n_alphas = Cs
        alphas = None

    lscv = LassoCV(n_alphas=n_alphas, alphas=alphas, cv=cv, random_state=random_state)
    lscv.fit(X, y)

    rules = [
        Rule(r, args=[w])
        for r, w in zip(rules, lscv.coef_[-len(rules):])
    ]
    return rules, lscv
def prune_mins(rules: List[Rule], precision_min: float, recall_min: float) -> List[Rule]:
    # Factorize rules before semantic tree filtering
    rules_ = [tuple(rule) for rule in rules]
    rules_dict = {}

    # keep only rules verifying precision_min and recall_min:
    for rule, score in rules_:
        if score[0] >= precision_min and score[1] >= recall_min:
            if rule in rules_dict:
                # update the score to the new mean
                c = rules_dict[rule][2] + 1
                b = rules_dict[rule][1] + 1. / c * (score[1] - rules_dict[rule][1])
                a = rules_dict[rule][0] + 1. / c * (score[0] - rules_dict[rule][0])
                rules_dict[rule] = (a, b, c)
            else:
                rules_dict[rule] = (score[0], score[1], 1)

    rule_tuple_list = sorted(rules_dict.items(),
                             key=lambda x: (x[1][0], x[1][1]),
                             reverse=True)
    return [Rule(rule, args=scores) for rule, scores in rule_tuple_list]
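def _example_prune_mins_usage():
    # Illustrative sketch only, not part of the original module. Duplicate rule
    # strings are merged with a running mean of (precision, recall); rules failing
    # either threshold are dropped. The numbers below are made up.
    rules = [Rule('a > 0', (0.9, 0.4, 0)),
             Rule('a > 0', (0.7, 0.6, 0)),    # duplicate of the first rule
             Rule('b <= 1', (0.3, 0.9, 0))]   # precision below the 0.5 threshold
    kept = prune_mins(rules, precision_min=0.5, recall_min=0.2)
    # Only 'a > 0' survives, with its two scores averaged to roughly (0.8, 0.5, 2).
    assert len(kept) == 1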
def fit(self, X, y, feature_names=None, sample_weight=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples,)
        Target vector relative to X. Has to follow the convention 0 for
        normal data, 1 for anomalies.

    feature_names : list of str, optional
        String names for each feature. If None, placeholder names are
        generated.

    sample_weight : array-like, shape (n_samples,), optional
        Array of weights that are assigned to individual samples, typically
        the amount in case of transactions data. Used to grow regression
        trees producing further rules to be tested. If not provided, then
        each sample is given unit weight.

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_X_y(X, y)
    check_classification_targets(y)
    self.n_features_ = X.shape[1]
    self.classes_ = unique_labels(y)

    self.feature_dict_ = get_feature_dict(X.shape[1], feature_names)
    self.feature_placeholders = list(self.feature_dict_.keys())
    self.feature_names = list(self.feature_dict_.values())

    n_train = y.shape[0]
    w = np.ones(n_train) / n_train
    self.estimators_ = []
    self.estimator_weights_ = []
    self.estimator_errors_ = []
    self.feature_names = feature_names

    for _ in range(self.n_estimators):
        # Fit a classifier with the current sample weights
        clf = self.estimator()
        clf.fit(X, y, sample_weight=w)  # uses w as the sampling weight!
        preds = clf.predict(X)

        # Indicator of misclassified samples
        miss = preds != y

        # Map the indicator to +1 / -1 for the weight update
        miss2 = np.ones(miss.size)
        miss2[~miss] = -1

        # Weighted error of this estimator
        err_m = np.dot(w, miss) / sum(w)
        if err_m < 1e-3:
            return self

        # Estimator weight (AdaBoost alpha)
        alpha_m = 0.5 * np.log((1 - err_m) / float(err_m))

        # New sample weights
        w = np.multiply(w, np.exp([float(x) * alpha_m for x in miss2]))

        self.estimators_.append(deepcopy(clf))
        self.estimator_weights_.append(alpha_m)
        self.estimator_errors_.append(err_m)

    rules = []
    for est, est_weight in zip(self.estimators_, self.estimator_weights_):
        if type(est) == DecisionTreeClassifier:
            est_rules_values = tree_to_rules(est, self.feature_placeholders,
                                             prediction_values=True)
            est_rules = list(map(lambda x: x[0], est_rules_values))

            # BRS scores are difference between class 1 % and class 0 % in a node
            est_values = np.array(list(map(lambda x: x[1], est_rules_values)))
            rule_scores = (est_values[:, 1] - est_values[:, 0]) / est_values.sum(axis=1)

            compos_score = est_weight * rule_scores
            rules += [Rule(r, args=[w]) for (r, w) in zip(est_rules, compos_score)]

        if type(est) == SlipperClassifier:
            # SLIPPER uses a uniform confidence over the observations covered by the rule
            est_rule = dict_to_rule(est.rule, est.feature_dict)
            rules += [Rule(est_rule, args=[est_weight])]

    self.rules_without_feature_names_ = rules
    self.rules_ = [
        replace_feature_name(rule, self.feature_dict_)
        for rule in self.rules_without_feature_names_
    ]
    self.complexity_ = self._get_complexity()
    return self
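def _adaboost_update_sketch():
    # Minimal standalone sketch, not part of the original source, of the boosting
    # update used in `fit` above: estimator weight alpha_m = 0.5*ln((1 - err)/err),
    # with sample weights multiplied by exp(+alpha_m) on mistakes and exp(-alpha_m)
    # on correct predictions. Toy numbers only.
    import numpy as np
    w = np.full(4, 0.25)                          # uniform initial sample weights
    miss = np.array([True, False, False, False])  # one of four samples misclassified
    err_m = np.dot(w, miss) / np.sum(w)           # weighted error = 0.25
    alpha_m = 0.5 * np.log((1 - err_m) / err_m)   # estimator weight, roughly 0.549
    signs = np.where(miss, 1.0, -1.0)
    w_new = w * np.exp(signs * alpha_m)           # misclassified sample gets upweighted
    assert w_new[0] > w_new[1]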
def fit(self, X, y, feature_names: list = None, undiscretized_features=[], verbose=False):
    """Fit rule lists to data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data

    y : array_like, shape = [n_samples]
        Labels

    feature_names : array_like, shape = [n_features], optional (default: [])
        String labels for each feature.
        If empty and X is a DataFrame, column labels are used.
        If empty and X is not a DataFrame, then features are simply enumerated

    undiscretized_features : array_like, shape = [n_features], optional (default: [])
        String labels for each feature which is NOT to be discretized.
        If empty, all numeric features are discretized

    verbose : bool
        Currently doesn't do anything

    Returns
    -------
    self : returns an instance of self.
    """
    self.seed()

    if len(set(y)) != 2:
        raise Exception("Only binary classification is supported at this time!")
    X, y = check_X_y(X, y)
    check_classification_targets(y)
    self.n_features_in_ = X.shape[1]
    self.classes_ = unique_labels(y)

    self.feature_dict_ = get_feature_dict(X.shape[1], feature_names)
    self.feature_placeholders = np.array(list(self.feature_dict_.keys()))
    self.feature_names = np.array(list(self.feature_dict_.values()))

    itemsets, self.discretizer = extract_fpgrowth(
        X, y,
        feature_names=self.feature_placeholders,
        minsupport=self.minsupport,
        maxcardinality=self.maxcardinality,
        undiscretized_features=undiscretized_features,
        disc_strategy=self.disc_strategy,
        disc_kwargs=self.disc_kwargs,
        verbose=verbose)

    X_df_onehot = self.discretizer.transform(X)

    # Now form the data-vs.-lhs set
    # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
    for c in X_df_onehot.columns:
        X_df_onehot[c] = [c if x == 1 else '' for x in list(X_df_onehot[c])]

    X = [{}] * (len(itemsets) + 1)
    X[0] = set(range(len(X_df_onehot)))  # the default rule satisfies all data
    for (j, lhs) in enumerate(itemsets):
        X[j + 1] = set([i for (i, xi) in enumerate(X_df_onehot.values)
                        if set(lhs).issubset(xi)])

    # now form lhs_len
    lhs_len = [0]
    for lhs in itemsets:
        lhs_len.append(len(lhs))
    nruleslen = Counter(lhs_len)
    lhs_len = np.array(lhs_len)
    itemsets_all = ['null']
    itemsets_all.extend(itemsets)

    Xtrain, Ytrain, nruleslen, lhs_len, self.itemsets = (
        X,
        np.vstack((1 - np.array(y), y)).T.astype(int),
        nruleslen,
        lhs_len,
        itemsets_all)

    permsdic = defaultdict(default_permsdic)  # We will store here the MCMC results

    # Do MCMC
    res, Rhat = run_bdl_multichain_serial(
        self.max_iter, self.thinning, self.alpha, self.listlengthprior,
        self.listwidthprior, Xtrain, Ytrain, nruleslen, lhs_len,
        self.maxcardinality, permsdic, self.burnin, self.n_chains,
        [None] * self.n_chains, verbose=self.verbose, seed=self.random_state)

    # Merge the chains
    permsdic = merge_chains(res)

    # The point estimate, BRL-point
    self.d_star = get_point_estimate(
        permsdic, lhs_len, Xtrain, Ytrain, self.alpha, nruleslen,
        self.maxcardinality, self.listlengthprior, self.listwidthprior,
        verbose=self.verbose)

    if self.d_star:
        # Compute the rule consequent
        self.theta, self.ci_theta = get_rule_rhs(
            Xtrain, Ytrain, self.d_star, self.alpha, True)

    self.final_itemsets = np.array(self.itemsets, dtype=object)[self.d_star]
    rule_strs = itemsets_to_rules(self.final_itemsets)
    self.rules_without_feature_names_ = [Rule(r) for r in rule_strs]
    self.rules_ = [
        replace_feature_name(rule, self.feature_dict_)
        for rule in self.rules_without_feature_names_
    ]

    self.complexity_ = self._get_complexity()
    return self
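def _itemset_coverage_sketch():
    # Minimal standalone sketch, not part of the original source, of the
    # "data-vs.-lhs" construction in `fit` above: for each itemset (rule LHS),
    # collect the indices of the one-hot rows that contain every item in it.
    # The item strings below are made up.
    rows = [{'age>30', 'sex=F'}, {'age>30'}, {'sex=F'}]   # toy one-hot rows as item sets
    itemsets = [('age>30',), ('age>30', 'sex=F')]
    coverage = [set(range(len(rows)))]                    # the null/default rule covers every row
    for lhs in itemsets:
        coverage.append({i for i, xi in enumerate(rows) if set(lhs).issubset(xi)})
    assert coverage[1] == {0, 1} and coverage[2] == {0}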