def test_remove_feature():
    X, y = make_classification(n_samples=1000,
                               n_features=100,
                               n_informative=2,
                               n_redundant=0,
                               n_repeated=0,
                               n_classes=2,
                               n_clusters_per_class=1,
                               weights=None,
                               class_sep=1.0,
                               hypercube=True,
                               scale=2.0,
                               shuffle=True,
                               random_state=0)
    nb = CustomNaiveBayes(encode_data=True)
    nb.fit(X, y)
    nb.remove_feature(0)
    independent = nb.indepent_term_
    smoothed_log_counts_ = nb.smoothed_log_counts_
    removed = nb.predict_proba(np.delete(X, 0, axis=1))

    nb.fit(np.delete(X, 0, axis=1), y)
    og = nb.predict_proba(np.delete(X, 0, axis=1))
    assert np.allclose(nb.smoothed_log_counts_, smoothed_log_counts_)
    assert np.allclose(nb.indepent_term_, independent)
    assert np.allclose(og, removed)
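
# Illustrative companion check (a sketch, not part of the original suite): by
# symmetry with remove_feature, incrementally adding a column with
# add_features should match refitting on the extended data. The
# add_features(X_new, y) call follows the signature used by the wrappers
# below; the test body itself is an assumption.
def test_add_feature():
    X, y = make_classification(n_samples=1000, n_features=100, n_informative=2,
                               n_redundant=0, n_repeated=0, n_classes=2,
                               n_clusters_per_class=1, class_sep=1.0,
                               scale=2.0, shuffle=True, random_state=0)
    X_base = X[:, 1:]
    X_extra = X[:, 0].reshape(-1, 1)
    X_full = np.concatenate([X_base, X_extra], axis=1)

    nb = CustomNaiveBayes(encode_data=True)
    nb.fit(X_base, y)
    nb.add_features(X_extra, y)  # incremental update
    added = nb.predict_proba(X_full)

    nb.fit(X_full, y)  # full refit for comparison
    og = nb.predict_proba(X_full)
    assert np.allclose(og, added)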
class RankerLogicalFeatureConstructor(TransformerMixin, ClassifierMixin, BaseEstimator):
    """First proposal: Hybrid-Ranker Wrapper.

    Build a ranking based on the Symmetrical Uncertainty (SU) of every possible
    logical feature of depth 1 (1 operator, 2 operands), using the XOR, AND and
    OR operators. The steps are:

    - Find the combinations of values present in the database for every pair of
      features Xi, Xj:
        - Example:
            Xi = [1, 2, 3, 2]
            Xj = ['a', 'b', 'c', 'a']
            Possible combinations: [(1,'a'), (2,'b'), (3,'c'), (2,'a')]
    - Apply each operator to every combination:
        - Example:
            Xi = [1, 2, 3, 2]
            Xj = ['a', 'b', 'c', 'a']
            Possible combinations:
                [(1,'a','AND'), (2,'b','AND'), (3,'c','AND'), (2,'a','AND'),
                 (1,'a','OR'),  (2,'b','OR'),  (3,'c','OR'),  (2,'a','OR'),
                 (1,'a','XOR'), (2,'b','XOR'), (3,'c','XOR'), (2,'a','XOR')]
    - Add the original variables to the list.
    - Evaluate the SU of every feature in the list and rank them.
    - Traverse the list following one of the two proposed strategies, evaluating
      each candidate subset with leave-one-out cross-validation of the
      NaiveBayes classifier.

    Parameters
    ----------
    strategy : str {"eager", "skip"}
        With the eager strategy, once the ranking is built we stop considering
        attributes as soon as there is no improvement from one iteration to the
        next; with skip, non-improving blocks are skipped and the traversal
        continues down the ranking.

    block_size : int, default=1
        Number of features added in each iteration.

    encode_data : bool
        Whether or not to encode the received data. If set to False the
        classifier expects the data to be encoded with an ordinal encoder.

    n_intervals : int, default=5
        Number of intervals used by the ordinal encoder to discretize
        continuous features.

    verbose : {bool, int}
        If set to True, displays information on the remaining time and
        internal variables.

    operators : array-like, default=("XOR", "AND", "OR")
        Operators used for the constructed features.

    max_features : int, default=inf
        Maximum number of features to include in the selected subset.

    max_iterations : int, default=inf
        Maximum number of iterations in the wrapper step.

    metric : str, default="accuracy"
        Metric used by the internal NaiveBayes classifier for evaluation.

    max_err : int, default=0
        Number of consecutive non-improving iterations tolerated before the
        eager strategy stops.

    prune : int or None, default=None
        If set, only the constructors built from the `prune` best feature
        pairs (ranked by the SU of the pair with the class) are considered.

    use_graph : bool, default=False
        Generate the ranking from the features obtained from the pruned graph
        of the ACO algorithm. (Experimentation not carried out.)

    use_initials : bool, default=False
        Force the set of initial features into the final solution. The set is
        trimmed with a backward elimination beforehand.

    Attributes
    ----------
    feature_encoder_ : CustomOrdinalFeatureEncoder or None
        Encodes the data ordinally, with handling of unseen values, if
        encode_data is set to True.

    class_encoder_ : LabelEncoder or None
        Encodes the class ordinally if encode_data is set to True.

    all_feature_constructors : array-like
        List of FeatureConstructor objects with all the possible logical
        features.

    symmetrical_uncertainty_rank : array-like
        SU for every feature in all_feature_constructors.

    rank : array-like
        Array of indices corresponding to the sorted SU rank (in descending
        order).

    final_feature_constructors : array-like
        Selected feature subset (list of constructors).

    classifier : NaiveBayes
        Classifier used in the wrapper and to perform predictions after
        fitting.
""" def __init__(self, strategy="eager", block_size=10, encode_data=True, n_intervals=5, verbose=0, operators=("AND", "OR", "XOR"), max_features=float("inf"), max_iterations=float("inf"), metric="accuracy", use_initials=False, max_err=0, prune=None, use_graph=False): self.strategy = strategy self.block_size = max(block_size, 1) self.encode_data = encode_data self.verbose = verbose self.operators = operators self.max_features = max_features self.max_iterations = max_iterations self.n_intervals = n_intervals self.metric = metric self.max_err = max_err self.use_initials = use_initials self.prune = prune self.use_graph = use_graph allowed_strategies = ("eager", "skip") if self.strategy not in allowed_strategies: raise ValueError("Unknown operator type: %s, expected one of %s." % (self.strategy, allowed_strategies)) def fit(self, X, y): # Parse input if isinstance(y, pd.DataFrame): y = y.to_numpy() if self.encode_data: self.feature_encoder_ = CustomOrdinalFeatureEncoder( n_intervals=self.n_intervals) self.class_encoder_ = CustomLabelEncoder() X = self.feature_encoder_.fit_transform(X) y = self.class_encoder_.fit_transform(y) if isinstance(X, pd.DataFrame): X = X.to_numpy() check_X_y(X, y) # Reset the stored results for new fit self.reset_evaluation() # Generate rank if self.use_graph: # Construct the minimum graph and create rank graph = AntFeatureGraphMI(seed=None, connections=1).compute_graph( X, y, ("AND", "OR", "XOR")) self.all_feature_constructors = graph.get_rank() elif self.prune is not None: # Construct the rank with pruning by selecting pais that maximise SU(X_iX_j,Y) feature_combinations = list( combinations(list(range(X.shape[1])), 2)) + [(i, i) for i in range(X.shape[1])] rank_pairs = [ symmetrical_uncertainty_two_variables(X[:, i], X[:, j], y) for i, j in feature_combinations ] rank_pairs_index = np.argsort(rank_pairs)[::-1] # Create the unsorted list self.all_feature_constructors = [] for index in rank_pairs_index[:self.prune]: i, j = feature_combinations[index] if i == j: from tfg.feature_construction import create_feature self.all_feature_constructors.extend([ create_feature("OR", [(i, n), (i, m)]) for n, m in combinations(np.unique(X[:, i]), 2) ]) else: self.all_feature_constructors.extend( construct_features(X[:, [i, j]], operators=self.operators, same_feature=False)) else: # Create the unsorted list of all features self.all_feature_constructors = construct_features( X, operators=self.operators) if self.verbose: print( f"Total number of constructed features: {len(self.all_feature_constructors)}" ) self.all_feature_constructors.extend( [DummyFeatureConstructor(j) for j in range(X.shape[1])]) self.symmetrical_uncertainty_rank = [] # Sort the ranking for feature_constructor in self.all_feature_constructors: feature = feature_constructor.transform(X) su = symmetrical_uncertainty(f1=feature.flatten(), f2=y) self.symmetrical_uncertainty_rank.append(su) # Store the descending order index self.rank = np.argsort(self.symmetrical_uncertainty_rank)[::-1] # If the initial variables are if self.use_initials: classifier = NaiveBayes(encode_data=False, n_intervals=self.n_intervals, metric=self.metric) classifier.fit(X, y) current_features = [ DummyFeatureConstructor(j) for j in range(X.shape[1]) ] # Store the backward result to reuse it for other executions self.initial_backward_features = backward_search( X, y, current_features, classifier) # Feature Subset Selection (FSS) from the rank self.filter_features(X, y) return self def predict(self, X): X, _ = self.transform(X) if 
    def reset_evaluation(self):
        # Reset the memoized evaluations
        self.evaluate_leave_one_out_cross_val = memoize(evaluate_leave_one_out)

    def predict_proba(self, X):
        X, _ = self.transform(X)
        return self.classifier.predict_proba(X)

    def score(self, X, y):
        X, y = self.transform(X, y)
        return self.classifier.score(X, y)

    def filter_features(self, X, y):
        '''Greedy wrapper search over the rank built in fit.'''
        check_is_fitted(self)
        self.classifier = NaiveBayes(encode_data=False,
                                     n_intervals=self.n_intervals,
                                     metric=self.metric)
        current_score = -np.inf
        current_features = []
        current_data = None
        if self.use_initials:
            # The original features have already been taken into account
            rank_iter = filter(
                lambda x: not isinstance(self.all_feature_constructors[x],
                                         DummyFeatureConstructor),
                iter(self.rank))
            # Deep copy to avoid issues when modifying the list
            current_features = deepcopy(self.initial_backward_features)
            current_data = np.concatenate([f.transform(X) for f in current_features], axis=1)
            # Get the initial leave-one-out score
            current_score = self.evaluate_leave_one_out_cross_val(
                self.classifier, current_features, current_data, y, fit=True)
        else:
            # Iterator over the sorted list of indices
            rank_iter = iter(self.rank)

        if self.verbose:
            progress_bar = tqdm(total=len(self.rank),
                                bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')
        iteration = 0
        iterations_without_improvements = 0

        # Main loop, including block_size elements at a time. Note that
        # rank_iter is also advanced inside the loop body, so the for loop
        # does not step through the rank one element per iteration.
        for feature_constructor_index in rank_iter:
            iteration += 1
            if self.verbose:
                progress_bar.set_postfix({
                    "n_features": len(current_features),
                    "score": current_score
                })
                progress_bar.update(1)
                progress_bar.refresh()

            # Gather up to block_size features
            new_X = [self.all_feature_constructors[feature_constructor_index].transform(X)]
            selected_features = [self.all_feature_constructors[feature_constructor_index]]
            for _ in range(self.block_size - 1):
                try:
                    index = next(rank_iter)
                    selected_features.append(self.all_feature_constructors[index])
                    new_X.append(self.all_feature_constructors[index].transform(X))
                    if self.verbose:
                        progress_bar.update(1)
                        progress_bar.refresh()
                except StopIteration:
                    # The block size does not divide the number of elements in
                    # the rank; the ranking is exhausted, so the search stops.
                    break

            # Evaluate the candidate block
            new_X = np.concatenate(new_X, axis=1)
            if iteration == 1 and not self.use_initials:
                current_data = new_X
                current_score = self.evaluate_leave_one_out_cross_val(
                    self.classifier, selected_features, current_data, y, fit=True)
                current_features = selected_features
                if self.max_iterations <= iteration or \
                        (len(current_features) + self.block_size) > self.max_features:
                    break
                continue

            data = np.concatenate([current_data, new_X], axis=1)
            self.classifier.add_features(new_X, y)
            # Leave-one-out evaluation
            score = self.evaluate_leave_one_out_cross_val(
                self.classifier, current_features + selected_features, data, y, fit=False)
            if score > current_score:
                current_score = score
                current_data = data
                current_features.extend(selected_features)
                iterations_without_improvements = 0
            else:
                iterations_without_improvements += 1
                # Remove the last added block from the classifier
                for feature_index_to_remove in range(data.shape[1],
                                                     data.shape[1] - new_X.shape[1],
                                                     -1):
                    self.classifier.remove_feature(feature_index_to_remove - 1)
                if self.strategy == "eager" and \
                        self.max_err < iterations_without_improvements:
                    # Stop as soon as there is no improvement
                    break
            if self.max_iterations <= iteration or \
                    (len(current_features) + self.block_size) > self.max_features:
                break

        if self.verbose:
            progress_bar.close()
            print(f"\nFinal number of included features: {len(current_features)} "
                  f"- Final Score: {current_score}")
        self.final_feature_constructors = current_features
        return self

    def transform(self, X, y=None):
        check_is_fitted(self)
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
            if y is not None:
                y = self.class_encoder_.transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        new_X = []
        for feature_constructor in self.final_feature_constructors:
            new_X.append(feature_constructor.transform(X))
        return np.concatenate(new_X, axis=1), y
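
# Minimal usage sketch for the ranker-wrapper (illustrative only: the dataset
# and hyperparameters below are assumptions, not taken from the original code).
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    X_demo, y_demo = make_classification(n_samples=500, n_features=10,
                                         n_informative=4, random_state=0)
    ranker = RankerLogicalFeatureConstructor(strategy="eager", block_size=5,
                                             encode_data=True, verbose=0)
    ranker.fit(X_demo, y_demo)  # builds the SU rank, then runs the wrapper
    X_selected, _ = ranker.transform(X_demo)  # data in the selected feature space
    print("Selected constructors:", len(ranker.final_feature_constructors))
    print("Training accuracy:", ranker.score(X_demo, y_demo))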
class PazzaniWrapperNB(PazzaniWrapper):
    '''Optimized version of Pazzani's wrapper for the Naive Bayes classifier.

    Uses leave-one-out cross-validation and incremental updates of the
    classifier (adding, deleting and joining features).
    '''

    def __init__(self, seed=None, strategy="BSEJ", verbose=0):
        super().__init__(seed=seed, strategy=strategy, verbose=verbose, cv=None)

    def _generate_neighbors_bsej(self, current_columns, X):
        # Yields: (updated column list, columns to remove, columns to add, delete?)
        if X.shape[1] > 1:
            for column_to_drop in range(X.shape[1]):
                new_columns = current_columns.copy()
                del new_columns[column_to_drop]
                yield new_columns, column_to_drop, None, True
        for features in combinations(np.arange(X.shape[1]), 2):
            new_col_name = flatten([current_columns[features[0]],
                                    current_columns[features[1]]])
            new_columns = current_columns.copy()
            new_columns.append(tuple(new_col_name))
            columns_to_drop = sorted(features, reverse=True)
            del new_columns[columns_to_drop[0]]
            del new_columns[columns_to_drop[1]]
            combined_columns = combine_columns(X, list(features))
            yield new_columns, list(columns_to_drop), combined_columns, False

    def fit_bsej(self, X, y):
        self.evaluate = memoize(_evaluate, attribute_to_cache="columns")
        current_best = X.copy()
        current_columns = deque(range(X.shape[1]))
        best_score = self.evaluate(self.classifier, current_best, y,
                                   columns=current_columns, fit=True)
        stop = False
        while not stop:
            update = False
            stop = True
            if self.verbose:
                print("Current Best: ", current_columns, " Score: ", best_score)
            for new_columns, columns_to_delete, columns_to_add, delete in \
                    self._generate_neighbors_bsej(current_columns, current_best):
                if delete:
                    action = "DELETE"
                    # Update the classifier and get the validation result
                    self.classifier.remove_feature(columns_to_delete)
                    neighbor = np.delete(current_best, columns_to_delete, axis=1)
                    score = self.evaluate(self.classifier, neighbor, y,
                                          columns=new_columns, fit=False)
                    # Restore the column for the next iteration
                    self.classifier.add_features(
                        current_best[:, columns_to_delete].reshape(-1, 1), y,
                        index=[columns_to_delete])
                else:
                    action = "ADD"
                    self.classifier.add_features(columns_to_add, y)
                    self.classifier.remove_feature(columns_to_delete[0])
                    self.classifier.remove_feature(columns_to_delete[1])
                    neighbor = np.delete(current_best, columns_to_delete, axis=1)
                    neighbor = np.concatenate([neighbor, columns_to_add], axis=1)
                    score = self.evaluate(self.classifier, neighbor, y,
                                          columns=new_columns, fit=False)
                    # Restore the classifier for the next iteration
                    if self.classifier.n_features_ == 1:
                        # columns_to_delete is reversed to preserve insertion order
                        self.classifier.add_features(current_best[:, columns_to_delete], y)
                        self.classifier.remove_feature(0)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)
                        # columns_to_delete is reversed to preserve insertion order
                        self.classifier.add_features(current_best[:, columns_to_delete], y,
                                                     index=columns_to_delete)
                if self.verbose == 2:
                    print("\tNeighbor: ", new_columns, " Score: ", score)
                if score > best_score:
                    stop = False
                    best_columns = new_columns
                    best_action = action
                    best_score = score
                    best_columns_to_delete = columns_to_delete
                    update = True
                    if best_action == "ADD":
                        best_columns_to_add = columns_to_add
                    if score == 1.0:
                        stop = True
                        break
            if update:
                current_columns = best_columns
                if best_action == "DELETE":
                    current_best = np.delete(current_best, best_columns_to_delete, axis=1)
                    # Update the incremental classifier
                    self.classifier.remove_feature(best_columns_to_delete)
                else:
                    current_best = np.delete(current_best, best_columns_to_delete, axis=1)
                    current_best = np.concatenate([current_best, best_columns_to_add], axis=1)
                    # Update the incremental classifier
                    self.classifier.add_features(best_columns_to_add, y)
                    self.classifier.remove_feature(best_columns_to_delete[0])
                    self.classifier.remove_feature(best_columns_to_delete[1])
        if self.verbose:
            print("Final best: ", list(current_columns), " Score: ", best_score)
        self.features_ = current_columns
        self.feature_transformer = lambda X: join_columns(X, columns=self.features_)
        self.classifier.fit(self.feature_transformer(X), y)
        return self

    def _generate_neighbors_fssj(self, current_columns, individual,
                                 original_data, available_columns):
        # Yields: (new columns, new available columns, column to delete,
        #          column to add, delete?)
        if available_columns:
            for index, col in enumerate(available_columns):
                new_columns = current_columns.copy()
                new_columns.append(col)
                new_available_columns = available_columns.copy()
                del new_available_columns[index]
                column_to_add = original_data[:, col].reshape(-1, 1)
                column_to_delete = None
                yield new_columns, new_available_columns, column_to_delete, column_to_add, False
        if individual is not None and individual.shape[1] > 0 and available_columns:
            for features_index in product(np.arange(len(available_columns)),
                                          np.arange(len(current_columns))):
                features = (available_columns[features_index[0]],
                            current_columns[features_index[1]])
                new_col_name = flatten([features[0], features[1]])
                new_available_columns = available_columns.copy()
                del new_available_columns[features_index[0]]
                new_columns = current_columns.copy()
                new_columns.append(tuple(new_col_name))
                del new_columns[features_index[1]]
                separated_columns = np.concatenate([
                    original_data[:, features[0]].reshape(-1, 1),
                    individual[:, features_index[1]].reshape(-1, 1)
                ], axis=1)
                if isinstance(features[1], tuple):
                    features = list(features)
                    features[1] = list(features[1])
                    features = tuple(features)
                column_to_delete = features_index[1]
                combined_columns = combine_columns(separated_columns)
                column_to_add = combined_columns
                yield new_columns, new_available_columns, column_to_delete, column_to_add, True

    def fit_fssj(self, X, y):
        self.evaluate = memoize(_evaluate, attribute_to_cache="columns")
        current_best = None
        current_columns = deque()
        available_columns = list(range(X.shape[1]))
        best_score = -float("inf")
        stop = False
        while not stop:
            update = False
            stop = True
            if self.verbose:
                print("Current Best: ", current_columns, " Score: ", best_score,
                      "Available columns: ", available_columns)
            for new_columns, new_available_columns, column_to_delete, column_to_add, delete in \
                    self._generate_neighbors_fssj(current_columns=current_columns,
                                                  individual=current_best,
                                                  original_data=X,
                                                  available_columns=available_columns):
                if delete:
                    action = "JOIN"
                    # Update the classifier and get the validation result
                    self.classifier.add_features(column_to_add, y)
                    self.classifier.remove_feature(column_to_delete)
                    neighbor = np.concatenate([current_best, column_to_add], axis=1)
                    neighbor = np.delete(neighbor, column_to_delete, axis=1)
                    score = self.evaluate(self.classifier, neighbor, y,
                                          columns=new_columns, fit=False)
                    # Restore the column for the next iteration
                    if neighbor.shape[1] == 1:
                        self.classifier.fit(current_best, y)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)
                        self.classifier.add_features(
                            current_best[:, column_to_delete].reshape(-1, 1), y,
                            index=[column_to_delete])
                else:
                    action = "ADD"
                    if current_best is None:
                        neighbor = column_to_add
                        self.classifier.fit(neighbor, y)
                    else:
                        neighbor = np.concatenate([current_best, column_to_add], axis=1)
                        self.classifier.add_features(column_to_add, y)
                    score = self.evaluate(self.classifier, neighbor, y,
                                          columns=new_columns, fit=False)
                    # Restore the classifier for the next iteration
                    if current_best is None:
                        self.classifier = NaiveBayes(encode_data=True)
                    else:
                        self.classifier.remove_feature(neighbor.shape[1] - 1)
                if self.verbose == 2:
                    print("\tNeighbor: ", new_columns, " Score: ", score,
                          "Available columns: ", new_available_columns)
                if score > best_score:
                    stop = False
                    best_columns = new_columns
                    best_available_columns = new_available_columns
                    best_action = action
                    best_score = score
                    best_column_to_delete = column_to_delete
                    best_column_to_add = column_to_add
                    update = True
                    if score == 1.0:
                        stop = True
                        break
            if update:
                current_columns = best_columns
                available_columns = best_available_columns
                if best_action == "JOIN":
                    self.classifier.add_features(best_column_to_add, y)
                    self.classifier.remove_feature(best_column_to_delete)
                    current_best = np.concatenate([current_best, best_column_to_add], axis=1)
                    current_best = np.delete(current_best, best_column_to_delete, axis=1)
                else:
                    if current_best is None:
                        current_best = best_column_to_add
                        self.classifier.fit(current_best, y)
                    else:
                        current_best = np.concatenate([current_best, best_column_to_add], axis=1)
                        self.classifier.add_features(best_column_to_add, y)
        if self.verbose:
            print("Final best: ", list(current_columns), " Score: ", best_score)
        self.features_ = current_columns
        self.feature_transformer = lambda X: join_columns(X, columns=self.features_)
        self.classifier.fit(self.feature_transformer(X), y)
        return self

    def evaluate(self, classifier, X, y, fit=True, columns=None):
        # Pass the arguments through (the original hardcoded fit=True and
        # columns=None, silently ignoring the parameters it received).
        return _evaluate(classifier, X, y, fit=fit, columns=columns)
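
# Minimal usage sketch for the Pazzani wrapper (illustrative only: the entry
# points fit_bsej/fit_fssj and the attributes used below come from the class
# above; the dataset and parameter values are assumptions).
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    X_demo, y_demo = make_classification(n_samples=300, n_features=8,
                                         n_informative=3, random_state=0)
    wrapper = PazzaniWrapperNB(seed=0, strategy="BSEJ", verbose=1)
    wrapper.fit_bsej(X_demo, y_demo)  # backward elimination with joining (BSEJ)
    X_joined = wrapper.feature_transformer(X_demo)  # X with joined columns
    print("Final columns:", list(wrapper.features_))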