def test_custom_ordinal_time_comparison(X=None, iterations=10, verbose=1):
    """Compare the mean round-trip time of the two ordinal encoders.

    In every iteration each encoder is fitted on ``X``, used to transform
    ``X`` and then asked to invert that transformation. The elapsed time of
    the whole round trip (difference of two ``time()`` readings) is stored
    per encoder and averaged at the end.

    Parameters
    ----------
    X : array-like or None, default=None
        Data to encode. When it is ``None`` (or empty) a small 3x2 array of
        strings is used instead.
    iterations : int, default=10
        Number of timed round trips performed with each encoder.
    verbose : {boolean,int}, default=1
        If truthy the two mean times are printed.

    Returns
    -------
    custom_encoder_time : float
        Mean round-trip time measured for ``CustomOrdinalFeatureEncoder``.
    ordinal_encoder_time : float
        Mean round-trip time measured for ``OrdinalEncoder``.
    """
    # `not X` is ambiguous for multi-element NumPy arrays (it raises a
    # ValueError), so the "no data supplied" case is tested explicitly.
    if X is None or len(X) == 0:
        X = np.array([
            ["P", "+"],
            ["P2", "-"],
            ["P3", "-"],
        ])

    custom_encoder = CustomOrdinalFeatureEncoder()
    ordinal_encoder = OrdinalEncoder()
    ordinal_encoder_time = []
    custom_encoder_time = []

    for _ in range(iterations):
        # Round trip with the custom encoder
        ts = time()
        custom_encoder.fit(X)
        transformed = custom_encoder.transform(X)
        custom_encoder.inverse_transform(transformed)
        custom_encoder_time.append(time() - ts)

        # Round trip with the reference encoder
        ts = time()
        ordinal_encoder.fit(X)
        transformed = ordinal_encoder.transform(X)
        ordinal_encoder.inverse_transform(transformed)
        ordinal_encoder_time.append(time() - ts)

    custom_encoder_time = np.mean(custom_encoder_time)
    ordinal_encoder_time = np.mean(ordinal_encoder_time)
    if verbose:
        print(f"CustomEncoder -> Time: {custom_encoder_time}")
        print(f"OrdinalEncoder -> Time: {ordinal_encoder_time}")
    return custom_encoder_time, ordinal_encoder_time
def acfs_score_comparison(datasets,
                          seed,
                          base_path,
                          params,
                          n_splits=3,
                          n_repeats=5,
                          n_intervals=5,
                          metric="accuracy",
                          send_email=False,
                          email_data=None,
                          verbose=True):
    """Compare ACFCS against NaiveBayes over several datasets and configurations.

    For every dataset a repeated stratified k-fold is run. In each split the
    data is encoded, a NaiveBayes baseline is fitted and scored, and the ACFCS
    classifier is fitted and scored once per configuration in ``params``.
    The per-split measurements are averaged per (dataset, configuration).

    Parameters
    ----------
    datasets : iterable of (name, label) pairs
        Datasets to evaluate. ``base_path + name`` must exist, otherwise the
        dataset is reported and skipped. The files ``{name}.data.csv`` and
        ``{name}.test.csv`` are handed to ``get_X_y_from_database``.
    seed : int
        ``random_state`` of the ``RepeatedStratifiedKFold`` splitter.
    base_path : str
        Prefix that is concatenated with the dataset name.
    params : sequence of dict
        Configurations; each one is applied with ``acfcs.set_params(**conf)``.
    n_splits : int, default=3
        Number of folds.
    n_repeats : int, default=5
        Number of repetitions of the k-fold.
    n_intervals : int, default=5
        Passed to ``CustomOrdinalFeatureEncoder`` and ``NaiveBayes``.
    metric : str, default="accuracy"
        Passed to both classifiers.
    send_email : bool, default=False
        If true the resulting frame is handed to ``tfg.utils.send_results``.
    email_data : dict or None, default=None
        Forwarded to ``send_results``; ``None`` is treated as an empty dict.
    verbose : bool, default=True
        If true a progress bar is shown for the splits.

    Returns
    -------
    result : pandas.DataFrame
        One row per (dataset, configuration) with the averaged metrics.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if email_data is None:
        email_data = {}

    # List to store results and column names for the csv
    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "ACFCS Score", "ACFCS Score STD", "Configuration", "Nodes",
        "Contruction Matrix", "Selection Matrix", "Selected_attributes",
        "Original"
    ]
    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifiers
    acfcs = ACFCS(verbose=0, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue

        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)

        # Update progressbar
        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results:
        # one row per configuration, one column per split.
        shape = (len(params), n_splits * n_repeats)
        nb_score = np.zeros(shape=shape)
        acfcs_score = np.zeros(shape=shape)
        acfcs_selection_matrix = np.zeros(shape=shape)
        acfcs_construction_matrix = np.zeros(shape=shape)
        acfcs_nodes = np.zeros(shape=shape)
        acfcs_dummy = np.zeros(shape=shape)
        acfcs_selected = np.zeros(shape=shape)

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits * n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'
                         ) if verbose else rskf.split(X, y)

        # Execute experiments
        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers reusing info to speed up evaluation
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)
            acfcs.reset_cache()
            for conf_index, conf in enumerate(params):
                acfcs.set_params(**conf)
                # The graph is only initialised for the first configuration
                # of each split.
                acfcs.fit(X_train, y_train, init_graph=conf_index == 0)

                # score
                acfcs_score_conf = acfcs.score(X_test, y_test)
                if verbose:
                    seed_tqdm.set_postfix({
                        "config": conf_index,
                        "nb_score": naive_bayes_score,
                        "ant_score": acfcs_score_conf
                    })

                # Get data
                n_original_features = sum(
                    isinstance(feature, DummyFeatureConstructor)
                    for feature in acfcs.best_features)
                n_selected = len(acfcs.best_features)
                selection_matrix = len(acfcs.afg.pheromone_selection)
                construction_matrix = len(acfcs.afg.pheromone_construction)
                nodes = len(acfcs.afg.nodes)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                acfcs_score[conf_index, i] = acfcs_score_conf
                acfcs_selection_matrix[conf_index, i] = selection_matrix
                acfcs_construction_matrix[conf_index, i] = construction_matrix
                acfcs_nodes[conf_index, i] = nodes
                acfcs_dummy[conf_index, i] = n_original_features
                acfcs_selected[conf_index, i] = n_selected

        # Insert the final result - averaged metrics for this database.
        for conf_index, conf in enumerate(params):
            row = [
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(acfcs_score[conf_index]),
                np.std(acfcs_score[conf_index]), conf,
                np.mean(acfcs_nodes[conf_index]),
                np.mean(acfcs_construction_matrix[conf_index]),
                np.mean(acfcs_selection_matrix[conf_index]),
                np.mean(acfcs_selected[conf_index]),
                np.mean(acfcs_dummy[conf_index])
            ]
            result.append(row)

    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results("ACFCS", email_data, result)
    return result
class RankerLogicalFeatureConstructor(TransformerMixin, ClassifierMixin, BaseEstimator):
    """First proposal: Hybrid-Ranker Wrapper.

    Build a ranking based on Symmetrical Uncertainty (SU) of every possible
    logical feature of depth 1 (1 operator, 2 operands), using XOR, AND and
    OR operator.

    The steps are:
        - Find out combinations of values in database of every pair of
          features Xi, Xj:
            - Example:
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a'),(2,'b'),(3,'c'),(2,'a')]
        - Apply operator to every combination:
            - Example:
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a','AND'),(2,'b','AND'),(3,'c','AND'),(2,'a','AND'),
                    (1,'a','OR'),(2,'b','OR'),(3,'c','OR'),(2,'a','OR'),
                    (1,'a','XOR'),(2,'b','XOR'),(3,'c','XOR'),(2,'a','XOR')]
        - Add original variables to the list
        - Evaluate SU for every value in the list, and rank them
        - Go over the list following one of the two strategies proposed and
          evaluate the subset based on a leave-one-out cross-validation with
          the NaiveBayes classifier.

    Parameters
    ----------
    strategy : str {eager,skip}, default="eager"
        After the ranking is built, if the eager strategy is chosen we stop
        considering attributes once more than ``max_err`` consecutive
        iterations bring no improvement.

    block_size : int, default=10
        Number of features that are added in each iteration. Values below 1
        are replaced by 1 in the constructor.

    encode_data : boolean, default=True
        Whether or not to encode the received data. If set to false the
        classifier expects data to be encoded with an ordinal encoder.

    n_intervals : int, default=5
        Passed to the feature encoder and to the NaiveBayes classifier.

    verbose : {boolean,int}, default=0
        If set to true it displays information of the remaining time and
        inside variables.

    operators : array-like, default = ("AND","OR","XOR")
        Operators used for the constructed features.

    max_features : int, default = inf
        Maximum number of features to include in the selected subset

    max_iterations : int, default = inf
        Maximum number of iterations in the wrapper step.

    metric : str, default="accuracy"
        Metric handed to the NaiveBayes classifier used by the wrapper.

    use_initials : bool, default = False
        Force the set of initial features in the final solution. The set is
        trimmed with a backward elimination beforehand.

    max_err : int, default=0
        Number of consecutive non-improving iterations tolerated by the
        eager strategy before the search stops.

    prune : int or None, default=None
        If not None, only the ``prune`` feature pairs with the highest
        SU(XiXj, Y) are used to construct features.

    use_graph : bool, default = False
        Generate Ranking from features obtained from the pruned-graph of the
        ACO algorithm. (Experimentation not carried out)

    Attributes
    ----------
    feature_encoder_ : CustomOrdinalFeatureEncoder
        Encodes data in ordinal way with unseen values handling. Only set if
        encode_data is set to True.

    class_encoder_ : CustomLabelEncoder
        Encodes Data in ordinal way for the class. Only set if encode_data is
        set to True.

    all_feature_constructors: array-like
        List of FeatureConstructor objects with all the possible logical
        features, followed by one DummyFeatureConstructor per original
        feature.

    symmetrical_uncertainty_rank: array-like
        SU for every feature in all_feature_constructors

    rank : array-like
        Array of indexes corresponding to the sorted SU rank (in descending
        order).

    initial_backward_features : list
        Result of the backward search over the original features. Only set
        if use_initials is True.

    evaluate_leave_one_out_cross_val : callable
        Memoized version of ``evaluate_leave_one_out``; renewed on each fit.

    final_feature_constructors:
        Selected feature subset (list of constructors)

    classifier: NaiveBayes
        Classifier used in the wrapper and to perform predictions after
        fitting.
    """

    def __init__(self,
                 strategy="eager",
                 block_size=10,
                 encode_data=True,
                 n_intervals=5,
                 verbose=0,
                 operators=("AND", "OR", "XOR"),
                 max_features=float("inf"),
                 max_iterations=float("inf"),
                 metric="accuracy",
                 use_initials=False,
                 max_err=0,
                 prune=None,
                 use_graph=False):
        self.strategy = strategy
        self.block_size = max(block_size, 1)
        self.encode_data = encode_data
        self.verbose = verbose
        self.operators = operators
        self.max_features = max_features
        self.max_iterations = max_iterations
        self.n_intervals = n_intervals
        self.metric = metric
        self.max_err = max_err
        self.use_initials = use_initials
        self.prune = prune
        self.use_graph = use_graph
        allowed_strategies = ("eager", "skip")
        if self.strategy not in allowed_strategies:
            raise ValueError("Unknown strategy type: %s, expected one of %s." %
                             (self.strategy, allowed_strategies))

    def fit(self, X, y):
        """Build the SU ranking and run the wrapper search on it.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data; it is encoded first if encode_data is True.

        y : array-like of shape (n_samples,)
            Class labels; encoded first if encode_data is True.

        Returns
        -------
        self : object
        """
        # Parse input
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)

        # Reset the stored results for new fit
        self.reset_evaluation()

        # Generate rank
        if self.use_graph:
            # Construct the minimum graph and create rank
            graph = AntFeatureGraphMI(seed=None, connections=1).compute_graph(
                X, y, ("AND", "OR", "XOR"))
            self.all_feature_constructors = graph.get_rank()
        elif self.prune is not None:
            from tfg.feature_construction import create_feature

            # Construct the rank with pruning by selecting pairs that
            # maximise SU(X_iX_j,Y). Pairs (i, i) stand for a single feature.
            n_columns = X.shape[1]
            feature_combinations = list(combinations(
                list(range(n_columns)), 2)) + [(i, i) for i in range(n_columns)]
            rank_pairs = [
                symmetrical_uncertainty_two_variables(X[:, i], X[:, j], y)
                for i, j in feature_combinations
            ]
            rank_pairs_index = np.argsort(rank_pairs)[::-1]

            # Create the unsorted list
            self.all_feature_constructors = []
            for index in rank_pairs_index[:self.prune]:
                i, j = feature_combinations[index]
                if i == j:
                    # Same feature: only OR between two of its values
                    self.all_feature_constructors.extend([
                        create_feature("OR", [(i, n), (i, m)])
                        for n, m in combinations(np.unique(X[:, i]), 2)
                    ])
                else:
                    self.all_feature_constructors.extend(
                        construct_features(X[:, [i, j]],
                                           operators=self.operators,
                                           same_feature=False))
        else:
            # Create the unsorted list of all features
            self.all_feature_constructors = construct_features(
                X, operators=self.operators)

        if self.verbose:
            print(
                f"Total number of constructed features: {len(self.all_feature_constructors)}"
            )

        # The original features take part in the ranking as well
        self.all_feature_constructors.extend(
            [DummyFeatureConstructor(j) for j in range(X.shape[1])])

        # Compute the SU of every candidate against the class
        self.symmetrical_uncertainty_rank = []
        for feature_constructor in self.all_feature_constructors:
            feature = feature_constructor.transform(X)
            su = symmetrical_uncertainty(f1=feature.flatten(), f2=y)
            self.symmetrical_uncertainty_rank.append(su)

        # Store the descending order index
        self.rank = np.argsort(self.symmetrical_uncertainty_rank)[::-1]

        # If the initial variables are forced into the solution, trim them
        # first with a backward search.
        if self.use_initials:
            classifier = NaiveBayes(encode_data=False,
                                    n_intervals=self.n_intervals,
                                    metric=self.metric)
            classifier.fit(X, y)
            current_features = [
                DummyFeatureConstructor(j) for j in range(X.shape[1])
            ]
            # Store the backward result to reuse it for other executions
            self.initial_backward_features = backward_search(
                X, y, current_features, classifier)

        # Feature Subset Selection (FSS) from the rank
        self.filter_features(X, y)
        return self

    def predict(self, X):
        """Predict labels for X using the selected features.

        Labels are decoded back with the class encoder when encode_data is
        True.
        """
        X, _ = self.transform(X)
        if self.encode_data:
            return self.class_encoder_.inverse_transform(
                self.classifier.predict(X))
        return self.classifier.predict(X)

    def reset_evaluation(self):
        """Discard the memoized leave-one-out evaluations."""
        # Reset the memoize evaluations
        self.evaluate_leave_one_out_cross_val = memoize(evaluate_leave_one_out)

    def predict_proba(self, X):
        """Return the classifier's class probabilities for the transformed X."""
        X, _ = self.transform(X)
        return self.classifier.predict_proba(X)

    def score(self, X, y):
        """Transform (X, y) and return the classifier's score on them."""
        X, y = self.transform(X, y)
        return self.classifier.score(X, y)

    def filter_features(self, X, y):
        '''After the rank is built this perform the greedy wrapper search.

        ``X`` and ``y`` are used as given (no encoding is applied here).
        Features are taken from the rank in blocks of ``block_size`` and a
        block is kept only if it improves the leave-one-out score.
        The selection is stored in ``final_feature_constructors``.
        '''
        # The rank must exist. A bare check_is_fitted(self) would only look
        # for attributes with a trailing underscore, which are absent when
        # encode_data is False.
        check_is_fitted(self, "rank")
        self.classifier = NaiveBayes(encode_data=False,
                                     n_intervals=self.n_intervals,
                                     metric=self.metric)
        current_score = -np.inf
        current_features = []
        current_data = None
        if self.use_initials:
            # Original Features have already been taken into account
            rank_iter = filter(
                lambda x: not isinstance(self.all_feature_constructors[x],
                                         DummyFeatureConstructor),
                iter(self.rank))
            # Deep copy to avoid issues when modifying the list
            current_features = deepcopy(self.initial_backward_features)
            current_data = np.concatenate(
                [f.transform(X) for f in current_features], axis=1)
            # Get initial LOO score
            current_score = self.evaluate_leave_one_out_cross_val(
                self.classifier, current_features, current_data, y, fit=True)
        else:
            # Iterator over the sorted list of indexes
            rank_iter = iter(self.rank)

        if self.verbose:
            progress_bar = tqdm(total=len(self.rank),
                                bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        iteration = 0
        iterations_without_improvements = 0
        # Loop for including {block size} elements at a time
        # Rank is an iterator, so the for loop is not sequential!
        for feature_constructor_index in rank_iter:
            iteration += 1
            if self.verbose:
                progress_bar.set_postfix({
                    "n_features": len(current_features),
                    "score": current_score
                })
                progress_bar.update(1)
                progress_bar.refresh()

            # Add block size features
            new_X = [
                self.all_feature_constructors[feature_constructor_index].
                transform(X)
            ]
            selected_features = [
                self.all_feature_constructors[feature_constructor_index]
            ]
            for _ in range(self.block_size - 1):
                try:
                    index = next(rank_iter)
                except StopIteration:
                    # Block size does not divide the number of elements in
                    # the rank. The block is left incomplete.
                    break
                selected_features.append(self.all_feature_constructors[index])
                new_X.append(self.all_feature_constructors[index].transform(X))
                if self.verbose:
                    progress_bar.update(1)
                    progress_bar.refresh()

            # Evaluate features
            new_X = np.concatenate(new_X, axis=1)
            if iteration == 1 and not self.use_initials:
                # The first block is always accepted and fits the classifier
                current_data = new_X
                current_score = self.evaluate_leave_one_out_cross_val(
                    self.classifier,
                    selected_features,
                    current_data,
                    y,
                    fit=True)
                current_features = selected_features
                if self.max_iterations <= iteration or (
                        len(current_features) +
                        self.block_size) > self.max_features:
                    break
                continue

            data = np.concatenate([current_data, new_X], axis=1)
            self.classifier.add_features(new_X, y)
            # LOO evaluation
            score = self.evaluate_leave_one_out_cross_val(
                self.classifier,
                current_features + selected_features,
                data,
                y,
                fit=False)
            if score > current_score:
                current_score = score
                current_data = data
                current_features.extend(selected_features)
                iterations_without_improvements = 0
            else:
                iterations_without_improvements += 1
                # Remove last added block (from the last column backwards)
                for feature_index_to_remove in range(
                        data.shape[1], data.shape[1] - new_X.shape[1], -1):
                    self.classifier.remove_feature(feature_index_to_remove - 1)
                if self.strategy == "eager" and self.max_err < iterations_without_improvements:
                    # Stops as soon as no improvement
                    break

            if self.max_iterations <= iteration or (
                    len(current_features) +
                    self.block_size) > self.max_features:
                break

        if self.verbose:
            progress_bar.close()
            print(
                f"\nFinal number of included features: {len(current_features)} - Final Score: {current_score}"
            )
        self.final_feature_constructors = current_features
        return self

    def transform(self, X, y=None):
        """Build the selected features for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to transform; encoded first if encode_data is True.

        y : array-like or None, default=None
            Optional labels; encoded if encode_data is True.

        Returns
        -------
        new_X : ndarray
            Column-wise concatenation of the output of every selected
            feature constructor.

        y : array-like or None
            The (possibly encoded) labels.
        """
        # Trailing-underscore attributes are absent when encode_data is
        # False, so the attribute set by the wrapper search is checked.
        check_is_fitted(self, "final_feature_constructors")
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
            if y is not None:
                y = self.class_encoder_.transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()

        new_X = [
            feature_constructor.transform(X)
            for feature_constructor in self.final_feature_constructors
        ]
        return np.concatenate(new_X, axis=1), y
def ranker_score_comparison(datasets,
                            seed,
                            base_path,
                            params,
                            n_splits=3,
                            n_repeats=5,
                            n_intervals=5,
                            metric="accuracy",
                            send_email=False,
                            email_data=None,
                            share_rank=True):
    """Compare RankerLogicalFeatureConstructor against NaiveBayes.

    For every dataset a repeated stratified k-fold is run. In each split the
    data is encoded, a NaiveBayes baseline is fitted and scored, and the
    ranker is evaluated once per configuration in ``params``. The per-split
    measurements are averaged per (dataset, configuration).

    Parameters
    ----------
    datasets : iterable of (name, label) pairs
        Datasets to evaluate. ``base_path + name`` must exist, otherwise the
        dataset is reported and skipped. The files ``{name}.data.csv`` and
        ``{name}.test.csv`` are handed to ``get_X_y_from_database``.
    seed : int
        ``random_state`` of the ``RepeatedStratifiedKFold`` splitter.
    base_path : str
        Prefix that is concatenated with the dataset name.
    params : sequence of dict
        Configurations; each one is applied with ``r.set_params(**conf)``.
    n_splits : int, default=3
        Number of folds.
    n_repeats : int, default=5
        Number of repetitions of the k-fold.
    n_intervals : int, default=5
        Passed to the encoder, the ranker and ``NaiveBayes``.
    metric : str, default="accuracy"
        Passed to both classifiers.
    send_email : bool, default=False
        If true the resulting frame is handed to ``tfg.utils.send_results``.
    email_data : dict or None, default=None
        Forwarded to ``send_results``; ``None`` is treated as an empty dict.
    share_rank : bool, default=True
        If true the rank is only computed for the first configuration of
        each split; the remaining ones just rerun ``filter_features`` on it.

    Returns
    -------
    result : pandas.DataFrame
        One row per (dataset, configuration) with the averaged metrics.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if email_data is None:
        email_data = {}

    result = []
    columns = ["Database",
               "Number of attributes",
               "NBScore",
               "NBScore STD",
               "Ranker Score",
               "Ranker Score STD",
               "Configuration",
               "Combinations",
               "Selected_attributes",
               "Original"]

    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifiers
    r = RankerLogicalFeatureConstructor(n_intervals=n_intervals, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue

        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)

        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results:
        # one row per configuration, one column per split.
        shape = (len(params), n_splits * n_repeats)
        nb_score = np.zeros(shape=shape)
        r_score = np.zeros(shape=shape)
        r_combinations = np.zeros(shape=shape)
        r_selected = np.zeros(shape=shape)
        r_dummy = np.zeros(shape=shape)

        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits * n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')
        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers
            nb.fit(X=X_train, y=y_train)
            naive_bayes_score = nb.score(X_test, y_test)
            for conf_index, conf in enumerate(params):
                seed_tqdm.set_postfix({"config": conf_index})
                r.set_params(**conf)

                # Fit
                if conf_index == 0 or not share_rank:
                    # The rank is computed from scratch
                    r.fit(X_train, y_train)
                else:
                    # Reuse the rank: only the wrapper search is repeated
                    r.filter_features(
                        r.feature_encoder_.transform(X_train),
                        r.class_encoder_.transform(y_train))

                # score
                ranker_score = r.score(X_test, y_test)

                # Get data
                n_original_features = sum(
                    isinstance(feature, DummyFeatureConstructor)
                    for feature in r.final_feature_constructors)
                n_combinations = len(r.all_feature_constructors)
                n_selected = len(r.final_feature_constructors)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                r_score[conf_index, i] = ranker_score
                r_combinations[conf_index, i] = n_combinations
                r_selected[conf_index, i] = n_selected
                r_dummy[conf_index, i] = n_original_features

        # Insert to final result averaged metrics for this dataset
        for conf_index, conf in enumerate(params):
            row = [name,
                   X.shape[1],
                   np.mean(nb_score[conf_index]),
                   np.std(nb_score[conf_index]),
                   np.mean(r_score[conf_index]),
                   np.std(r_score[conf_index]),
                   conf,
                   np.mean(r_combinations[conf_index]),
                   np.mean(r_selected[conf_index]),
                   np.mean(r_dummy[conf_index])]
            result.append(row)

    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results("RANKER", email_data, result)
    return result
def genetic_score_comparison(datasets,
                             seed,
                             base_path,
                             params,
                             n_splits=3,
                             n_repeats=5,
                             n_intervals=5,
                             metric="accuracy",
                             send_email=False,
                             email_data=None,
                             verbose=True,
                             version=1):
    """Compare a genetic-programming classifier against NaiveBayes.

    For every dataset a repeated stratified k-fold is run. In each split the
    data is encoded, a NaiveBayes baseline is fitted and scored, and the
    genetic classifier is fitted and scored once per configuration in
    ``params``. The per-split measurements are averaged per
    (dataset, configuration).

    Parameters
    ----------
    datasets : iterable of (name, label) pairs
        Datasets to evaluate. ``base_path + name`` must exist, otherwise the
        dataset is reported and skipped. The files ``{name}.data.csv`` and
        ``{name}.test.csv`` are handed to ``get_X_y_from_database``.
    seed : int
        Seed of the genetic classifier and ``random_state`` of the
        ``RepeatedStratifiedKFold`` splitter.
    base_path : str
        Prefix that is concatenated with the dataset name.
    params : sequence of dict
        Configurations; each one is applied with ``clf.set_params(**conf)``.
    n_splits : int, default=3
        Number of folds.
    n_repeats : int, default=5
        Number of repetitions of the k-fold.
    n_intervals : int, default=5
        Passed to the encoder and to ``NaiveBayes``.
    metric : str, default="accuracy"
        Passed to both classifiers.
    send_email : bool, default=False
        If true the resulting frame is handed to ``tfg.utils.send_results``.
    email_data : dict or None, default=None
        Forwarded to ``send_results``; ``None`` is treated as an empty dict.
    verbose : bool, default=True
        If true a progress bar is shown for the splits.
    version : int, default=1
        1 or 2 select ``GeneticProgrammingFlexibleLogic``; any other value
        selects ``GeneticProgrammingRankMutation``.

    Returns
    -------
    result : pandas.DataFrame
        One row per (dataset, configuration) with the averaged metrics.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if email_data is None:
        email_data = {}

    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "Genetic Score", "Genetic Score STD", "Configuration",
        "Selected_attributes", "Original"
    ]

    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifier
    if version in (1, 2):
        # Version 1 (no flexibility in the number of attributes, bad
        # performance) is no longer instantiated; both versions use the
        # flexible implementation.
        clf = GeneticProgrammingFlexibleLogic(seed=seed, metric=metric)
    else:
        # Guided mutation based on SU
        clf = GeneticProgrammingRankMutation(seed=seed, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue

        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)

        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results:
        # one row per configuration, one column per split.
        shape = (len(params), n_splits * n_repeats)
        nb_score = np.zeros(shape=shape)
        clf_score = np.zeros(shape=shape)
        clf_selected = np.zeros(shape=shape)
        clf_dummy = np.zeros(shape=shape)

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits * n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'
                         ) if verbose else rskf.split(X, y)

        # Execute experiments
        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers reusing info to speed up evaluation
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            # Reset evaluation-cache for new split
            clf.reset_evaluation()
            for conf_index, conf in enumerate(params):
                if verbose:
                    seed_tqdm.set_postfix({"config": conf_index})
                clf.set_params(**conf)
                clf.fit(X_train, y_train)

                # score
                genetic_score = clf.score(X_test, y_test)

                # Get data
                n_original_features = sum(
                    isinstance(feature, DummyFeatureConstructor)
                    for feature in clf.best_features)
                n_selected = len(clf.best_features)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                clf_score[conf_index, i] = genetic_score
                clf_selected[conf_index, i] = n_selected
                clf_dummy[conf_index, i] = n_original_features

        # Insert to final result averaged metrics for this database
        for conf_index, conf in enumerate(params):
            row = [
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(clf_score[conf_index]),
                np.std(clf_score[conf_index]), conf,
                np.mean(clf_selected[conf_index]),
                np.mean(clf_dummy[conf_index])
            ]
            result.append(row)

    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results(f"GENETIC_{version}", email_data, result)
    return result
class NaiveBayes(ClassifierMixin, BaseEstimator):
    """A Naive Bayes classifier.

    Simple NaiveBayes classifier accepting non-encoded input, enhanced with
    numba using MAP to predict most likely class.

    Parameters
    ----------
    alpha : {float, array-like}, default=1.0
        Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
        If it is an array it is expected to have the same size as number of
        attributes

    encode_data : bool, default=True
        Encode data when data is not encoded by default with an
        OrdinalEncoder

    n_intervals : int or None, default=5
        Discretize numerical data using the specified number of intervals

    discretize : bool, default=True
        Discretize numerical data

    metric : str, default="accuracy"
        Name handed to ``get_scorer`` to obtain the scoring function used by
        ``score`` and ``leave_one_out_cross_val``. For "f1_score" the scorer
        is called with ``average="macro"`` and ``zero_division=0``.

    Attributes
    ----------
    feature_encoder_ : CustomOrdinalFeatureEncoder
        Encodes data in ordinal way with unseen values handling. Only set if
        encode_data is set to True.

    class_encoder_ : CustomLabelEncoder
        Encodes Data in ordinal way for the class. Only set if encode_data is
        set to True.

    scorer : callable
        Scoring function selected from ``metric``.

    n_samples_ : int
        Number of samples

    n_features_ : int
        Number of features

    n_classes_ : int
        Number of classes, computed as ``1 + max(y)`` (ordinal encoding is
        assumed).

    class_count_ : array-like of shape (n_classes_,)
        Array where `class_count_[i]` contains the count of the ith class
        value.

    class_log_count_ : array-like of shape (n_classes_,)
        Array where `class_log_count_[i]` contains the log count of the ith
        class value.

    class_count_smoothed_ : array-like of shape (n_classes_,)
        ``class_count_ + alpha``.

    class_log_count_smoothed_ : array-like of shape (n_classes_,)
        Logarithm of ``class_count_smoothed_``.

    smoothed_counts_ : list
        First table returned by ``_get_tables``; indexed as
        ``smoothed_counts_[feature][value][class]``.

    smoothed_log_counts_ : list
        Second table returned by ``_get_tables``; indexed like
        ``smoothed_counts_``.

    feature_values_count_per_element_ : array-like of shape (column_count,~)
        Array where `feature_values_count_per_element_[i]` is an array where
        `feature_values_count_per_element_[i][j]` contains the count of the
        jth value for the ith feature. Assuming ordinal encoding, some values
        might be equal to 0

    feature_values_count_ : array-like of shape (column_count,)
        Array where `feature_values_count_[i]` is an integer with the number
        of possible values for the ith feature.

    feature_unique_values_count_ : array-like of shape (column_count,)
        Array where `feature_unique_values_count_[i]` is an integer with the
        number of unique seen values for the ith feature at fitting time.
        This is needed to compute the smoothing.

    total_probability_ : array-like of shape (n_classes,)
        Smoothing factor to be applied to the prediction, obtained from
        ``compute_total_probability_``. ``remove_feature`` treats the
        contribution of feature j as
        ``log(class_count_ + alpha * feature_unique_values_count_[j])``.

    indepent_term_ : array-like of shape (n_classes,)
        Independent term computed at fitting time:
        ``class_log_count_smoothed_ - total_probability_``. It includes the
        smoothing factor to be applied to the prediction and the apriori
        probability.
    """

    def __init__(self,
                 alpha=1.0,
                 encode_data=True,
                 n_intervals=5,
                 discretize=True,
                 metric="accuracy"):
        self.alpha = alpha
        self.encode_data = encode_data
        self.n_intervals = n_intervals
        self.discretize = discretize
        self.metric = metric
        self._get_scorer()
        super().__init__()

    def _get_scorer(self):
        """Select the scoring function that corresponds to ``self.metric``."""
        self.scorer = get_scorer(self.metric)
        if self.metric == "f1_score":
            # Unseen values for target class may cause errors
            self.scorer = lambda y_true, y_pred: get_scorer(self.metric)(
                y_true=y_true,
                y_pred=y_pred,
                average="macro",
                zero_division=0)

    def set_params(self, **params):
        """Set the parameters and refresh the scorer.

        Returns
        -------
        self : object
            The estimator itself, as the base ``set_params`` does.
        """
        super().set_params(**params)
        self._get_scorer()
        return self

    def _compute_independent_terms(self):
        """Computes the terms that are independent of the prediction"""
        self.total_probability_ = compute_total_probability_(
            self.class_count_, self.feature_unique_values_count_, self.alpha)
        # scikit-learn uses feature_values_count_ instead of
        # feature_unique_values_count_ for this term.
        self.indepent_term_ = self.class_log_count_smoothed_ - self.total_probability_

    def _compute_class_counts(self, X: np.ndarray, y: np.ndarray):
        """Computes the counts for the priors"""
        self.n_classes_ = 1 + np.max(y)
        self.class_count_ = np.bincount(y)
        self.class_log_count_ = np.log(self.class_count_)
        self.class_count_smoothed_ = (self.class_count_ + self.alpha)
        self.class_log_count_smoothed_ = np.log(self.class_count_smoothed_)

    def _compute_feature_counts(self, X: np.ndarray, y: np.ndarray):
        """Computes the conditional smoothed counts for each feature"""
        tables = _get_tables(X, y, self.n_classes_, self.alpha)
        self.smoothed_counts_ = tables[0]
        self.smoothed_log_counts_ = tables[1]
        self.feature_values_count_ = tables[2]
        self.feature_values_count_per_element_ = tables[3]
        self.feature_unique_values_count_ = tables[4]

    def fit(self, X: np.ndarray, y: np.ndarray):
        """ Fits the classifier with training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
            Training array that must be encoded unless
            encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.

        Returns
        -------
        self : object
        """
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals, discretize=self.discretize)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)
        if X.dtype != int:
            X = X.astype(int)
        if y.dtype != int:
            y = y.astype(int)

        self.n_samples_, self.n_features_ = X.shape
        self._compute_class_counts(X, y)
        self._compute_feature_counts(X, y)
        self._compute_independent_terms()
        return self

    def predict(self, X: np.ndarray):
        """ Predicts the label of the samples based on the MAP.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
            Array that must be encoded unless
            encode_data is set to True

        Returns
        -------
        y : array-like of shape (n_samples)
            Predicted label for each sample.
        """
        check_is_fitted(self)
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        if X.dtype != int:
            X = X.astype(int)
        check_array(X)
        log_probabilities = _predict(X, self.smoothed_log_counts_,
                                     self.feature_values_count_, self.alpha)
        log_probabilities += self.indepent_term_
        output = np.argmax(log_probabilities, axis=1)
        if self.encode_data:
            output = self.class_encoder_.inverse_transform(output)
        return output

    def predict_proba(self, X: np.ndarray):
        """ Predicts the probability for each label of the samples based on the MAP.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
            Array that must be encoded unless
            encode_data is set to True

        Returns
        -------
        y : array-like of shape (n_samples, n_classes)
            Array where `y[i][j]` contains the MAP of the jth class for ith
            sample
        """
        check_is_fitted(self)
        # Same order as in `predict`: encode first, cast afterwards. Casting
        # raw (non-encoded) input to int before encoding can fail.
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        if X.dtype != int:
            X = X.astype(int)
        log_probabilities = _predict(X, self.smoothed_log_counts_,
                                     self.feature_values_count_, self.alpha)
        log_probabilities += self.indepent_term_
        # Normalise in log space
        log_prob_x = logsumexp(log_probabilities, axis=1)
        return np.exp(log_probabilities - np.atleast_2d(log_prob_x).T)

    def leave_one_out_cross_val(self, X, y, fit=True):
        """Efficient LOO computation

        Instead of refitting once per sample, the stored counts are
        corrected for the sample that is left out.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
            Array that must be encoded unless encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.

        fit : bool, default=True
            Whether the classifier is fitted on (X, y) first.

        Returns
        -------
        score : float
            Value of the scorer for the leave-one-out predictions.
        """
        if fit:
            self.fit(X, y)
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
            y = self.class_encoder_.transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        if X.dtype != int:
            X = X.astype(int)
        if y.dtype != int:
            # y is used to index the count arrays below
            y = y.astype(int)

        log_proba = np.zeros((X.shape[0], self.n_classes_))
        for i in range(X.shape[0]):
            example, label = X[i], y[i]
            # Prior without the left-out sample
            class_count_ = self.class_count_.copy()
            class_count_[label] -= 1
            log_proba[i] = np.log(class_count_ + self.alpha)
            for j in range(X.shape[1]):
                # Conditional counts without the left-out sample
                p = self.smoothed_log_counts_[j][example[j]].copy()
                p[label] = np.log(
                    np.max([
                        self.smoothed_counts_[j][example[j]][label] - 1,
                        self.alpha
                    ]))
                log_proba[i] += p
                if self.feature_values_count_per_element_[j][example[j]] == 1:
                    # The value is only seen in the left-out sample
                    update_value = np.log(class_count_ + (
                        self.feature_unique_values_count_[j] - 1) * self.alpha)
                else:
                    update_value = np.log(
                        class_count_ +
                        (self.feature_unique_values_count_[j]) * self.alpha)
                log_proba[i] -= np.where(update_value == -np.inf, 0,
                                         update_value)
        y_pred = np.argmax(log_proba, axis=1)
        return self.scorer(y, y_pred)

    def add_features(self, X, y, index=None):
        """Updates classifier with new features

        Parameters
        ----------
        X : array-like of shape (n_samples, n_new_features)
            Array with the new features, that must be encoded unless
            encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.

        index: {None,array-like of shape (X.shape[1])}
            Indicates where to insert each new feature, if it is None (or
            empty) they are all appended at the very end.

        Returns
        -------
        self : object
        """
        check_is_fitted(self)
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            # y should be the same than the one that was first fitted for
            # now ----> FUTURE IMPLEMENTATION
            y = self.class_encoder_.transform(y)
            X = self.feature_encoder_.add_features(X,
                                                   transform=True,
                                                   index=index)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)
        if X.dtype != int:
            X = X.astype(int)
        if y.dtype != int:
            y = y.astype(int)

        self.n_features_ += X.shape[1]
        tables = _get_tables(X, y, self.n_classes_, self.alpha)
        new_smoothed_counts = tables[0]
        new_smoothed_log_counts = tables[1]
        new_feature_value_counts = tables[2]
        new_feature_value_counts_per_element = tables[3]
        new_feature_unique_values_count_ = tables[4]
        new_feature_contribution = compute_total_probability_(
            self.class_count_, new_feature_unique_values_count_, self.alpha)

        # Explicit test: the truth value of an array index is ambiguous.
        if index is not None and len(index) > 0:
            # Insert following the ascending order of the target positions
            sort_index = np.argsort(index)
            index_with_column = list(enumerate(index))
            for i in sort_index:
                column, list_insert_index = index_with_column[i]
                self.feature_values_count_per_element_.insert(
                    list_insert_index,
                    new_feature_value_counts_per_element[column])
                self.feature_values_count_ = np.insert(
                    self.feature_values_count_, list_insert_index,
                    new_feature_value_counts[column])
                self.smoothed_counts_.insert(list_insert_index,
                                             new_smoothed_counts[column])
                self.smoothed_log_counts_.insert(
                    list_insert_index, new_smoothed_log_counts[column])
                self.feature_unique_values_count_ = np.insert(
                    self.feature_unique_values_count_, list_insert_index,
                    new_feature_unique_values_count_[column])
        else:
            self.feature_values_count_per_element_.extend(
                new_feature_value_counts_per_element)
            self.feature_values_count_ = np.concatenate(
                [self.feature_values_count_, new_feature_value_counts])
            self.smoothed_counts_.extend(new_smoothed_counts)
            self.smoothed_log_counts_.extend(new_smoothed_log_counts)
            self.feature_unique_values_count_ = np.concatenate([
                self.feature_unique_values_count_,
                new_feature_unique_values_count_
            ])

        # Update the terms that do not depend on the sample
        self.total_probability_ += new_feature_contribution
        self.indepent_term_ -= new_feature_contribution
        return self

    def remove_feature(self, index):
        """Updates classifier by removing one feature (index)

        Parameters
        ----------
        index : int
            Position of the feature to remove.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If the classifier has a single feature or the index is out of
            range.
        """
        check_is_fitted(self)
        if self.n_features_ <= 1:
            raise ValueError("Cannot remove only feature from classifier")
        if not 0 <= index < self.n_features_:
            raise ValueError(
                f"Feature index not valid, expected index between 0 and {self.n_features_}"
            )
        self.n_features_ -= 1

        # Undo the contribution of the feature to the independent terms
        feature_contribution = self.class_count_ + self.alpha * self.feature_unique_values_count_[
            index]
        feature_contribution = np.log(feature_contribution)
        self.total_probability_ -= feature_contribution
        self.indepent_term_ += feature_contribution

        self.feature_unique_values_count_ = np.delete(
            self.feature_unique_values_count_, index)
        self.feature_values_count_ = np.delete(self.feature_values_count_,
                                               index)
        del self.feature_values_count_per_element_[index]
        del self.smoothed_counts_[index]
        del self.smoothed_log_counts_[index]

        if self.encode_data:
            self.feature_encoder_.remove_feature(index)
        return self

    def score(self, X: np.ndarray, y: np.ndarray):
        """Computes the score of the predictions with the configured scorer

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
            Array that must be encoded unless
            encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.

        Returns
        -------
        score : float
            Value returned by the scorer selected through ``metric``
            (called as ``scorer(y, y_pred)``).
        """
        y_pred = self.predict(X)
        return self.scorer(y, y_pred)