def fit(self, X, y): self.feature_encoder_ = CustomOrdinalFeatureEncoder() self.class_encoder_ = CustomLabelEncoder() if isinstance(X, pd.DataFrame): self.categories_ = X.columns if self.encode_data: X = self.feature_encoder_.fit_transform(X) y = self.class_encoder_.fit_transform(y) classifier_ = NaiveBayes(encode_data=False, n_intervals=self.n_intervals, metric=self.metric) self.n_features = X.shape[1] if self.encode_data: self.unique_values = [ values.shape[0] for values in self.feature_encoder_.categories_ ] else: self.unique_values = [ np.unique(X[:, j]).shape[0] for j in range(X.shape[1]) ] random.seed(self.seed) np.random.seed(self.seed) self.size = np.ceil(np.sqrt(X.shape[1])) best_individual = self.execute_algorithm(X, y) self.best_features = best_individual self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric) self.classifier_.fit( np.concatenate( [feature.transform(X) for feature in self.best_features], axis=1), y) return self
def test_incremental_validation(X=None, y=None, iterations=10, verbose=1): if not X: X, y = make_classification(n_samples=500, n_features=1000, n_informative=20, n_redundant=1, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, class_sep=1, hypercube=False, scale=1.0, shuffle=True, random_state=0) X //= 10 # --> To be able to evaluate categoricalNB # classifiers nb_classifier = NaiveBayes(encode_data=True) nb_classifier_no_encoding = NaiveBayes(encode_data=False) custom_encoder = CustomOrdinalFeatureEncoder() cnb = CategoricalNB() # accumulators categorical_nb = [] custom_nb_val_1 = [] custom_nb_val_2 = [] custom_nb_val_3 = [] custom_nb_val_4 = [] for i in range(iterations): if verbose: print(f"Iteration {i}") ts = time() X2 = custom_encoder.fit_transform(X) ts = time() score_2 = nb_classifier.leave_one_out_cross_val(X, y) custom_nb_val_1.append(time() - ts) ts = time() score_4 = cross_leave_one_out(nb_classifier, X, y) custom_nb_val_3.append(time() - ts) ts = time() X2 = custom_encoder.fit_transform(X) score_5 = cross_leave_one_out(nb_classifier_no_encoding, X2, y) custom_nb_val_4.append(time() - ts) if i == 0: score_1 = score_2 scores = [score_1, score_2, score_4, score_5] assert all(score == scores[0] for score in scores) print("Categorical with scikit loo: ", np.mean(categorical_nb[1:])) print("Custom with scikit loo: ", np.mean(custom_nb_val_3[1:])) print("Custom with scikit loo (pre-encoding): ", np.mean(custom_nb_val_4[1:])) print("Custom with first incremental: ", np.mean(custom_nb_val_1[1:]))
def test_custom_ordinal_time_comparison(X=None, iterations=10, verbose=1): if not X: X = np.array([ ["P", "+"], ["P2", "-"], ["P3", "-"], ]) custom_encoder = CustomOrdinalFeatureEncoder() ordinal_encoder = OrdinalEncoder() ordinal_encoder_time = [] custom_encoder_time = [] for i in range(iterations): ts = time() custom_encoder.fit(X) transformed = custom_encoder.transform(X) custom_encoder.inverse_transform(transformed) custom_encoder_time.append(time() - ts) ts = time() ordinal_encoder.fit(X) transformed = ordinal_encoder.transform(X) ordinal_encoder.inverse_transform(transformed) ordinal_encoder_time.append(time() - ts) custom_encoder_time = np.mean(custom_encoder_time) ordinal_encoder_time = np.mean(ordinal_encoder_time) if verbose: print(f"CustomEncoder -> Time: {custom_encoder_time}") print(f"OrdinalEncoder -> Time: {ordinal_encoder_time}") return custom_encoder_time, ordinal_encoder_time
def fit(self, X: np.ndarray, y: np.ndarray): """ Fits the classifier with trainning data. Parameters ---------- X : array-like of shape (n_samples, n_features_) Training array that must be encoded unless encode_data is set to True y : array-like of shape (n_samples,) Label of the class associated to each sample. Returns ------- self : object """ if isinstance(y, pd.DataFrame): y = y.to_numpy() if self.encode_data: self.feature_encoder_ = CustomOrdinalFeatureEncoder( n_intervals=self.n_intervals, discretize=self.discretize) self.class_encoder_ = CustomLabelEncoder() X = self.feature_encoder_.fit_transform(X) y = self.class_encoder_.fit_transform(y) if isinstance(X, pd.DataFrame): X = X.to_numpy() check_X_y(X, y) if X.dtype != int: X = X.astype(int) if y.dtype != int: y = y.astype(int) self.n_samples_, self.n_features_ = X.shape self._compute_class_counts(X, y) self._compute_feature_counts(X, y) self._compute_independent_terms() return self
class GeneticProgrammingFlexibleLogic(OptimizationMixin, TransformerMixin, ClassifierMixin, BaseEstimator): """GeneticProgramming for Feature Construction and Selection. Parameters ---------- seed : int or None Seed to guarantee reproducibility individuals : int Number of individuals per population generations : int Number of generations mutation_probability : float Probability for each individual of being mutated select : {rank,proportionate} Selection strategy mutation : {simple,complex} Mutation strategy combine : {truncation,elitism} Population combination strategy n_intervals : int Number of intervals for the discretization of continous variables mixed : bool Mix heuristic and wrapper evaluation mixed_percentage : float Percentage of total iterations to do heuristic evaluation metric : {accuracy,f1-score} Target metric for the optimization process flexible_logic: bool Allow different individual sizes in the generation encode_data : bool, default=True Encode data when data is not encoded by default with an OrdinalEncoder verbose :int {0,1}, default = 1 Display process progress Attributes ---------- classifier_ : NaiveBayes Base classifier used for prediction best_features_ : array-lik of Feature Array of selected Feature used for transforming new data """ def simple_evaluate(self, individual, X, y): classifier_ = NaiveBayes(encode_data=False, metric=self.metric) return classifier_.leave_one_out_cross_val(transform_features( individual[0] + individual[1], X), y, fit=True) def simple_evaluate_heuristic(self, individual, X, y): return compute_sufs_non_incremental( features=[f.transform(X) for f in chain(*individual[:2])], y=y) def fitness(self, population, X, y): evaluation = [] for individual in population: evaluation.append((individual, self.evaluate(individual, X, y))) return evaluation def generate_population(self): population = [] for _ in range(self.individuals): individual = ([], [], set()) if self.flexible_logic: n_chromosomes = range(random.randint(1, self.size)) else: n_chromosomes = range(self.size) for _ in n_chromosomes: operand1_feature = random.randint(0, self.n_features - 1) operand2_feature = random.randint(0, self.n_features - 1) if operand1_feature == operand2_feature: op = 'OR' operand1_value = random.randint( 0, self.unique_values[operand1_feature] - 1) operand2_value = random.randint( 0, self.unique_values[operand1_feature] - 1) else: op = random.choice(('OR', 'XOR', 'AND')) operand1_value = random.randint( 0, self.unique_values[operand1_feature] - 1) operand2_value = random.randint( 0, self.unique_values[operand2_feature] - 1) operands = [] operands.append((operand1_feature, operand1_value)) operands.append((operand2_feature, operand2_value)) individual[1].append( create_feature(operator=op, operands=operands)) n_og_features = random.randint(0, self.n_features - 1) features = list(range(self.n_features)) for f in random.sample(features, n_og_features): individual[0].append(DummyFeatureConstructor(feature_index=f)) individual[2].add(f) population.append(individual) return population def mutate_complex(self, population, **kwargs): new_population = [] for individual in population: if random.random() < self.mutation_probability: chromosomes_index = [] if self.flexible_logic: if len(individual[1]) > 0: chromosomes_index = random.sample( list(range(len(individual[1]))), random.randint(1, len(individual[1]))) else: op = random.choice(('OR', 'XOR', 'AND')) operands = [] for _ in range(2): feature_index = random.randint( 0, self.n_features - 1) value = random.randint( 0, self.unique_values[feature_index] - 1) operands.append((feature_index, value)) individual[1].append( create_feature(operator=op, operands=operands)) new_population.append(individual) continue else: chromosomes_index = random.sample( list(range(len(individual[1]))), random.randint(1, len(individual[1]))) for i in range(len(chromosomes_index)): index = chromosomes_index[i] if not self.flexible_logic: feature = individual[1][index] feature.op = random.choice(('OR', 'XOR', 'AND')) for operand in feature.operands: operand.feature_index = random.randint( 0, self.n_features - 1) operand.value = random.randint( 0, self.unique_values[operand.feature_index] - 1) else: a = random.random() if a < 0.33: feature = individual[1][index] feature.op = random.choice(('OR', 'XOR', 'AND')) for operand in feature.operands: operand.feature_index = random.randint( 0, self.n_features - 1) operand.value = random.randint( 0, self.unique_values[operand.feature_index] - 1) elif a < 0.66: op = random.choice(('OR', 'XOR', 'AND')) operands = [] for _ in range(2): feature_index = random.randint( 0, self.n_features - 1) value = random.randint( 0, self.unique_values[feature_index] - 1) operands.append((feature_index, value)) individual[1].append( create_feature(operator=op, operands=operands)) else: del individual[1][index] chromosomes_index = [ j - 1 if j > index else j for j in chromosomes_index ] if random.random() < self.mutation_probability: a = random.random() og_features = individual[0] included_features = individual[2] if (a < 0.33 and len(og_features) < self.n_features ) or len(og_features) == 0: selected = random.choice( tuple( set(list(range(0, self.n_features))) - included_features)) included_features.add(selected) og_features.append(DummyFeatureConstructor(selected)) elif a < 0.66 and len(og_features) < self.n_features and len( og_features) > 0: selected = random.choice( tuple( set(list(range(0, self.n_features))) - included_features)) index = random.randint(0, len(og_features) - 1) feature = og_features[index].feature_index og_features[index] = DummyFeatureConstructor(selected) included_features.remove(feature) included_features.add(selected) else: index = random.randint(0, len(og_features) - 1) feature = og_features[index].feature_index del og_features[index] included_features.remove(feature) if len(individual[0]) == 0 and len(individual[1]) == 0: og_features = individual[0] included_features = individual[2] selected = random.choice( tuple( set(list(range(0, self.n_features))) - included_features)) included_features.add(selected) og_features.append(DummyFeatureConstructor(selected)) new_population.append(individual) return new_population def mutate_simple(self, population, **kwargs): new_population = [] for individual in population: if random.random() < self.mutation_probability: chromosomes_index = [] if self.flexible_logic: if len(individual[1]) > 0: chromosomes_index = random.sample( list(range(len(individual[1]))), random.randint(1, len(individual[1]))) else: op = random.choice(('OR', 'XOR', 'AND')) operands = [] for _ in range(2): feature_index = random.randint( 0, self.n_features - 1) value = random.randint( 0, self.unique_values[feature_index] - 1) operands.append((feature_index, value)) individual[1].append( create_feature(operator=op, operands=operands)) new_population.append(individual) continue else: chromosomes_index = random.sample( list(range(len(individual[1]))), random.randint(1, len(individual[1]))) for i in range(len(chromosomes_index)): index = chromosomes_index[i] feature = individual[1][index] if not self.flexible_logic: feature.op = random.choice(('OR', 'XOR', 'AND')) for operand in feature.operands: operand.feature_index = random.randint( 0, self.n_features - 1) operand.value = random.randint( 0, self.unique_values[operand.feature_index] - 1) else: a = random.random() if a < 0.33: b = random.random() if b < 0.2: # Change operatior feature.op = random.choice( ('OR', 'XOR', 'AND')) elif b < 0.4: # Change full operand operand = feature.operands[0] operand.value = random.randint( 0, self.unique_values[operand.feature_index] - 1) elif b < 0.6: # Change full operand operand = feature.operands[1] operand.value = random.randint( 0, self.unique_values[operand.feature_index] - 1) elif b < 0.8: # Change value operand = feature.operands[0] operand.value = random.randint( 0, self.unique_values[operand.feature_index] - 1) else: # Change value operand = feature.operands[1] operand.value = random.randint( 0, self.unique_values[operand.feature_index] - 1) elif a < 0.66: # Add feature op = random.choice(('OR', 'XOR', 'AND')) operands = [] for _ in range(2): feature_index = random.randint( 0, self.n_features - 1) value = random.randint( 0, self.unique_values[feature_index] - 1) operands.append((feature_index, value)) individual[1].append( create_feature(operator=op, operands=operands)) else: # Remove feature del individual[1][index] chromosomes_index = [ j - 1 if j > index else j for j in chromosomes_index ] if random.random() < self.mutation_probability: a = random.random() og_features = individual[0] included_features = individual[2] if (a < 0.33 and len(og_features) < self.n_features ) or len(og_features) == 0: selected = random.choice( tuple( set(list(range(0, self.n_features))) - included_features)) included_features.add(selected) og_features.append(DummyFeatureConstructor(selected)) elif a < 0.66 and len(og_features) < self.n_features and len( og_features) > 0: selected = random.choice( tuple( set(list(range(0, self.n_features))) - included_features)) index = random.randint(0, len(og_features) - 1) feature = og_features[index].feature_index og_features[index] = DummyFeatureConstructor(selected) included_features.remove(feature) included_features.add(selected) else: index = random.randint(0, len(og_features) - 1) feature = og_features[index].feature_index del og_features[index] included_features.remove(feature) if len(individual[0]) == 0 and len(individual[1]) == 0: og_features = individual[0] included_features = individual[2] selected = random.choice( tuple( set(list(range(0, self.n_features))) - included_features)) included_features.add(selected) og_features.append(DummyFeatureConstructor(selected)) new_population.append(individual) return new_population def elitism(self, population1, population2): maximum = max(population1, key=lambda x: x[1]) minimum_index = min(enumerate(population2), key=lambda x: x[1][1])[0] population2[minimum_index] = maximum return population2 def truncation(self, population1, population2): return sorted(population1 + population2, reverse=True, key=lambda x: x[1])[:len(population1)] def select_population(self, population): selected_individuals = [] num_selected = len(population) totalFitness = sum(fitness for _, fitness in population) for _ in range(num_selected): cumulative_prob = 0.0 r = random.random() for individual_with_fitness in population: cumulative_prob += individual_with_fitness[1] / totalFitness if r <= cumulative_prob: selected_individuals.append( self.copy_individual(individual_with_fitness[0])) break return selected_individuals def select_population_rank(self, population): selected_individuals = [] num_selected = len(population) totalRank = (num_selected * (num_selected + 1)) / 2 population.sort(reverse=True, key=lambda x: x[1]) for _ in range(num_selected): cumulative_prob = 0.0 r = random.random() for i, individual_with_fitness in enumerate(population, start=1): cumulative_prob += (num_selected - i + 1) / totalRank if r <= cumulative_prob: selected_individuals.append( self.copy_individual(individual_with_fitness[0])) break return selected_individuals def copy_individual(self, individual): return ([chrms.copy() for chrms in individual[0]], [chrms.copy() for chrms in individual[1]], individual[2].copy()) def fit(self, X, y): self.feature_encoder_ = CustomOrdinalFeatureEncoder() self.class_encoder_ = CustomLabelEncoder() if isinstance(X, pd.DataFrame): self.categories_ = X.columns if self.encode_data: X = self.feature_encoder_.fit_transform(X) y = self.class_encoder_.fit_transform(y) classifier_ = NaiveBayes(encode_data=False, n_intervals=self.n_intervals, metric=self.metric) self.n_features = X.shape[1] if self.encode_data: self.unique_values = [ values.shape[0] for values in self.feature_encoder_.categories_ ] else: self.unique_values = [ np.unique(X[:, j]).shape[0] for j in range(X.shape[1]) ] random.seed(self.seed) np.random.seed(self.seed) self.size = np.ceil(np.sqrt(X.shape[1])) best_individual = self.execute_algorithm(X, y) self.best_features = best_individual self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric) self.classifier_.fit( np.concatenate( [feature.transform(X) for feature in self.best_features], axis=1), y) return self def execute_algorithm(self, X, y): if self.mixed: self.evaluate = self.evaluate_heuristic else: self.evaluate = self.evaluate_wrapper population = self.generate_population() population_with_fitness = self.fitness(population, X, y) iterator = tqdm(range(self.generations), leave=False) if self.verbose else range( self.generations) for generation in iterator: if self.mixed and generation > int( self.generations * self.mixed_percentage ) and self.evaluate == self.evaluate_heuristic: self.evaluate = self.evaluate_wrapper # Reevaluate for fair combination population_with_fitness = self.fitness([ individual_with_fitness[0] for individual_with_fitness in population_with_fitness ], X, y) selected_individuals = self.selection(population_with_fitness) crossed_individuals = selected_individuals # self.crossover(selected_individuals) mutated_individuals = self.mutation(crossed_individuals, X=X, y=y) new_population = self.fitness(mutated_individuals, X, y) population_with_fitness = self.combine(population_with_fitness, new_population) # Obtaining population's statistics if self.verbose: best, mean = get_max_mean(population_with_fitness) iterator.set_postfix({ "Generation": generation, "hit_count": self.evaluate.hit_count, "populationLength": len(population_with_fitness), "best fitness": best, "mean fitness": mean }) best_individual = max(population_with_fitness, key=lambda x: x[1])[0] return best_individual[0] + best_individual[1] def reset_evaluation(self): self.evaluate_wrapper = memoize_genetic(self.simple_evaluate) self.evaluate_heuristic = memoize_genetic( self.simple_evaluate_heuristic) def set_params(self, **params): super().set_params(**params) if "selection" in params: if params["selection"] not in ("rank", "proportionate"): raise ValueError( "Unknown selection parameter expected one of : " + str(tuple(["rank", "proportionate"]))) self.selection = self.select_population_rank if "rank" in params[ "selection"] else self.select_population if "combine" in params: if params["combine"] not in ("elitism", "truncate"): raise ValueError( "Unknown selection parameter expected one of : " + str(tuple(["elitism", "truncate"]))) self.combine = self.elitism if "elit" in params[ "combine"] else self.truncation if "mutation" in params: if params["mutation"] not in ("complex", "simple"): raise ValueError( "Unknown selection parameter expected one of : " + str(tuple(["complex", "simple"]))) self.mutation = self.mutate_simple if "simple" == params[ "mutation"] else self.mutate_complex def __init__(self, seed=None, individuals=1, generations=40, mutation_probability=0.2, selection="rank", mutation="simple", combine="elitism", n_intervals=5, metric="accuracy", flexible_logic=True, verbose=False, encode_data=True, mixed=True, mixed_percentage=0.5): self.mixed_percentage = mixed_percentage self.mixed = mixed self.encode_data = encode_data self.flexible_logic = flexible_logic self.verbose = verbose self.n_intervals = n_intervals self.metric = metric self.seed = seed self.individuals = individuals self.generations = generations self.mutation_probability = mutation_probability self.selection = selection self.combine = combine self.mutation = mutation allowed_selection = ('rank', 'proportionate') allowed_combine = ('elitism', 'truncate') allowed_mutation = ('complex', 'simple') if self.selection not in allowed_selection: raise ValueError( "Unknown selection type: %s, expected one of %s." % (self.selection, selection)) if self.combine not in allowed_combine: raise ValueError("Unknown combine type: %s, expected one of %s." % (self.combine, combine)) if self.mutation not in allowed_mutation: raise ValueError( "Unknown selection type: %s, expected one of %s." % (self.mutation, mutation)) self.selection = self.select_population_rank if "rank" in selection else self.select_population self.combine = self.elitism if "elit" in combine else self.truncation self.mutation = self.mutate_simple if "simple" in mutation else self.mutate_complex self.reset_evaluation()
def acfs_score_comparison(datasets, seed, base_path, params, n_splits=3, n_repeats=5, n_intervals=5, metric="accuracy", send_email=False, email_data=dict(), verbose=True): # List to store results and column names for the csv result = [] columns = [ "Database", "Number of attributes", "NBScore", "NBScore STD", "ACFCS Score", "ACFCS Score STD", "Configuration", "Nodes", "Contruction Matrix", "Selection Matrix", "Selected_attributes", "Original" ] dataset_tqdm = tqdm(datasets) # Instantiate the classifier acfcs = ACFCS(verbose=0, metric=metric) nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric) # Execute algorithm on datasets for database in dataset_tqdm: name, label = database if not os.path.exists(base_path + name): print(f"{name} doesnt' exist") continue # Assume UCI REPO like data test = f"{name}.test.csv" data = f"{name}.data.csv" X, y = get_X_y_from_database(base_path, name, data, test, label) # Update progressbar dataset_tqdm.set_postfix({"DATABASE": name}) # Set up data structures to store results nb_score = np.zeros(shape=(len(params), n_splits * n_repeats)) acfcs_score = np.zeros(shape=(len(params), n_splits * n_repeats)) acfcs_selection_matrix = np.zeros(shape=(len(params), n_splits * n_repeats)) acfcs_construction_matrix = np.zeros(shape=(len(params), n_splits * n_repeats)) acfcs_nodes = np.zeros(shape=(len(params), n_splits * n_repeats)) acfcs_dummy = np.zeros(shape=(len(params), n_splits * n_repeats)) acfcs_selected = np.zeros(shape=(len(params), n_splits * n_repeats)) # Create splits for the experiments rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed) seed_tqdm = tqdm(rskf.split(X, y), leave=False, total=n_splits * n_repeats, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}' ) if verbose else rskf.split(X, y) # Execute experiments for i, data in enumerate(seed_tqdm): train_index, test_index = data X_train, X_test = X.iloc[train_index], X.iloc[test_index] y_train, y_test = y.iloc[train_index], y.iloc[test_index] # Encode the data c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals) X_train = c.fit_transform(X_train) X_test = c.transform(X_test) l = CustomLabelEncoder() y_train = l.fit_transform(y_train) y_test = l.transform(y_test) # Assess the classifiers reusing info to speed up evaluation nb.fit(X_train, y_train) naive_bayes_score = nb.score(X_test, y_test) acfcs.reset_cache() for conf_index, conf in enumerate(params): acfcs.set_params(**conf) acfcs.fit(X_train, y_train, init_graph=conf_index == 0) # score acfcs_score_conf = acfcs.score(X_test, y_test) if verbose: seed_tqdm.set_postfix({ "config": conf_index, "nb_score": naive_bayes_score, "ant_score": acfcs_score_conf }) # Get data n_original_features = len( list( filter( lambda x: isinstance(x, DummyFeatureConstructor), acfcs.best_features))) n_selected = len(acfcs.best_features) selection_matrix = len(acfcs.afg.pheromone_selection) construction_matrix = len(acfcs.afg.pheromone_construction) nodes = len(acfcs.afg.nodes) # Update nb_score[conf_index, i] = naive_bayes_score acfcs_score[conf_index, i] = acfcs_score_conf acfcs_selection_matrix[conf_index, i] = selection_matrix acfcs_construction_matrix[conf_index, i] = construction_matrix acfcs_nodes[conf_index, i] = nodes acfcs_dummy[conf_index, i] = n_original_features acfcs_selected[conf_index, i] = n_selected # Insert the final result - averaged metrics for this database. for conf_index, conf in enumerate(params): row = [ name, X.shape[1], np.mean(nb_score[conf_index]), np.std(nb_score[conf_index]), np.mean(acfcs_score[conf_index]), np.std(acfcs_score[conf_index]), conf, np.mean(acfcs_nodes[conf_index]), np.mean(acfcs_construction_matrix[conf_index]), np.mean(acfcs_selection_matrix[conf_index]), np.mean(acfcs_selected[conf_index]), np.mean(acfcs_dummy[conf_index]) ] result.append(row) result = pd.DataFrame(result, columns=columns) if send_email: from tfg.utils import send_results send_results("ACFCS", email_data, result) return result
class RankerLogicalFeatureConstructor(TransformerMixin, ClassifierMixin, BaseEstimator): """First proposal: Hybrid-Ranker Wrapper. Build a ranking based on Symmetrical Uncertainty (SU) of every possible logical feature of depth 1 (1 operator, 2 operands), using XOR, AND and OR operator. The steps are: - Find out combinations of values in database of every pair of features Xi, Xj: - Example: - Xi = [1,2,3,2] - Xj = ['a','b','c','a'] Possible combinations: [(1,'a'),(2,'b'),(3,'c'),(2,'a')] - Apply operator to every combination: - Example: - Xi = [1,2,3,2] - Xj = ['a','b','c','a'] Possible combinations: [(1,'a','AND'),(2,'b','AND'),(3,'c','AND'),(2,'a','AND'), (1,'a','OR'),(2,'b','OR'),(3,'c','OR'),(2,'a','OR'), (1,'a','XOR'),(2,'b','XOR'),(3,'c','XOR'),(2,'a','XOR')] - Add original variables to the list - Evaluate SU for every value in the list, and rank them - Go over the list following one of the two strategies proposed and evaluate the subset based on a leave-one-out cross-validation with the NaiveBayes classifier. Parameters ---------- strategy : str {eager,skip} After the ranking is built if the eager strategy is chosen we stop considering attributes when there is no improvement from one iteration to the next block_size : int, default=1 Number of features that are added in each iteration encode_data : boolean Whether or not to encode the received data. If set to false the classifier expects data to be encoded with an ordinal encoder. verbose : {boolean,int} If set to true it displays information of the remaining time and inside variables. operators : array-like, deafult = ("XOR","AND","OR") Operators used for the constructed features. max_features : int, deafult = inf Maximum number of features to include in the selected subset max_iterations : int, deafult = inf Maximum number of iterations in the wrapper step. use_graph : bool, default = False Generate Ranking from features obtained from the pruned-graph of the ACO algorithm. (Experimentation not carried out) use_initials: bool, default = False Force the set of initial features in the final solution. The set if trimmed with a backward elimination before-hand. Attributes ---------- feature_encoder_ : CustomOrdinalFeatureEncoder or None Encodes data in ordinal way with unseen values handling if encode_data is set to True. class_encoder_ : LabelEncoder or None Encodes Data in ordinal way for the class if encode_data is set to True. all_feature_constructors: array-like List of FeatureConstructor objects with all the possible logical features symmetrical_uncertainty_rank: array-like SU for every feature in all_feature_constructors rank : array-like Array of indexes corresponding to the sorted SU rank (in descending order). final_feature_constructors: Selected feature subset (list of constructors) classifier: NaiveBayes Classifier used in the wrapper and to perform predictions after fitting. """ def __init__(self, strategy="eager", block_size=10, encode_data=True, n_intervals=5, verbose=0, operators=("AND", "OR", "XOR"), max_features=float("inf"), max_iterations=float("inf"), metric="accuracy", use_initials=False, max_err=0, prune=None, use_graph=False): self.strategy = strategy self.block_size = max(block_size, 1) self.encode_data = encode_data self.verbose = verbose self.operators = operators self.max_features = max_features self.max_iterations = max_iterations self.n_intervals = n_intervals self.metric = metric self.max_err = max_err self.use_initials = use_initials self.prune = prune self.use_graph = use_graph allowed_strategies = ("eager", "skip") if self.strategy not in allowed_strategies: raise ValueError("Unknown operator type: %s, expected one of %s." % (self.strategy, allowed_strategies)) def fit(self, X, y): # Parse input if isinstance(y, pd.DataFrame): y = y.to_numpy() if self.encode_data: self.feature_encoder_ = CustomOrdinalFeatureEncoder( n_intervals=self.n_intervals) self.class_encoder_ = CustomLabelEncoder() X = self.feature_encoder_.fit_transform(X) y = self.class_encoder_.fit_transform(y) if isinstance(X, pd.DataFrame): X = X.to_numpy() check_X_y(X, y) # Reset the stored results for new fit self.reset_evaluation() # Generate rank if self.use_graph: # Construct the minimum graph and create rank graph = AntFeatureGraphMI(seed=None, connections=1).compute_graph( X, y, ("AND", "OR", "XOR")) self.all_feature_constructors = graph.get_rank() elif self.prune is not None: # Construct the rank with pruning by selecting pais that maximise SU(X_iX_j,Y) feature_combinations = list( combinations(list(range(X.shape[1])), 2)) + [(i, i) for i in range(X.shape[1])] rank_pairs = [ symmetrical_uncertainty_two_variables(X[:, i], X[:, j], y) for i, j in feature_combinations ] rank_pairs_index = np.argsort(rank_pairs)[::-1] # Create the unsorted list self.all_feature_constructors = [] for index in rank_pairs_index[:self.prune]: i, j = feature_combinations[index] if i == j: from tfg.feature_construction import create_feature self.all_feature_constructors.extend([ create_feature("OR", [(i, n), (i, m)]) for n, m in combinations(np.unique(X[:, i]), 2) ]) else: self.all_feature_constructors.extend( construct_features(X[:, [i, j]], operators=self.operators, same_feature=False)) else: # Create the unsorted list of all features self.all_feature_constructors = construct_features( X, operators=self.operators) if self.verbose: print( f"Total number of constructed features: {len(self.all_feature_constructors)}" ) self.all_feature_constructors.extend( [DummyFeatureConstructor(j) for j in range(X.shape[1])]) self.symmetrical_uncertainty_rank = [] # Sort the ranking for feature_constructor in self.all_feature_constructors: feature = feature_constructor.transform(X) su = symmetrical_uncertainty(f1=feature.flatten(), f2=y) self.symmetrical_uncertainty_rank.append(su) # Store the descending order index self.rank = np.argsort(self.symmetrical_uncertainty_rank)[::-1] # If the initial variables are if self.use_initials: classifier = NaiveBayes(encode_data=False, n_intervals=self.n_intervals, metric=self.metric) classifier.fit(X, y) current_features = [ DummyFeatureConstructor(j) for j in range(X.shape[1]) ] # Store the backward result to reuse it for other executions self.initial_backward_features = backward_search( X, y, current_features, classifier) # Feature Subset Selection (FSS) from the rank self.filter_features(X, y) return self def predict(self, X): X, _ = self.transform(X) if self.encode_data: return self.class_encoder_.inverse_transform( self.classifier.predict(X)) return self.classifier.predict(X) def reset_evaluation(self): # Reset the memoize evaluations self.evaluate_leave_one_out_cross_val = memoize(evaluate_leave_one_out) def predict_proba(self, X): X, _ = self.transform(X) return self.classifier.predict_proba(X) def score(self, X, y): X, y = self.transform(X, y) return self.classifier.score(X, y) def filter_features(self, X, y): '''After the rank is built this perform the greedy wrapper search''' check_is_fitted(self) self.classifier = NaiveBayes(encode_data=False, n_intervals=self.n_intervals, metric=self.metric) current_score = np.NINF first_iteration = True current_features = [] current_data = None if self.use_initials: # Original Features have already been taken into account rank_iter = filter( lambda x: not isinstance(self.all_feature_constructors[x], DummyFeatureConstructor), iter(self.rank)) # Deep copy to avoid issues when modifying the list current_features = deepcopy(self.initial_backward_features) current_data = np.concatenate( [f.transform(X) for f in current_features], axis=1) # Get initial LOO score current_score = self.evaluate_leave_one_out_cross_val( self.classifier, current_features, current_data, y, fit=True) else: # Iterator over the sorted list of indexes rank_iter = iter(self.rank) if self.verbose: progress_bar = tqdm(total=len(self.rank), bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}') iteration = 0 iterations_without_improvements = 0 # Loop for including {block size} elements at a time # Rank is an iterator, so the for loop is not sequential! for feature_constructor_index in rank_iter: iteration += 1 if self.verbose: progress_bar.set_postfix({ "n_features": len(current_features), "score": current_score }) progress_bar.update(1) progress_bar.refresh() # Add block size features new_X = [ self.all_feature_constructors[feature_constructor_index]. transform(X) ] selected_features = [ self.all_feature_constructors[feature_constructor_index] ] for _ in range(self.block_size - 1): try: index = next(rank_iter) selected_features.append( self.all_feature_constructors[index]) new_X.append( self.all_feature_constructors[index].transform(X)) if self.verbose: progress_bar.update(1) progress_bar.refresh() except: # Block size does not divide the number of elements in the rank. The search is halted break # Evaluate features new_X = np.concatenate(new_X, axis=1) if iteration == 1 and not self.use_initials: current_data = new_X current_score = self.evaluate_leave_one_out_cross_val( self.classifier, selected_features, current_data, y, fit=True) current_features = selected_features first_iteration = False if self.max_iterations <= iteration or ( len(current_features) + self.block_size) > self.max_features: break continue data = np.concatenate([current_data, new_X], axis=1) self.classifier.add_features(new_X, y) # LOO evaluation score = self.evaluate_leave_one_out_cross_val(self.classifier, current_features + selected_features, data, y, fit=False) if score > current_score: current_score = score current_data = data current_features.extend(selected_features) iterations_without_improvements = 0 else: iterations_without_improvements += 1 # Remove last added block for feature_index_to_remove in range( data.shape[1], data.shape[1] - new_X.shape[1], -1): self.classifier.remove_feature(feature_index_to_remove - 1) if self.strategy == "eager" and self.max_err < iterations_without_improvements: # Stops as soon as no impovement break if self.max_iterations <= iteration or ( len(current_features) + self.block_size) > self.max_features: break if self.verbose: progress_bar.close() print( f"\nFinal number of included features: {len(current_features)} - Final Score: {current_score}" ) self.final_feature_constructors = current_features return self def transform(self, X, y=None): check_is_fitted(self) if isinstance(y, pd.DataFrame): y = y.to_numpy() if self.encode_data: X = self.feature_encoder_.transform(X) if y is not None: y = self.class_encoder_.transform(y) if isinstance(X, pd.DataFrame): X = X.to_numpy() new_X = [] for feature_constructor in self.final_feature_constructors: new_X.append(feature_constructor.transform(X)) return np.concatenate(new_X, axis=1), y
def fit(self, X, y): # Parse input if isinstance(y, pd.DataFrame): y = y.to_numpy() if self.encode_data: self.feature_encoder_ = CustomOrdinalFeatureEncoder( n_intervals=self.n_intervals) self.class_encoder_ = CustomLabelEncoder() X = self.feature_encoder_.fit_transform(X) y = self.class_encoder_.fit_transform(y) if isinstance(X, pd.DataFrame): X = X.to_numpy() check_X_y(X, y) # Reset the stored results for new fit self.reset_evaluation() # Generate rank if self.use_graph: # Construct the minimum graph and create rank graph = AntFeatureGraphMI(seed=None, connections=1).compute_graph( X, y, ("AND", "OR", "XOR")) self.all_feature_constructors = graph.get_rank() elif self.prune is not None: # Construct the rank with pruning by selecting pais that maximise SU(X_iX_j,Y) feature_combinations = list( combinations(list(range(X.shape[1])), 2)) + [(i, i) for i in range(X.shape[1])] rank_pairs = [ symmetrical_uncertainty_two_variables(X[:, i], X[:, j], y) for i, j in feature_combinations ] rank_pairs_index = np.argsort(rank_pairs)[::-1] # Create the unsorted list self.all_feature_constructors = [] for index in rank_pairs_index[:self.prune]: i, j = feature_combinations[index] if i == j: from tfg.feature_construction import create_feature self.all_feature_constructors.extend([ create_feature("OR", [(i, n), (i, m)]) for n, m in combinations(np.unique(X[:, i]), 2) ]) else: self.all_feature_constructors.extend( construct_features(X[:, [i, j]], operators=self.operators, same_feature=False)) else: # Create the unsorted list of all features self.all_feature_constructors = construct_features( X, operators=self.operators) if self.verbose: print( f"Total number of constructed features: {len(self.all_feature_constructors)}" ) self.all_feature_constructors.extend( [DummyFeatureConstructor(j) for j in range(X.shape[1])]) self.symmetrical_uncertainty_rank = [] # Sort the ranking for feature_constructor in self.all_feature_constructors: feature = feature_constructor.transform(X) su = symmetrical_uncertainty(f1=feature.flatten(), f2=y) self.symmetrical_uncertainty_rank.append(su) # Store the descending order index self.rank = np.argsort(self.symmetrical_uncertainty_rank)[::-1] # If the initial variables are if self.use_initials: classifier = NaiveBayes(encode_data=False, n_intervals=self.n_intervals, metric=self.metric) classifier.fit(X, y) current_features = [ DummyFeatureConstructor(j) for j in range(X.shape[1]) ] # Store the backward result to reuse it for other executions self.initial_backward_features = backward_search( X, y, current_features, classifier) # Feature Subset Selection (FSS) from the rank self.filter_features(X, y) return self
def ranker_score_comparison(datasets, seed, base_path, params, n_splits=3, n_repeats=5, n_intervals=5, metric="accuracy", send_email=False, email_data=dict(), share_rank=True): result = [] columns = ["Database", "Number of attributes", "NBScore", "NBScore STD", "Ranker Score", "Ranker Score STD", "Configuration", "Combinations", "Selected_attributes", "Original"] dataset_tqdm = tqdm(datasets) # Instantiate the classifier r = RankerLogicalFeatureConstructor(n_intervals=n_intervals, metric=metric) nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric) # Execute algorithm on datasets for database in dataset_tqdm: name, label = database if not os.path.exists(base_path + name): print(f"{name} doesnt' exist") continue # Assume UCI REPO like data test = f"{name}.test.csv" data = f"{name}.data.csv" X, y = get_X_y_from_database(base_path, name, data, test, label) dataset_tqdm.set_postfix({"DATABASE": name}) # Set up data structures to store results nb_score = np.zeros(shape=(len(params), n_splits*n_repeats)) r_score = np.zeros(shape=(len(params), n_splits*n_repeats)) r_combinations = np.zeros(shape=(len(params), n_splits*n_repeats)) r_selected = np.zeros(shape=(len(params), n_splits*n_repeats)) r_dummy = np.zeros(shape=(len(params), n_splits*n_repeats)) r_total_constructed = np.zeros(shape=(len(params), n_splits*n_repeats)) r_total_selected = np.zeros(shape=(len(params), n_splits*n_repeats)) r_original_selected = np.zeros(shape=(len(params), n_splits*n_repeats)) rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed) seed_tqdm = tqdm(rskf.split(X, y), leave=False, total=n_splits*n_repeats, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}') for i, data in enumerate(seed_tqdm): train_index, test_index = data X_train, X_test = X.iloc[train_index], X.iloc[test_index] y_train, y_test = y.iloc[train_index], y.iloc[test_index] c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals) X_train = c.fit_transform(X_train) X_test = c.transform(X_test) l = CustomLabelEncoder() y_train = l.fit_transform(y_train) y_test = l.transform(y_test) # Assess the classifiers nb.fit(X=X_train, y=y_train) naive_bayes_score = nb.score(X_test, y_test) for conf_index, conf in enumerate(params): seed_tqdm.set_postfix({"config": conf_index}) r.set_params(**conf) # Fit if conf_index == 0 or not share_rank: # The rank is computed from scratch r.fit(X_train, y_train) else: r.filter_features(r.feature_encoder_.transform( X_train), r.class_encoder_.transform(y_train)) # score ranker_score = r.score(X_test, y_test) # Get data n_original_features = len(list(filter(lambda x: isinstance( x, DummyFeatureConstructor), r.final_feature_constructors))) n_combinations = len(r.all_feature_constructors) n_selected = len(r.final_feature_constructors) # Update nb_score[conf_index, i] = naive_bayes_score r_score[conf_index, i] = ranker_score r_combinations[conf_index, i] = n_combinations r_selected[conf_index, i] = n_selected r_dummy[conf_index, i] = n_original_features # Insert to final result averaged metrics for this dataset for conf_index, conf in enumerate(params): row = [name, X.shape[1], np.mean(nb_score[conf_index]), np.std(nb_score[conf_index]), np.mean(r_score[conf_index]), np.std(r_score[conf_index]), conf, np.mean(r_combinations[conf_index]), np.mean(r_selected[conf_index]), np.mean(r_dummy[conf_index])] result.append(row) result = pd.DataFrame(result, columns=columns) if send_email: from tfg.utils import send_results send_results("RANKER", email_data, result) return result
def genetic_score_comparison(datasets, seed, base_path, params, n_splits=3, n_repeats=5, n_intervals=5, metric="accuracy", send_email=False, email_data=dict(), verbose=True, version=1): result = [] columns = [ "Database", "Number of attributes", "NBScore", "NBScore STD", "Genetic Score", "Genetic Score STD", "Configuration", "Selected_attributes", "Original" ] dataset_tqdm = tqdm(datasets) # Instantiate the classifier if version == 1: # First Version - No flexibility in the number of attributes (bad performance) # clf = GeneticProgramming(seed=seed, metric=metric) clf = GeneticProgrammingFlexibleLogic(seed=seed, metric=metric) elif version == 2: # Version with flexibility clf = GeneticProgrammingFlexibleLogic(seed=seed, metric=metric) else: # Guided mutation based on SU clf = GeneticProgrammingRankMutation(seed=seed, metric=metric) nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric) # Execute algorithm on datasets for database in dataset_tqdm: name, label = database if not os.path.exists(base_path + name): print(f"{name} doesnt' exist") continue # Assume UCI REPO like data test = f"{name}.test.csv" data = f"{name}.data.csv" X, y = get_X_y_from_database(base_path, name, data, test, label) dataset_tqdm.set_postfix({"DATABASE": name}) # Set up data structures to store results nb_score = np.zeros(shape=(len(params), n_splits * n_repeats)) clf_score = np.zeros(shape=(len(params), n_splits * n_repeats)) clf_selected = np.zeros(shape=(len(params), n_splits * n_repeats)) clf_dummy = np.zeros(shape=(len(params), n_splits * n_repeats)) # Create splits for the experiments rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed) seed_tqdm = tqdm(rskf.split(X, y), leave=False, total=n_splits * n_repeats, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}' ) if verbose else rskf.split(X, y) # Execute experiments for i, data in enumerate(seed_tqdm): train_index, test_index = data X_train, X_test = X.iloc[train_index], X.iloc[test_index] y_train, y_test = y.iloc[train_index], y.iloc[test_index] # Encode the data c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals) X_train = c.fit_transform(X_train) X_test = c.transform(X_test) l = CustomLabelEncoder() y_train = l.fit_transform(y_train) y_test = l.transform(y_test) # Assess the classifiers reusing info to speed up evaluation nb.fit(X_train, y_train) naive_bayes_score = nb.score(X_test, y_test) # Reset evaluation-cache for new split clf.reset_evaluation() for conf_index, conf in enumerate(params): if verbose: seed_tqdm.set_postfix({"config": conf_index}) clf.set_params(**conf) clf.fit(X_train, y_train) # score genetic_score = clf.score(X_test, y_test) # Get data n_original_features = len( list( filter( lambda x: isinstance(x, DummyFeatureConstructor), clf.best_features))) n_selected = len(clf.best_features) # Update nb_score[conf_index, i] = naive_bayes_score clf_score[conf_index, i] = genetic_score clf_selected[conf_index, i] = n_selected clf_dummy[conf_index, i] = n_original_features # Insert to final result averaged metrics for this database for conf_index, conf in enumerate(params): row = [ name, X.shape[1], np.mean(nb_score[conf_index]), np.std(nb_score[conf_index]), np.mean(clf_score[conf_index]), np.std(clf_score[conf_index]), conf, np.mean(clf_selected[conf_index]), np.mean(clf_dummy[conf_index]) ] result.append(row) result = pd.DataFrame(result, columns=columns) if send_email: from tfg.utils import send_results send_results(f"GENETIC_{version}", email_data, result) return result
def fit(self, X, y, init_graph=True): self.feature_encoder_ = CustomOrdinalFeatureEncoder() self.class_encoder_ = CustomLabelEncoder() self.categories_ = None if isinstance(X, pd.DataFrame): self.categories_ = X.columns if self.encode_data: X = self.feature_encoder_.fit_transform(X) y = self.class_encoder_.fit_transform(y) if init_graph: if self.graph_strategy == "full": #Full graph self.afg = AntFeatureGraph(seed=self.seed).compute_graph( X, y, ("XOR", "OR", "AND")) else: #Pruned graph self.afg = AntFeatureGraphMI( seed=self.seed, connections=self.connections).compute_graph( X, y, ("XOR", "OR", "AND")) else: self.afg.reset_pheromones() if self.verbose: print(f"Number of nodes: {len(self.afg.nodes)}") random.seed(self.seed) best_score = 0 self.best_features = [] iterations_without_improvement = 0 iterator = tqdm(range(self.iterations)) if self.verbose else range( self.iterations) beta = self.beta distance_from_best = -1 for iteration in iterator: if self.verbose: iterator.set_postfix({ "best_score": best_score, "n_features": len(self.best_features), "p_matrix_c": len(self.afg.pheromone_construction), "p_matrix_s": len(self.afg.pheromone_selection), "distance_from_best": distance_from_best }) ants = [ Ant(ant_id=i, alpha=self.alpha, beta=beta, metric=self.metric, use_initials=self.use_initials, cache_loo=self.cache_loo, cache_heuristic=self.cache_heuristic, step=self.step) for i in range(self.ants) ] beta *= (1 - self.beta_evaporation_rate) results = [] for ant in ants: results.append( ant.run(X=X, y=y, graph=self.afg, random_generator=random, parallel=self.parallel, max_errors=self.max_errors)) results = np.array(results) self.afg.update_pheromone_matrix_evaporation(self.evaporation_rate) distance_from_best = np.mean(np.abs(results - best_score)) best_ant = np.argmax(results) if self.update_strategy == "best": ant = ants[best_ant] self.afg.intensify(ant.current_features, self.intensification_factor, 1, self.use_initials) else: for ant_score, ant in zip(results, ants): self.afg.intensify(ant.current_features, self.intensification_factor, ant_score, self.use_initials) if results[best_ant] >= best_score: iterations_without_improvement = 0 ant = ants[best_ant] best_score = results[best_ant] self.best_features = ant.current_features else: iterations_without_improvement += 1 if iterations_without_improvement > self.early_stopping: break self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric) if self.final_selection == "BEST": pass else: #An ant traverses the graph deterministically to obtain the features final_ant = FinalAnt(ant_id=0, alpha=self.alpha, beta=beta, metric=self.metric, use_initials=self.use_initials, cache_loo=self.cache_loo, cache_heuristic=self.cache_heuristic, step=self.step) final_ant.run(X=X, y=y, graph=self.afg, random_generator=random, parallel=self.parallel) self.best_features = final_ant.current_features #Train model with final features self.classifier_.fit( np.concatenate( [feature.transform(X) for feature in self.best_features], axis=1), y) if self.save_features: #Save to features to dict translate_features(features=self.best_features, feature_encoder=self.feature_encoder_, categories=self.categories_, path=self.path, filename=self.filename) return self
class ACFCS(OptimizationMixin, TransformerMixin, ClassifierMixin, BaseEstimator): def __init__(self, ants=10, evaporation_rate=0.05, intensification_factor=0.05, alpha=1.0, beta=0.0, beta_evaporation_rate=0.05, step=1, iterations=100, early_stopping=20, update_strategy="best", seed=None, parallel=False, save_features=False, path=None, filename=None, verbose=0, graph_strategy="mutual_info", connections=2, max_errors=0, metric="accuracy", use_initials=False, final_selection="ALL", encode_data=True): self.step = step self.ants = ants self.evaporation_rate = evaporation_rate self.intensification_factor = intensification_factor self.alpha = alpha self.beta = beta self.beta_evaporation_rate = beta_evaporation_rate self.iterations = iterations self.early_stopping = early_stopping self.seed = seed self.parallel = parallel self.save_features = save_features self.path = path self.filename = filename self.verbose = verbose self.graph_strategy = graph_strategy self.connections = connections self.metric = metric self.update_strategy = update_strategy self.use_initials = use_initials self.final_selection = final_selection self.encode_data = encode_data self.max_errors = max_errors allowed_graph_strategy = ("full", "mutual_info") if self.graph_strategy not in allowed_graph_strategy: raise ValueError( "Unknown graph strategy type: %s, expected one of %s." % (self.graph_strategy, allowed_graph_strategy)) allowed_update_strategy = ("all", "best") if self.update_strategy not in allowed_update_strategy: raise ValueError( "Unknown graph strategy type: %s, expected one of %s." % (self.update_strategy, allowed_update_strategy)) self.reset_cache() def reset_cache(self): self.cache_loo = dict() self.cache_heuristic = dict() def fit(self, X, y, init_graph=True): self.feature_encoder_ = CustomOrdinalFeatureEncoder() self.class_encoder_ = CustomLabelEncoder() self.categories_ = None if isinstance(X, pd.DataFrame): self.categories_ = X.columns if self.encode_data: X = self.feature_encoder_.fit_transform(X) y = self.class_encoder_.fit_transform(y) if init_graph: if self.graph_strategy == "full": #Full graph self.afg = AntFeatureGraph(seed=self.seed).compute_graph( X, y, ("XOR", "OR", "AND")) else: #Pruned graph self.afg = AntFeatureGraphMI( seed=self.seed, connections=self.connections).compute_graph( X, y, ("XOR", "OR", "AND")) else: self.afg.reset_pheromones() if self.verbose: print(f"Number of nodes: {len(self.afg.nodes)}") random.seed(self.seed) best_score = 0 self.best_features = [] iterations_without_improvement = 0 iterator = tqdm(range(self.iterations)) if self.verbose else range( self.iterations) beta = self.beta distance_from_best = -1 for iteration in iterator: if self.verbose: iterator.set_postfix({ "best_score": best_score, "n_features": len(self.best_features), "p_matrix_c": len(self.afg.pheromone_construction), "p_matrix_s": len(self.afg.pheromone_selection), "distance_from_best": distance_from_best }) ants = [ Ant(ant_id=i, alpha=self.alpha, beta=beta, metric=self.metric, use_initials=self.use_initials, cache_loo=self.cache_loo, cache_heuristic=self.cache_heuristic, step=self.step) for i in range(self.ants) ] beta *= (1 - self.beta_evaporation_rate) results = [] for ant in ants: results.append( ant.run(X=X, y=y, graph=self.afg, random_generator=random, parallel=self.parallel, max_errors=self.max_errors)) results = np.array(results) self.afg.update_pheromone_matrix_evaporation(self.evaporation_rate) distance_from_best = np.mean(np.abs(results - best_score)) best_ant = np.argmax(results) if self.update_strategy == "best": ant = ants[best_ant] self.afg.intensify(ant.current_features, self.intensification_factor, 1, self.use_initials) else: for ant_score, ant in zip(results, ants): self.afg.intensify(ant.current_features, self.intensification_factor, ant_score, self.use_initials) if results[best_ant] >= best_score: iterations_without_improvement = 0 ant = ants[best_ant] best_score = results[best_ant] self.best_features = ant.current_features else: iterations_without_improvement += 1 if iterations_without_improvement > self.early_stopping: break self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric) if self.final_selection == "BEST": pass else: #An ant traverses the graph deterministically to obtain the features final_ant = FinalAnt(ant_id=0, alpha=self.alpha, beta=beta, metric=self.metric, use_initials=self.use_initials, cache_loo=self.cache_loo, cache_heuristic=self.cache_heuristic, step=self.step) final_ant.run(X=X, y=y, graph=self.afg, random_generator=random, parallel=self.parallel) self.best_features = final_ant.current_features #Train model with final features self.classifier_.fit( np.concatenate( [feature.transform(X) for feature in self.best_features], axis=1), y) if self.save_features: #Save to features to dict translate_features(features=self.best_features, feature_encoder=self.feature_encoder_, categories=self.categories_, path=self.path, filename=self.filename) return self
def scoring_comparison(base_path,datasets,verbose=1,test_size=0.3,seed=None,n_iterations=30): column_names = ["dataset", "custom_training_score", "custom_test_score", "categorical_training_score", "categorical_test_score"] data =[] clf_no_encoding = NaiveBayes(encode_data=True) clf_categorical_sklearn = CategoricalNB() datasets_iter = tqdm(datasets, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}') c = CustomOrdinalFeatureEncoder() l = CustomLabelEncoder() for dataset in datasets_iter: dataset_name, label = dataset data_filename = f"{dataset_name}.data.csv" test_filename = f"{dataset_name}.test.csv" X, y = get_X_y_from_database(base_path=base_path, name = dataset_name, data = data_filename, test = test_filename, label = label) custom_train = [] custom_test = [] sklearn_train = [] sklearn_test = [] X = c.fit_transform(X) y = l.fit_transform(y) for iteration in range(n_iterations): if verbose: datasets_iter.set_postfix({"Dataset": dataset_name, "seed":iteration}) datasets_iter.refresh() try: X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=test_size, random_state=seed+iteration, shuffle=True, stratify=y) except: #Not enough values to stratify y X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=test_size, random_state=seed+iteration, shuffle=True ) #Fit clf_no_encoding.fit(X_train,y_train) clf_categorical_sklearn.min_categories = [1+np.max(np.concatenate([X_train[:,j],X_test[:,j]])) for j in range(X_train.shape[1])] clf_categorical_sklearn.fit(X_train,y_train) #Predict custom_train.append(clf_no_encoding.score(X_train,y_train)) custom_test.append(clf_no_encoding.score(X_test,y_test)) sklearn_train.append(clf_categorical_sklearn.score(X_train,y_train)) sklearn_test.append(clf_categorical_sklearn.score(X_test,y_test)) data.append([dataset_name,np.mean(custom_train),np.mean(custom_test),np.mean(sklearn_train),np.mean(sklearn_test)]) return pd.DataFrame(data,columns = column_names)
class NaiveBayes(ClassifierMixin, BaseEstimator): """A Naive Bayes classifier. Simple NaiveBayes classifier accepting non-encoded input, enhanced with numba using MAP to predict most likely class. Parameters ---------- alpha : {float, array-like}, default=1.0 Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). If it is an array it is expected to have the same size as number of attributes encode_data : bool, default=True Encode data when data is not encoded by default with an OrdinalEncoder discretize : bool, default=True Discretize numerical data n_intervals : int or None, default=5 Discretize numerical data using the specified number of intervals Attributes ---------- feature_encoder_ : CustomOrdinalFeatureEncoder or None Encodes data in ordinal way with unseen values handling if encode_data is set to True. class_encoder_ : LabelEncoder or None Encodes Data in ordinal way for the class if encode_data is set to True. n_samples_ : int Number of samples n_features_ : int Number of features n_classes_ : int Number of classes class_values_ : array-like of shape (n_classes_,) Array containing the values of the classes, as ordinal encoding is assumed it will be an array ranging from 0 to largest value for the class class_count_ : array-like of shape (n_classes_,) Array where `class_count_[i]` contains the count of the ith class value. class_log_count_ : array-like of shape (n_classes_,) Array where `class_count_[i]` contains the log count of the ith class value. feature_values_count_per_element_ : array-like of shape (column_count,~) Array where `feature_values_count_per_element_[i]` is an array where `feature_values_count_per_element_[i][j]` contains the count of the jth value for the ith feature. Assuming ordinal encoding, some values might be equal to 0 feature_values_count_ : array-like of shape (column_count,) Array where `feature_values_count_per_element_[i]` is an integer with the number of possible values for the ith feature. feature_unique_values_count_ : array-like of shape (column_count,) Array where `feature_unique_values_count_[i]` is an integer with the number of unique seen values for the ith feature at fitting time. This is needed to compute the smoothing. total_probability_ : array-like of shape (n_classes,) Smoothing factor to be applied to the prediction. Array where `total_probability_[i]` if equal to class_count_[i] + alpha*feature_unique_values_count_ self.indepent_term_ : array-like of shape (n_classes,) Independent term computed at fitting time. It includes the smoothing factor to be applied to the prediction and the apriori probability. probabilities_ : array-like of shape (column_count,~) Array where `feature_values_count_per_element_[i]` is an array of shape (where `feature_values_count_per_element_[i][j]` contains the count of the jth value for the ith feature. Assuming ordinal encoding, some values might be equal to 0 """ def __init__(self, alpha=1.0, encode_data=True, n_intervals=5, discretize=True, metric="accuracy"): self.alpha = alpha self.encode_data = encode_data self.n_intervals = n_intervals self.discretize = discretize self.metric = metric self._get_scorer() super().__init__() def _get_scorer(self): self.scorer = get_scorer(self.metric) if self.metric == "f1_score": # Unseen values for target class may cause errors self.scorer = lambda y_true, y_pred: get_scorer(self.metric)( y_true=y_true, y_pred=y_pred, average="macro", zero_division=0) def set_params(self, **params): super().set_params(**params) self._get_scorer() def _compute_independent_terms(self): """Computes the terms that are indepent of the prediction""" self.total_probability_ = compute_total_probability_( self.class_count_, self.feature_unique_values_count_, self.alpha) # self.total_probability_ = compute_total_probability_(self.class_count_,self.feature_values_count_,self.alpha) #-->scikit uses this self.indepent_term_ = self.class_log_count_smoothed_ - self.total_probability_ def _compute_class_counts(self, X: np.ndarray, y: np.ndarray): """Computes the counts for the priors""" self.n_classes_ = 1 + np.max(y) self.class_count_ = np.bincount(y) self.class_log_count_ = np.log(self.class_count_) self.class_count_smoothed_ = (self.class_count_ + self.alpha) self.class_log_count_smoothed_ = np.log(self.class_count_smoothed_) def _compute_feature_counts(self, X: np.ndarray, y: np.ndarray): """Computes the conditional smoothed counts for each feature""" tables = _get_tables(X, y, self.n_classes_, self.alpha) self.smoothed_counts_ = tables[0] self.smoothed_log_counts_ = tables[1] self.feature_values_count_ = tables[2] self.feature_values_count_per_element_ = tables[3] self.feature_unique_values_count_ = tables[4] def fit(self, X: np.ndarray, y: np.ndarray): """ Fits the classifier with trainning data. Parameters ---------- X : array-like of shape (n_samples, n_features_) Training array that must be encoded unless encode_data is set to True y : array-like of shape (n_samples,) Label of the class associated to each sample. Returns ------- self : object """ if isinstance(y, pd.DataFrame): y = y.to_numpy() if self.encode_data: self.feature_encoder_ = CustomOrdinalFeatureEncoder( n_intervals=self.n_intervals, discretize=self.discretize) self.class_encoder_ = CustomLabelEncoder() X = self.feature_encoder_.fit_transform(X) y = self.class_encoder_.fit_transform(y) if isinstance(X, pd.DataFrame): X = X.to_numpy() check_X_y(X, y) if X.dtype != int: X = X.astype(int) if y.dtype != int: y = y.astype(int) self.n_samples_, self.n_features_ = X.shape self._compute_class_counts(X, y) self._compute_feature_counts(X, y) self._compute_independent_terms() return self def predict(self, X: np.ndarray): """ Predicts the label of the samples based on the MAP. Parameters ---------- X : array-like of shape (n_samples, n_features_) Training array that must be encoded unless encode_data is set to True Returns ------- y : array-like of shape (n_samples) Predicted label for each sample. """ check_is_fitted(self) if self.encode_data: X = self.feature_encoder_.transform(X) if isinstance(X, pd.DataFrame): X = X.to_numpy() if X.dtype != int: X = X.astype(int) check_array(X) log_probabilities = _predict(X, self.smoothed_log_counts_, self.feature_values_count_, self.alpha) log_probabilities += self.indepent_term_ output = np.argmax(log_probabilities, axis=1) if self.encode_data: output = self.class_encoder_.inverse_transform(output) return output def predict_proba(self, X: np.ndarray): """ Predicts the probability for each label of the samples based on the MAP. Parameters ---------- X : array-like of shape (n_samples, n_features_) Training array that must be encoded unless encode_data is set to True Returns ------- y : array-like of shape (n_classes,n_samples) Array where `y[i][j]` contains the MAP of the jth class for ith sample """ check_is_fitted(self) if X.dtype != int: X = X.astype(int) if self.encode_data: X = self.feature_encoder_.transform(X) if isinstance(X, pd.DataFrame): X = X.to_numpy() log_probabilities = _predict(X, self.smoothed_log_counts_, self.feature_values_count_, self.alpha) log_probabilities += self.indepent_term_ log_prob_x = logsumexp(log_probabilities, axis=1) return np.exp(log_probabilities - np.atleast_2d(log_prob_x).T) def leave_one_out_cross_val(self, X, y, fit=True): """Efficient LOO computation""" if fit: self.fit(X, y) if self.encode_data: X = self.feature_encoder_.transform(X) y = self.class_encoder_.transform(y) if isinstance(X, pd.DataFrame): X = X.to_numpy() if X.dtype != int: X = X.astype(int) if y.dtype != int: X = X.astype(int) log_alpha = np.log(self.alpha) log_proba = np.zeros((X.shape[0], self.n_classes_)) for i in range(X.shape[0]): example, label = X[i], y[i] class_count_ = self.class_count_.copy() class_count_[label] -= 1 log_proba[i] = np.log(class_count_ + self.alpha) for j in range(X.shape[1]): p = self.smoothed_log_counts_[j][example[j]].copy() p[label] = np.log( np.max([ self.smoothed_counts_[j][example[j]][label] - 1, self.alpha ])) log_proba[i] += p if self.feature_values_count_per_element_[j][example[j]] == 1: update_value = np.log(class_count_ + ( self.feature_unique_values_count_[j] - 1) * self.alpha) else: update_value = np.log( class_count_ + (self.feature_unique_values_count_[j]) * self.alpha) log_proba[i] -= np.where(update_value == np.NINF, 0, update_value) y_pred = np.argmax(log_proba, axis=1) return self.scorer(y, y_pred) def add_features(self, X, y, index=None): """Updates classifier with new features Parameters ---------- X : array-like of shape (n_samples, n_features_) Training array that must be encoded unless encode_data is set to True y : array-like of shape (n_samples,) Label of the class associated to each sample. index: {None,array-like of shape (X.shape[1])} Indicates where to insert each new feature, if it is None they are all appended at the very end. Returns ------- self : object """ check_is_fitted(self) if isinstance(y, pd.DataFrame): y = y.to_numpy() if self.encode_data: # y should be the same than the one that was first fitted for now ----> FUTURE IMPLEMENTATION y = self.class_encoder_.transform(y) X = self.feature_encoder_.add_features(X, transform=True, index=index) if isinstance(X, pd.DataFrame): X = X.to_numpy() check_X_y(X, y) if X.dtype != int: X = X.astype(int) if y.dtype != int: X = X.astype(int) self.n_features_ += X.shape[1] tables = _get_tables(X, y, self.n_classes_, self.alpha) new_smoothed_counts = tables[0] new_smoothed_log_counts = tables[1] new_feature_value_counts = tables[2] new_feature_value_counts_per_element = tables[3] new_feature_unique_values_count_ = tables[4] new_feature_contribution = compute_total_probability_( self.class_count_, new_feature_unique_values_count_, self.alpha) if index: sort_index = np.argsort(index) index_with_column = list(enumerate(index)) for i in sort_index: column, list_insert_index = index_with_column[i] self.feature_values_count_per_element_.insert( list_insert_index, new_feature_value_counts_per_element[column]) self.feature_values_count_ = np.insert( self.feature_values_count_, list_insert_index, new_feature_value_counts[column]) self.smoothed_counts_.insert(list_insert_index, new_smoothed_counts[column]) self.smoothed_log_counts_.insert( list_insert_index, new_smoothed_log_counts[column]) self.feature_unique_values_count_ = np.insert( self.feature_unique_values_count_, list_insert_index, new_feature_unique_values_count_[column]) else: self.feature_values_count_per_element_.extend( new_feature_value_counts_per_element) self.feature_values_count_ = np.concatenate( [self.feature_values_count_, new_feature_value_counts]) self.smoothed_counts_.extend(new_smoothed_counts) self.smoothed_log_counts_.extend(new_smoothed_log_counts) self.feature_unique_values_count_ = np.concatenate([ self.feature_unique_values_count_, new_feature_unique_values_count_ ]) self.total_probability_ += new_feature_contribution self.indepent_term_ -= new_feature_contribution return self def remove_feature(self, index): """Updates classifierby removing one feature (index)""" check_is_fitted(self) if self.n_features_ <= 1: raise Exception("Cannot remove only feature from classifier") if not 0 <= index < self.n_features_: raise Exception( f"Feature index not valid, expected index between 0 and {self.n_features_}" ) self.n_features_ -= 1 feature_contribution = self.class_count_ + self.alpha * self.feature_unique_values_count_[ index] feature_contribution = np.log(feature_contribution) self.total_probability_ -= feature_contribution self.indepent_term_ += feature_contribution self.feature_unique_values_count_ = np.delete( self.feature_unique_values_count_, index) self.feature_values_count_ = np.delete(self.feature_values_count_, index) del self.feature_values_count_per_element_[index] del self.smoothed_counts_[index] del self.smoothed_log_counts_[index] if self.encode_data: self.feature_encoder_.remove_feature(index) return self def score(self, X: np.ndarray, y: np.ndarray): """Computes the accuracy Parameters ---------- X : array-like of shape (n_samples, n_features_) Training array that must be encoded unless encode_data is set to True y : array-like of shape (n_samples,) Label of the class associated to each sample. Returns ------- score : float Percentage of correctly classified instances """ y_pred = self.predict(X) return self.scorer(y, y_pred)