Пример #1
0
def test_custom_ordinal_time_comparison(X=None, iterations=10, verbose=1):
    """Compare the mean round-trip time of two ordinal encoders.

    On every iteration a full ``fit`` -> ``transform`` -> ``inverse_transform``
    cycle is timed, first for ``CustomOrdinalFeatureEncoder`` and then for
    ``OrdinalEncoder``, both on the same data.

    Parameters
    ----------
    X : array-like or None, default=None
        Data to encode. When ``None`` (or empty) a small 3x2 array of strings
        is used instead.

    iterations : int, default=10
        Number of timed cycles per encoder.

    verbose : {boolean, int}, default=1
        If truthy, the mean time of each encoder is printed.

    Returns
    -------
    custom_encoder_time, ordinal_encoder_time : float
        Mean elapsed time per cycle for each encoder, as measured by ``time()``.
    """
    # `not X` raises ValueError for multi-element arrays (ambiguous truth
    # value), so test for "no data" explicitly.
    if X is None or len(X) == 0:
        X = np.array([
            ["P", "+"],
            ["P2", "-"],
            ["P3", "-"],
        ])

    custom_encoder = CustomOrdinalFeatureEncoder()
    ordinal_encoder = OrdinalEncoder()

    ordinal_encoder_time = []
    custom_encoder_time = []
    for _ in range(iterations):
        # Custom encoder round trip
        ts = time()
        custom_encoder.fit(X)
        transformed = custom_encoder.transform(X)
        custom_encoder.inverse_transform(transformed)
        custom_encoder_time.append(time() - ts)

        # Reference encoder round trip
        ts = time()
        ordinal_encoder.fit(X)
        transformed = ordinal_encoder.transform(X)
        ordinal_encoder.inverse_transform(transformed)
        ordinal_encoder_time.append(time() - ts)

    # Collapse the per-iteration timings into their averages
    custom_encoder_time = np.mean(custom_encoder_time)
    ordinal_encoder_time = np.mean(ordinal_encoder_time)
    if verbose:
        print(f"CustomEncoder -> Time: {custom_encoder_time}")
        print(f"OrdinalEncoder -> Time: {ordinal_encoder_time}")
    return custom_encoder_time, ordinal_encoder_time
Пример #2
0
def acfs_score_comparison(datasets,
                          seed,
                          base_path,
                          params,
                          n_splits=3,
                          n_repeats=5,
                          n_intervals=5,
                          metric="accuracy",
                          send_email=False,
                          email_data=None,
                          verbose=True):
    """Benchmark ACFCS against a NaiveBayes baseline on several datasets.

    For every dataset a repeated stratified k-fold is run. On each split the
    data is encoded, the NaiveBayes baseline is fitted and scored once, and
    ACFCS is fitted and scored once per configuration in ``params``. The
    per-split measurements are then averaged per configuration.

    Parameters
    ----------
    datasets : iterable of (name, label) pairs
        Datasets to evaluate. ``name`` is appended to ``base_path`` to check
        that the dataset exists; both values are forwarded to
        ``get_X_y_from_database``.

    seed : int or None
        ``random_state`` of the ``RepeatedStratifiedKFold`` splitter.

    base_path : str
        Prefix that is concatenated with the dataset name (no separator is
        inserted).

    params : sequence of dict
        Configurations applied to ACFCS through ``set_params(**conf)``.

    n_splits, n_repeats : int, default=3 and 5
        Settings of the repeated stratified k-fold.

    n_intervals : int, default=5
        Forwarded to ``CustomOrdinalFeatureEncoder`` and ``NaiveBayes``.

    metric : str, default="accuracy"
        Forwarded to ``ACFCS`` and ``NaiveBayes``.

    send_email : bool, default=False
        If True the resulting table is passed to ``tfg.utils.send_results``.

    email_data : dict or None, default=None
        Forwarded to ``send_results``; ``None`` is replaced by an empty dict.

    verbose : bool, default=True
        If True a progress bar over the splits is shown and updated.

    Returns
    -------
    result : pandas.DataFrame
        One row per (dataset, configuration) with averaged metrics.
    """
    # Build the dict per call: a `dict()` default is created once and would be
    # shared by every call that relies on it.
    if email_data is None:
        email_data = {}

    # List to store results and column names for the csv
    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "ACFCS Score", "ACFCS Score STD", "Configuration", "Nodes",
        "Contruction Matrix", "Selection Matrix", "Selected_attributes",
        "Original"
    ]
    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifiers (reused across datasets and splits)
    acfcs = ACFCS(verbose=0, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    n_runs = n_splits * n_repeats

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue
        # Assume UCI REPO like data
        test_file = f"{name}.test.csv"
        data_file = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data_file, test_file,
                                     label)

        # Update progressbar
        dataset_tqdm.set_postfix({"DATABASE": name})

        # One row per configuration, one column per split
        stats_shape = (len(params), n_runs)
        nb_score = np.zeros(shape=stats_shape)
        acfcs_score = np.zeros(shape=stats_shape)
        acfcs_selection_matrix = np.zeros(shape=stats_shape)
        acfcs_construction_matrix = np.zeros(shape=stats_shape)
        acfcs_nodes = np.zeros(shape=stats_shape)
        acfcs_dummy = np.zeros(shape=stats_shape)
        acfcs_selected = np.zeros(shape=stats_shape)

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_runs,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'
                         ) if verbose else rskf.split(X, y)

        # Execute experiments
        for i, (train_index, test_index) in enumerate(seed_tqdm):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data (encoders are fitted on the training part only)
            feature_encoder = CustomOrdinalFeatureEncoder(
                n_intervals=n_intervals)
            X_train = feature_encoder.fit_transform(X_train)
            X_test = feature_encoder.transform(X_test)
            label_encoder = CustomLabelEncoder()
            y_train = label_encoder.fit_transform(y_train)
            y_test = label_encoder.transform(y_test)

            # Assess the classifiers reusing info to speed up evaluation
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)
            acfcs.reset_cache()
            for conf_index, conf in enumerate(params):
                acfcs.set_params(**conf)
                # The graph is only initialised for the first configuration
                # of each split
                acfcs.fit(X_train, y_train, init_graph=conf_index == 0)

                # score
                acfcs_score_conf = acfcs.score(X_test, y_test)
                if verbose:
                    seed_tqdm.set_postfix({
                        "config": conf_index,
                        "nb_score": naive_bayes_score,
                        "ant_score": acfcs_score_conf
                    })

                # Get data
                n_original_features = sum(
                    isinstance(feature, DummyFeatureConstructor)
                    for feature in acfcs.best_features)
                n_selected = len(acfcs.best_features)
                selection_matrix = len(acfcs.afg.pheromone_selection)
                construction_matrix = len(acfcs.afg.pheromone_construction)
                nodes = len(acfcs.afg.nodes)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                acfcs_score[conf_index, i] = acfcs_score_conf
                acfcs_selection_matrix[conf_index, i] = selection_matrix
                acfcs_construction_matrix[conf_index, i] = construction_matrix
                acfcs_nodes[conf_index, i] = nodes
                acfcs_dummy[conf_index, i] = n_original_features
                acfcs_selected[conf_index, i] = n_selected

        # Insert the final result - averaged metrics for this database.
        for conf_index, conf in enumerate(params):
            row = [
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(acfcs_score[conf_index]),
                np.std(acfcs_score[conf_index]), conf,
                np.mean(acfcs_nodes[conf_index]),
                np.mean(acfcs_construction_matrix[conf_index]),
                np.mean(acfcs_selection_matrix[conf_index]),
                np.mean(acfcs_selected[conf_index]),
                np.mean(acfcs_dummy[conf_index])
            ]
            result.append(row)
    result = pd.DataFrame(result, columns=columns)

    if send_email:
        from tfg.utils import send_results
        send_results("ACFCS", email_data, result)
    return result
Пример #3
0
class RankerLogicalFeatureConstructor(TransformerMixin, ClassifierMixin,
                                      BaseEstimator):
    """First proposal: Hybrid-Ranker Wrapper.

    Build a ranking based on Symmetrical Uncertainty (SU) of every possible logical feature of depth 1
    (1 operator, 2 operands), using XOR, AND and OR operator. The steps are:
        - Find out combinations of values in database of every pair of features Xi, Xj:
            - Example:
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a'),(2,'b'),(3,'c'),(2,'a')]
        - Apply operator to every combination:
            - Example:
                - Xi = [1,2,3,2]
                - Xj = ['a','b','c','a']
                Possible combinations:
                    [(1,'a','AND'),(2,'b','AND'),(3,'c','AND'),(2,'a','AND'),
                    (1,'a','OR'),(2,'b','OR'),(3,'c','OR'),(2,'a','OR'),
                    (1,'a','XOR'),(2,'b','XOR'),(3,'c','XOR'),(2,'a','XOR')]
        - Add original variables to the list
        - Evaluate SU for every value in the list, and rank them
        - Go over the list following one of the two strategies proposed and evaluate
          the subset based on a leave-one-out cross-validation with the NaiveBayes classifier.

    Parameters
    ----------
    strategy : str {eager,skip}, default="eager"
        After the ranking is built, if the eager strategy is chosen we stop considering attributes
        once more than `max_err` consecutive iterations bring no improvement. With the skip
        strategy non-improving blocks are discarded and the search continues.

    block_size : int, default=10
        Number of features that are added in each iteration (values below 1 are raised to 1
        by the constructor).

    encode_data : boolean, default=True
        Whether or not to encode the received data. If set to false the classifier
        expects data to be encoded with an ordinal encoder.

    n_intervals : int, default=5
        Forwarded to CustomOrdinalFeatureEncoder and to the NaiveBayes classifiers.

    verbose : {boolean,int}, default=0
        If set to true it displays information of the remaining time
        and inside variables.

    operators : array-like, default = ("AND","OR","XOR")
        Operators used for the constructed features.

    max_features : int, default = inf
        Maximum number of features to include in the selected subset

    max_iterations : int, default = inf
        Maximum number of iterations in the wrapper step.

    metric : str, default="accuracy"
        Forwarded to the NaiveBayes classifiers.

    use_initials: bool, default = False
        Force the set of initial features in the final solution. The set is trimmed with a backward elimination beforehand.

    max_err : int, default=0
        Number of consecutive non-improving iterations tolerated by the eager strategy.

    prune : int or None, default=None
        If not None (and use_graph is False), pairs of features (including each feature paired
        with itself) are ranked by their joint SU with the class and only the `prune` best
        pairs are used to construct features.

    use_graph : bool, default = False
        Generate Ranking from features obtained from the pruned-graph of the ACO algorithm.
        (Experimentation not carried out)

    Attributes
    ----------
    feature_encoder_ : CustomOrdinalFeatureEncoder or None
        Encodes data in ordinal way with unseen values handling if encode_data is set to True.

    class_encoder_ : LabelEncoder or None
        Encodes Data in ordinal way for the class if encode_data is set to True.

    all_feature_constructors: array-like
        List of FeatureConstructor objects with all the possible logical
        features

    symmetrical_uncertainty_rank: array-like
        SU for every feature in all_feature_constructors

    rank : array-like
        Array of indexes corresponding to the sorted SU rank (in descending order).

    final_feature_constructors:
        Selected feature subset (list of constructors)

    classifier: NaiveBayes
        Classifier used in the wrapper and to perform predictions after fitting.

    """
    def __init__(self,
                 strategy="eager",
                 block_size=10,
                 encode_data=True,
                 n_intervals=5,
                 verbose=0,
                 operators=("AND", "OR", "XOR"),
                 max_features=float("inf"),
                 max_iterations=float("inf"),
                 metric="accuracy",
                 use_initials=False,
                 max_err=0,
                 prune=None,
                 use_graph=False):
        self.strategy = strategy
        self.block_size = max(block_size, 1)
        self.encode_data = encode_data
        self.verbose = verbose
        self.operators = operators
        self.max_features = max_features
        self.max_iterations = max_iterations
        self.n_intervals = n_intervals
        self.metric = metric
        self.max_err = max_err
        self.use_initials = use_initials
        self.prune = prune
        self.use_graph = use_graph

        allowed_strategies = ("eager", "skip")
        if self.strategy not in allowed_strategies:
            raise ValueError("Unknown strategy type: %s, expected one of %s." %
                             (self.strategy, allowed_strategies))

    def fit(self, X, y):
        """Build the SU ranking of features and run the wrapper selection.

        Parameters
        ----------
        X : array-like or pandas.DataFrame
            Training data.

        y : array-like or pandas.DataFrame
            Class values.

        Returns
        -------
        self
        """
        # Parse input
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)

        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)

        # Reset the stored results for new fit
        self.reset_evaluation()

        # Generate rank
        if self.use_graph:
            # Construct the minimum graph and create rank
            # NOTE(review): the operators are hard-coded here and
            # `self.operators` is not used - confirm this is intended.
            graph = AntFeatureGraphMI(seed=None, connections=1).compute_graph(
                X, y, ("AND", "OR", "XOR"))
            self.all_feature_constructors = graph.get_rank()
        elif self.prune is not None:
            # Construct the rank with pruning by selecting pairs that maximise SU(X_iX_j,Y)
            feature_combinations = list(
                combinations(list(range(X.shape[1])),
                             2)) + [(i, i) for i in range(X.shape[1])]
            rank_pairs = [
                symmetrical_uncertainty_two_variables(X[:, i], X[:, j], y)
                for i, j in feature_combinations
            ]
            rank_pairs_index = np.argsort(rank_pairs)[::-1]

            # Create the unsorted list
            self.all_feature_constructors = []
            for index in rank_pairs_index[:self.prune]:
                i, j = feature_combinations[index]
                if i == j:
                    # A feature paired with itself: OR over every pair of
                    # its distinct values
                    from tfg.feature_construction import create_feature
                    self.all_feature_constructors.extend([
                        create_feature("OR", [(i, n), (i, m)])
                        for n, m in combinations(np.unique(X[:, i]), 2)
                    ])
                else:
                    self.all_feature_constructors.extend(
                        construct_features(X[:, [i, j]],
                                           operators=self.operators,
                                           same_feature=False))
        else:
            # Create the unsorted list of all features
            self.all_feature_constructors = construct_features(
                X, operators=self.operators)
        if self.verbose:
            print(
                f"Total number of constructed features: {len(self.all_feature_constructors)}"
            )
        # The original variables compete in the ranking as well
        self.all_feature_constructors.extend(
            [DummyFeatureConstructor(j) for j in range(X.shape[1])])
        self.symmetrical_uncertainty_rank = []

        # Compute the SU of every candidate feature with the class
        for feature_constructor in self.all_feature_constructors:
            feature = feature_constructor.transform(X)
            su = symmetrical_uncertainty(f1=feature.flatten(), f2=y)
            self.symmetrical_uncertainty_rank.append(su)

        # Store the descending order index
        self.rank = np.argsort(self.symmetrical_uncertainty_rank)[::-1]

        # If the initial variables are forced into the solution, trim them
        # first with a backward search
        if self.use_initials:
            classifier = NaiveBayes(encode_data=False,
                                    n_intervals=self.n_intervals,
                                    metric=self.metric)
            classifier.fit(X, y)
            current_features = [
                DummyFeatureConstructor(j) for j in range(X.shape[1])
            ]

            # Store the backward result to reuse it for other executions
            self.initial_backward_features = backward_search(
                X, y, current_features, classifier)

        # Feature Subset Selection (FSS) from the rank
        self.filter_features(X, y)
        return self

    def predict(self, X):
        """Predict classes for X, decoding them when encode_data is True."""
        X, _ = self.transform(X)
        if self.encode_data:
            return self.class_encoder_.inverse_transform(
                self.classifier.predict(X))
        return self.classifier.predict(X)

    def reset_evaluation(self):
        """Replace the memoized leave-one-out evaluator with a fresh one."""
        # Reset the memoize evaluations
        self.evaluate_leave_one_out_cross_val = memoize(evaluate_leave_one_out)

    def predict_proba(self, X):
        """Return the classifier's class probabilities for the transformed X."""
        X, _ = self.transform(X)
        return self.classifier.predict_proba(X)

    def score(self, X, y):
        """Transform (and, if configured, encode) X and y and score the classifier."""
        X, y = self.transform(X, y)
        return self.classifier.score(X, y)

    def filter_features(self, X, y):
        '''After the rank is built this performs the greedy wrapper search.

        Features are taken from the rank in blocks of `block_size`; a block is
        kept only if it improves the leave-one-out score of the NaiveBayes
        classifier. The selected constructors are stored in
        `final_feature_constructors`. X and y are expected to be already
        encoded.
        '''
        # NOTE(review): check_is_fitted looks for attributes with a trailing
        # underscore; fit() only sets such attributes when encode_data is True
        # - confirm the encode_data=False path on a fresh instance.
        check_is_fitted(self)
        self.classifier = NaiveBayes(encode_data=False,
                                     n_intervals=self.n_intervals,
                                     metric=self.metric)
        # `np.NINF` was removed in NumPy 2.0; `-np.inf` works on every version
        current_score = -np.inf
        current_features = []
        current_data = None
        if self.use_initials:
            # Original Features have already been taken into account
            rank_iter = filter(
                lambda x: not isinstance(self.all_feature_constructors[x],
                                         DummyFeatureConstructor),
                iter(self.rank))

            # Deep copy to avoid issues when modifying the list
            current_features = deepcopy(self.initial_backward_features)
            current_data = np.concatenate(
                [f.transform(X) for f in current_features], axis=1)

            # Get initial LOO score
            current_score = self.evaluate_leave_one_out_cross_val(
                self.classifier, current_features, current_data, y, fit=True)
        else:
            # Iterator over the sorted list of indexes
            rank_iter = iter(self.rank)

        if self.verbose:
            progress_bar = tqdm(total=len(self.rank),
                                bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        iteration = 0
        iterations_without_improvements = 0

        # Loop for including {block size} elements at a time
        # Rank is an iterator, so the for loop is not sequential!
        for feature_constructor_index in rank_iter:
            iteration += 1
            if self.verbose:
                progress_bar.set_postfix({
                    "n_features": len(current_features),
                    "score": current_score
                })
                progress_bar.update(1)
                progress_bar.refresh()

            # Add block size features
            new_X = [
                self.all_feature_constructors[feature_constructor_index].
                transform(X)
            ]
            selected_features = [
                self.all_feature_constructors[feature_constructor_index]
            ]
            for _ in range(self.block_size - 1):
                try:
                    index = next(rank_iter)
                except StopIteration:
                    # Block size does not divide the number of elements in
                    # the rank: the block is left incomplete. Only the
                    # exhaustion of the rank is handled here so that real
                    # errors are not silenced.
                    break
                selected_features.append(self.all_feature_constructors[index])
                new_X.append(
                    self.all_feature_constructors[index].transform(X))
                if self.verbose:
                    progress_bar.update(1)
                    progress_bar.refresh()

            # Evaluate features
            new_X = np.concatenate(new_X, axis=1)
            if iteration == 1 and not self.use_initials:
                # The first block is always accepted and sets the baseline
                current_data = new_X
                current_score = self.evaluate_leave_one_out_cross_val(
                    self.classifier,
                    selected_features,
                    current_data,
                    y,
                    fit=True)
                current_features = selected_features
                if self.max_iterations <= iteration or (
                        len(current_features) +
                        self.block_size) > self.max_features:
                    break
                continue
            data = np.concatenate([current_data, new_X], axis=1)
            self.classifier.add_features(new_X, y)
            # LOO evaluation
            score = self.evaluate_leave_one_out_cross_val(self.classifier,
                                                          current_features +
                                                          selected_features,
                                                          data,
                                                          y,
                                                          fit=False)
            if score > current_score:
                current_score = score
                current_data = data
                current_features.extend(selected_features)
                iterations_without_improvements = 0
            else:
                iterations_without_improvements += 1
                # Remove last added block (from the last column backwards)
                for feature_index_to_remove in range(
                        data.shape[1], data.shape[1] - new_X.shape[1], -1):
                    self.classifier.remove_feature(feature_index_to_remove - 1)
                if self.strategy == "eager" and self.max_err < iterations_without_improvements:
                    # Stops once more than max_err consecutive blocks fail
                    # to improve the score
                    break

            if self.max_iterations <= iteration or (
                    len(current_features) +
                    self.block_size) > self.max_features:
                break
        if self.verbose:
            progress_bar.close()
            print(
                f"\nFinal number of included features: {len(current_features)} - Final Score: {current_score}"
            )
        self.final_feature_constructors = current_features
        return self

    def transform(self, X, y=None):
        """Apply the selected feature constructors to X.

        Returns
        -------
        (new_X, y) : tuple
            The column-wise concatenation of every selected feature, and `y`
            (encoded when encode_data is True and `y` is given).
        """
        check_is_fitted(self)
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
            if y is not None:
                y = self.class_encoder_.transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        new_X = [
            feature_constructor.transform(X)
            for feature_constructor in self.final_feature_constructors
        ]
        return np.concatenate(new_X, axis=1), y
Пример #4
0
def ranker_score_comparison(datasets,
                            seed,
                            base_path,
                            params,
                            n_splits=3,
                            n_repeats=5,
                            n_intervals=5,
                            metric="accuracy",
                            send_email=False,
                            email_data=None,
                            share_rank=True,
                            verbose=True):
    """Benchmark RankerLogicalFeatureConstructor against NaiveBayes.

    For every dataset a repeated stratified k-fold is run. On each split the
    data is encoded, the NaiveBayes baseline is fitted and scored once, and
    the ranker is evaluated once per configuration in ``params``. The
    per-split measurements are then averaged per configuration.

    Parameters
    ----------
    datasets : iterable of (name, label) pairs
        Datasets to evaluate. ``name`` is appended to ``base_path`` to check
        that the dataset exists; both values are forwarded to
        ``get_X_y_from_database``.

    seed : int or None
        ``random_state`` of the ``RepeatedStratifiedKFold`` splitter.

    base_path : str
        Prefix that is concatenated with the dataset name (no separator is
        inserted).

    params : sequence of dict
        Configurations applied to the ranker through ``set_params(**conf)``.

    n_splits, n_repeats : int, default=3 and 5
        Settings of the repeated stratified k-fold.

    n_intervals : int, default=5
        Forwarded to the encoder, the ranker and ``NaiveBayes``.

    metric : str, default="accuracy"
        Forwarded to the ranker and ``NaiveBayes``.

    send_email : bool, default=False
        If True the resulting table is passed to ``tfg.utils.send_results``.

    email_data : dict or None, default=None
        Forwarded to ``send_results``; ``None`` is replaced by an empty dict.

    share_rank : bool, default=True
        If True the rank is only computed (``fit``) for the first
        configuration of each split; later configurations only rerun
        ``filter_features`` on that rank.

    verbose : bool, default=True
        If True a progress bar over the splits is shown and updated.

    Returns
    -------
    result : pandas.DataFrame
        One row per (dataset, configuration) with averaged metrics.
    """
    # Build the dict per call: a `dict()` default is created once and would be
    # shared by every call that relies on it.
    if email_data is None:
        email_data = {}

    result = []
    columns = ["Database",
               "Number of attributes",
               "NBScore",
               "NBScore STD",
               "Ranker Score",
               "Ranker Score STD",
               "Configuration",
               "Combinations",
               "Selected_attributes",
               "Original"]

    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifiers (reused across datasets and splits)
    r = RankerLogicalFeatureConstructor(n_intervals=n_intervals, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    n_runs = n_splits * n_repeats

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue
        # Assume UCI REPO like data
        test_file = f"{name}.test.csv"
        data_file = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data_file, test_file,
                                     label)

        dataset_tqdm.set_postfix({"DATABASE": name})

        # One row per configuration, one column per split
        stats_shape = (len(params), n_runs)
        nb_score = np.zeros(shape=stats_shape)
        r_score = np.zeros(shape=stats_shape)
        r_combinations = np.zeros(shape=stats_shape)
        r_selected = np.zeros(shape=stats_shape)
        r_dummy = np.zeros(shape=stats_shape)

        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_runs,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'
                         ) if verbose else rskf.split(X, y)

        for i, (train_index, test_index) in enumerate(seed_tqdm):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data (encoders are fitted on the training part only)
            feature_encoder = CustomOrdinalFeatureEncoder(
                n_intervals=n_intervals)
            X_train = feature_encoder.fit_transform(X_train)
            X_test = feature_encoder.transform(X_test)
            label_encoder = CustomLabelEncoder()
            y_train = label_encoder.fit_transform(y_train)
            y_test = label_encoder.transform(y_test)

            # Assess the classifiers
            nb.fit(X=X_train, y=y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            for conf_index, conf in enumerate(params):
                if verbose:
                    seed_tqdm.set_postfix({"config": conf_index})
                r.set_params(**conf)
                # Fit
                if conf_index == 0 or not share_rank:
                    # The rank is computed from scratch
                    r.fit(X_train, y_train)
                else:
                    # Reuse the rank of the first configuration and only
                    # redo the wrapper search, on data encoded by the
                    # ranker's own encoders (as fit() does)
                    r.filter_features(r.feature_encoder_.transform(
                        X_train), r.class_encoder_.transform(y_train))

                # score
                ranker_score = r.score(X_test, y_test)

                # Get data
                n_original_features = sum(
                    isinstance(feature, DummyFeatureConstructor)
                    for feature in r.final_feature_constructors)
                n_combinations = len(r.all_feature_constructors)
                n_selected = len(r.final_feature_constructors)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                r_score[conf_index, i] = ranker_score
                r_combinations[conf_index, i] = n_combinations
                r_selected[conf_index, i] = n_selected
                r_dummy[conf_index, i] = n_original_features

        # Insert to final result averaged metrics for this dataset
        for conf_index, conf in enumerate(params):
            row = [name,
                   X.shape[1],
                   np.mean(nb_score[conf_index]),
                   np.std(nb_score[conf_index]),
                   np.mean(r_score[conf_index]),
                   np.std(r_score[conf_index]),
                   conf,
                   np.mean(r_combinations[conf_index]),
                   np.mean(r_selected[conf_index]),
                   np.mean(r_dummy[conf_index])]
            result.append(row)
    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results("RANKER", email_data, result)
    return result
Пример #5
0
def genetic_score_comparison(datasets,
                             seed,
                             base_path,
                             params,
                             n_splits=3,
                             n_repeats=5,
                             n_intervals=5,
                             metric="accuracy",
                             send_email=False,
                             email_data=None,
                             verbose=True,
                             version=1):
    """Benchmark a genetic-programming classifier against NaiveBayes.

    For every dataset a repeated stratified k-fold is run. On each split the
    data is encoded, the NaiveBayes baseline is fitted and scored once, and
    the genetic classifier is fitted and scored once per configuration in
    ``params``. The per-split measurements are then averaged per
    configuration.

    Parameters
    ----------
    datasets : iterable of (name, label) pairs
        Datasets to evaluate. ``name`` is appended to ``base_path`` to check
        that the dataset exists; both values are forwarded to
        ``get_X_y_from_database``.

    seed : int or None
        Seed of the genetic classifier and ``random_state`` of the
        ``RepeatedStratifiedKFold`` splitter.

    base_path : str
        Prefix that is concatenated with the dataset name (no separator is
        inserted).

    params : sequence of dict
        Configurations applied to the classifier through
        ``set_params(**conf)``.

    n_splits, n_repeats : int, default=3 and 5
        Settings of the repeated stratified k-fold.

    n_intervals : int, default=5
        Forwarded to ``CustomOrdinalFeatureEncoder`` and ``NaiveBayes``.

    metric : str, default="accuracy"
        Forwarded to the genetic classifier and ``NaiveBayes``.

    send_email : bool, default=False
        If True the resulting table is passed to ``tfg.utils.send_results``.

    email_data : dict or None, default=None
        Forwarded to ``send_results``; ``None`` is replaced by an empty dict.

    verbose : bool, default=True
        If True a progress bar over the splits is shown and updated.

    version : int, default=1
        1 or 2 select ``GeneticProgrammingFlexibleLogic``; any other value
        selects ``GeneticProgrammingRankMutation``.

    Returns
    -------
    result : pandas.DataFrame
        One row per (dataset, configuration) with averaged metrics.
    """
    # Build the dict per call: a `dict()` default is created once and would be
    # shared by every call that relies on it.
    if email_data is None:
        email_data = {}

    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "Genetic Score", "Genetic Score STD", "Configuration",
        "Selected_attributes", "Original"
    ]

    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifier
    if version in (1, 2):
        # Version with flexibility. Version 1 (no flexibility in the number
        # of attributes, bad performance) is mapped to the flexible
        # implementation as well.
        clf = GeneticProgrammingFlexibleLogic(seed=seed, metric=metric)
    else:
        # Guided mutation based on SU
        clf = GeneticProgrammingRankMutation(seed=seed, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    n_runs = n_splits * n_repeats

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue
        # Assume UCI REPO like data
        test_file = f"{name}.test.csv"
        data_file = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data_file, test_file,
                                     label)

        dataset_tqdm.set_postfix({"DATABASE": name})

        # One row per configuration, one column per split
        stats_shape = (len(params), n_runs)
        nb_score = np.zeros(shape=stats_shape)
        clf_score = np.zeros(shape=stats_shape)
        clf_selected = np.zeros(shape=stats_shape)
        clf_dummy = np.zeros(shape=stats_shape)

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_runs,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'
                         ) if verbose else rskf.split(X, y)

        # Execute experiments
        for i, (train_index, test_index) in enumerate(seed_tqdm):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data (encoders are fitted on the training part only)
            feature_encoder = CustomOrdinalFeatureEncoder(
                n_intervals=n_intervals)
            X_train = feature_encoder.fit_transform(X_train)
            X_test = feature_encoder.transform(X_test)
            label_encoder = CustomLabelEncoder()
            y_train = label_encoder.fit_transform(y_train)
            y_test = label_encoder.transform(y_test)

            # Assess the classifiers reusing info to speed up evaluation
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            # Reset evaluation-cache for new split
            clf.reset_evaluation()
            for conf_index, conf in enumerate(params):
                if verbose:
                    seed_tqdm.set_postfix({"config": conf_index})
                clf.set_params(**conf)
                clf.fit(X_train, y_train)

                # score
                genetic_score = clf.score(X_test, y_test)

                # Get data
                n_original_features = sum(
                    isinstance(feature, DummyFeatureConstructor)
                    for feature in clf.best_features)
                n_selected = len(clf.best_features)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                clf_score[conf_index, i] = genetic_score
                clf_selected[conf_index, i] = n_selected
                clf_dummy[conf_index, i] = n_original_features

        # Insert to final result averaged metrics for this database
        for conf_index, conf in enumerate(params):
            row = [
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(clf_score[conf_index]),
                np.std(clf_score[conf_index]), conf,
                np.mean(clf_selected[conf_index]),
                np.mean(clf_dummy[conf_index])
            ]
            result.append(row)

    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results(f"GENETIC_{version}", email_data, result)
    return result
Пример #6
0
class NaiveBayes(ClassifierMixin, BaseEstimator):
    """A Naive Bayes classifier for ordinally-encoded categorical data.

    Simple NaiveBayes classifier accepting non-encoded input, enhanced with
    numba, using MAP to predict the most likely class.

    Parameters
    ----------
    alpha : {float, array-like}, default=1.0
        Additive (Laplace/Lidstone) smoothing parameter
        (0 for no smoothing). If it is an array it is
        expected to have the same size as the number of attributes.

    encode_data : bool, default=True
        When True, features are encoded with a CustomOrdinalFeatureEncoder
        and labels with a CustomLabelEncoder before fitting/predicting.
        When False, the input is expected to be already ordinally encoded.

    n_intervals : int or None, default=5
        Number of intervals forwarded to the feature encoder for
        discretizing numerical data.

    discretize : bool, default=True
        Forwarded to the feature encoder: discretize numerical data.

    metric : str, default="accuracy"
        Name handed to ``get_scorer``; the resulting callable is used by
        ``score`` and ``leave_one_out_cross_val`` as ``scorer(y_true, y_pred)``.

    Attributes
    ----------
    feature_encoder_ : CustomOrdinalFeatureEncoder
        Feature encoder; only set when ``encode_data`` is True.

    class_encoder_ : CustomLabelEncoder
        Label encoder; only set when ``encode_data`` is True.

    scorer : callable
        Scoring function built from ``metric``.

    n_samples_ : int
        Number of samples seen in ``fit``.

    n_features_ : int
        Number of features currently held by the classifier.

    n_classes_ : int
        Number of classes, computed as ``1 + max(y)`` (ordinal encoding
        is assumed).

    class_count_ : array-like of shape (n_classes_,)
        Array where `class_count_[i]` contains the count of the ith class.

    class_log_count_ : array-like of shape (n_classes_,)
        Array where `class_log_count_[i]` contains the log count of the
        ith class.

    class_count_smoothed_ : array-like of shape (n_classes_,)
        ``class_count_ + alpha``.

    class_log_count_smoothed_ : array-like of shape (n_classes_,)
        Logarithm of ``class_count_smoothed_``.

    smoothed_counts_ : list
        `smoothed_counts_[j][v][c]` is the smoothed count of value `v` of
        the jth feature for class `c`, as returned by ``_get_tables``.

    smoothed_log_counts_ : list
        Logarithmic counterpart of ``smoothed_counts_``.

    feature_values_count_per_element_ : list
        `feature_values_count_per_element_[j][v]` contains the count of the
        value `v` for the jth feature. Assuming ordinal encoding, some
        values might be equal to 0.

    feature_values_count_ : array-like of shape (n_features_,)
        Array where `feature_values_count_[j]` is the number of possible
        values for the jth feature.

    feature_unique_values_count_ : array-like of shape (n_features_,)
        Array where `feature_unique_values_count_[j]` is the number of
        unique values seen for the jth feature at fitting time. This is
        needed to compute the smoothing.

    total_probability_ : array-like of shape (n_classes_,)
        Smoothing normaliser produced by ``compute_total_probability_``
        from ``class_count_``, ``feature_unique_values_count_`` and
        ``alpha``.

    indepent_term_ : array-like of shape (n_classes_,)
        Term independent of the sample, computed at fitting time as
        ``class_log_count_smoothed_ - total_probability_``. It combines the
        smoothed prior with the smoothing normaliser.
    """
    def __init__(self,
                 alpha=1.0,
                 encode_data=True,
                 n_intervals=5,
                 discretize=True,
                 metric="accuracy"):
        self.alpha = alpha
        self.encode_data = encode_data
        self.n_intervals = n_intervals
        self.discretize = discretize
        self.metric = metric
        self._get_scorer()
        super().__init__()

    def _get_scorer(self):
        """Builds ``self.scorer`` from the current ``self.metric``."""
        base_scorer = get_scorer(self.metric)
        if self.metric == "f1_score":
            # Unseen values for target class may cause errors, so the
            # macro average with zero_division=0 is forced.
            self.scorer = lambda y_true, y_pred: base_scorer(
                y_true=y_true, y_pred=y_pred, average="macro", zero_division=0)
        else:
            self.scorer = base_scorer

    def set_params(self, **params):
        """Sets estimator parameters and refreshes the scorer.

        Returns
        -------
        self : object
            The estimator itself, so calls can be chained.
        """
        super().set_params(**params)
        # The metric may have changed, so the scorer must be rebuilt.
        self._get_scorer()
        return self

    def _compute_independent_terms(self):
        """Computes the terms that are independent of the prediction"""
        self.total_probability_ = compute_total_probability_(
            self.class_count_, self.feature_unique_values_count_, self.alpha)
        # self.total_probability_ = compute_total_probability_(self.class_count_,self.feature_values_count_,self.alpha) #-->scikit uses this
        self.indepent_term_ = self.class_log_count_smoothed_ - self.total_probability_

    def _compute_class_counts(self, X: np.ndarray, y: np.ndarray):
        """Computes the counts for the priors"""
        self.n_classes_ = 1 + np.max(y)
        self.class_count_ = np.bincount(y)
        self.class_log_count_ = np.log(self.class_count_)
        self.class_count_smoothed_ = (self.class_count_ + self.alpha)
        self.class_log_count_smoothed_ = np.log(self.class_count_smoothed_)

    def _compute_feature_counts(self, X: np.ndarray, y: np.ndarray):
        """Computes the conditional smoothed counts for each feature"""
        tables = _get_tables(X, y, self.n_classes_, self.alpha)
        self.smoothed_counts_ = tables[0]
        self.smoothed_log_counts_ = tables[1]
        self.feature_values_count_ = tables[2]
        self.feature_values_count_per_element_ = tables[3]
        self.feature_unique_values_count_ = tables[4]

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fits the classifier with training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
            Training array that must be encoded unless
            encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.

        Returns
        -------
        self : object
        """
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals, discretize=self.discretize)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)
        # The count tables are indexed by value, so integers are required.
        if X.dtype != int:
            X = X.astype(int)
        if y.dtype != int:
            y = y.astype(int)
        self.n_samples_, self.n_features_ = X.shape
        self._compute_class_counts(X, y)
        self._compute_feature_counts(X, y)
        self._compute_independent_terms()
        return self

    def predict(self, X: np.ndarray):
        """Predicts the label of the samples based on the MAP.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
           Samples to classify; must be encoded unless
           encode_data is set to True

        Returns
        -------
        y : array-like of shape (n_samples)
            Predicted label for each sample. Labels are decoded with
            ``class_encoder_`` when encode_data is True.
        """
        check_is_fitted(self)
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        if X.dtype != int:
            X = X.astype(int)
        check_array(X)
        log_probabilities = _predict(X, self.smoothed_log_counts_,
                                     self.feature_values_count_, self.alpha)
        log_probabilities += self.indepent_term_
        output = np.argmax(log_probabilities, axis=1)
        if self.encode_data:
            output = self.class_encoder_.inverse_transform(output)
        return output

    def predict_proba(self, X: np.ndarray):
        """Predicts the probability of each class for every sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
           Samples to evaluate; must be encoded unless
           encode_data is set to True

        Returns
        -------
        y : array-like of shape (n_samples, n_classes)
            Array where `y[i][j]` contains the normalised probability of
            the jth class for the ith sample.
        """
        check_is_fitted(self)
        # Same preprocessing order as ``predict``: encode first, then convert
        # DataFrames, and only then cast. Casting raw (non-encoded) values or
        # reading ``.dtype`` on a DataFrame before this point would fail.
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        if X.dtype != int:
            X = X.astype(int)
        log_probabilities = _predict(X, self.smoothed_log_counts_,
                                     self.feature_values_count_, self.alpha)
        log_probabilities += self.indepent_term_
        # Normalise each row in log space before exponentiating.
        log_prob_x = logsumexp(log_probabilities, axis=1)
        return np.exp(log_probabilities - np.atleast_2d(log_prob_x).T)

    def leave_one_out_cross_val(self, X, y, fit=True):
        """Efficient leave-one-out evaluation reusing the fitted counts.

        Instead of refitting once per sample, the contribution of each
        sample is subtracted from the stored counts before scoring it.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
            Samples; must be encoded unless encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.

        fit : bool, default=True
            Fit the classifier on (X, y) before evaluating.

        Returns
        -------
        score : value returned by ``self.scorer(y, y_pred)``
        """
        if fit:
            self.fit(X, y)
        if self.encode_data:
            X = self.feature_encoder_.transform(X)
            y = self.class_encoder_.transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()

        if X.dtype != int:
            X = X.astype(int)
        # Labels index the count tables below, so they must be integers.
        if y.dtype != int:
            y = y.astype(int)
        log_proba = np.zeros((X.shape[0], self.n_classes_))
        for i in range(X.shape[0]):
            example, label = X[i], y[i]
            # Prior without the held-out sample.
            class_count_ = self.class_count_.copy()
            class_count_[label] -= 1
            log_proba[i] = np.log(class_count_ + self.alpha)
            for j in range(X.shape[1]):
                # Conditional counts without the held-out sample; the
                # decremented count is floored at alpha.
                p = self.smoothed_log_counts_[j][example[j]].copy()
                p[label] = np.log(
                    np.max([
                        self.smoothed_counts_[j][example[j]][label] - 1,
                        self.alpha
                    ]))
                log_proba[i] += p
                if self.feature_values_count_per_element_[j][example[j]] == 1:
                    # The held-out sample was the only one with this value,
                    # so one fewer unique value is seen without it.
                    update_value = np.log(class_count_ + (
                        self.feature_unique_values_count_[j] - 1) * self.alpha)
                else:
                    update_value = np.log(
                        class_count_ +
                        (self.feature_unique_values_count_[j]) * self.alpha)
                # log(0) yields -inf; treat it as a zero contribution.
                log_proba[i] -= np.where(update_value == -np.inf, 0,
                                         update_value)
        y_pred = np.argmax(log_proba, axis=1)
        return self.scorer(y, y_pred)

    def add_features(self, X, y, index=None):
        """Updates classifier with new features

        Parameters
        ----------
        X : array-like of shape (n_samples, n_new_features)
           Array with the new features; must be encoded unless
           encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.

        index: {None,array-like of shape (X.shape[1])}
                Indicates where to insert each new feature, if it is None
                (or empty) they are all appended at the very end.

        Returns
        -------
        self : object
        """
        check_is_fitted(self)
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            # y should be the same than the one that was first fitted for now  ----> FUTURE IMPLEMENTATION
            y = self.class_encoder_.transform(y)
            X = self.feature_encoder_.add_features(X,
                                                   transform=True,
                                                   index=index)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)
        if X.dtype != int:
            X = X.astype(int)
        if y.dtype != int:
            y = y.astype(int)

        self.n_features_ += X.shape[1]
        tables = _get_tables(X, y, self.n_classes_, self.alpha)
        new_smoothed_counts = tables[0]
        new_smoothed_log_counts = tables[1]
        new_feature_value_counts = tables[2]
        new_feature_value_counts_per_element = tables[3]
        new_feature_unique_values_count_ = tables[4]
        new_feature_contribution = compute_total_probability_(
            self.class_count_, new_feature_unique_values_count_, self.alpha)
        # Explicit None/length test: the truth value of a NumPy array is
        # ambiguous (or depends on its single element), so ``if index:``
        # cannot be used for array-like indices.
        if index is not None and len(index) > 0:
            # Insert in ascending order of the requested positions.
            sort_index = np.argsort(index)
            index_with_column = list(enumerate(index))
            for i in sort_index:
                column, list_insert_index = index_with_column[i]
                self.feature_values_count_per_element_.insert(
                    list_insert_index,
                    new_feature_value_counts_per_element[column])
                self.feature_values_count_ = np.insert(
                    self.feature_values_count_, list_insert_index,
                    new_feature_value_counts[column])
                self.smoothed_counts_.insert(list_insert_index,
                                             new_smoothed_counts[column])
                self.smoothed_log_counts_.insert(
                    list_insert_index, new_smoothed_log_counts[column])
                self.feature_unique_values_count_ = np.insert(
                    self.feature_unique_values_count_, list_insert_index,
                    new_feature_unique_values_count_[column])
        else:
            self.feature_values_count_per_element_.extend(
                new_feature_value_counts_per_element)
            self.feature_values_count_ = np.concatenate(
                [self.feature_values_count_, new_feature_value_counts])
            self.smoothed_counts_.extend(new_smoothed_counts)
            self.smoothed_log_counts_.extend(new_smoothed_log_counts)
            self.feature_unique_values_count_ = np.concatenate([
                self.feature_unique_values_count_,
                new_feature_unique_values_count_
            ])

        # Account for the new features in the sample-independent terms.
        self.total_probability_ += new_feature_contribution
        self.indepent_term_ -= new_feature_contribution

        return self

    def remove_feature(self, index):
        """Updates classifier by removing one feature (index)

        Parameters
        ----------
        index : int
            Position of the feature to remove.

        Returns
        -------
        self : object

        Raises
        ------
        Exception
            If the classifier holds a single feature or the index is out
            of range.
        """
        check_is_fitted(self)
        if self.n_features_ <= 1:
            raise Exception("Cannot remove only feature from classifier")
        if not 0 <= index < self.n_features_:
            raise Exception(
                f"Feature index not valid, expected index between 0 and {self.n_features_}"
            )
        self.n_features_ -= 1

        # Undo this feature's share of the sample-independent terms.
        feature_contribution = self.class_count_ + self.alpha * self.feature_unique_values_count_[
            index]
        feature_contribution = np.log(feature_contribution)
        self.total_probability_ -= feature_contribution
        self.indepent_term_ += feature_contribution

        self.feature_unique_values_count_ = np.delete(
            self.feature_unique_values_count_, index)
        self.feature_values_count_ = np.delete(self.feature_values_count_,
                                               index)
        del self.feature_values_count_per_element_[index]
        del self.smoothed_counts_[index]
        del self.smoothed_log_counts_[index]

        if self.encode_data:
            self.feature_encoder_.remove_feature(index)
        return self

    def score(self, X: np.ndarray, y: np.ndarray):
        """Scores the predictions for X against y with the configured metric

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_)
           Samples to classify; must be encoded unless
           encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.

        Returns
        -------
        score : value returned by ``self.scorer(y, y_pred)``
                (accuracy with the default metric)
        """
        y_pred = self.predict(X)
        return self.scorer(y, y_pred)