Пример #1
0
    def fit(self, X, y):
        self.feature_encoder_ = CustomOrdinalFeatureEncoder()
        self.class_encoder_ = CustomLabelEncoder()

        if isinstance(X, pd.DataFrame):
            self.categories_ = X.columns
        if self.encode_data:
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)

        classifier_ = NaiveBayes(encode_data=False,
                                 n_intervals=self.n_intervals,
                                 metric=self.metric)
        self.n_features = X.shape[1]
        if self.encode_data:
            self.unique_values = [
                values.shape[0] for values in self.feature_encoder_.categories_
            ]
        else:
            self.unique_values = [
                np.unique(X[:, j]).shape[0] for j in range(X.shape[1])
            ]
        random.seed(self.seed)
        np.random.seed(self.seed)
        self.size = np.ceil(np.sqrt(X.shape[1]))
        best_individual = self.execute_algorithm(X, y)
        self.best_features = best_individual
        self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric)
        self.classifier_.fit(
            np.concatenate(
                [feature.transform(X) for feature in self.best_features],
                axis=1), y)
        return self
Пример #2
0
def test_custom_ordinal_time_comparison(X=None, iterations=10, verbose=1):
    if not X:
        X = np.array([
            ["P", "+"],
            ["P2", "-"],
            ["P3", "-"],
        ])

    custom_encoder = CustomOrdinalFeatureEncoder()
    ordinal_encoder = OrdinalEncoder()

    ordinal_encoder_time = []
    custom_encoder_time = []
    for i in range(iterations):
        ts = time()
        custom_encoder.fit(X)
        transformed = custom_encoder.transform(X)
        custom_encoder.inverse_transform(transformed)
        custom_encoder_time.append(time() - ts)

        ts = time()
        ordinal_encoder.fit(X)
        transformed = ordinal_encoder.transform(X)
        ordinal_encoder.inverse_transform(transformed)
        ordinal_encoder_time.append(time() - ts)
    custom_encoder_time = np.mean(custom_encoder_time)
    ordinal_encoder_time = np.mean(ordinal_encoder_time)
    if verbose:
        print(f"CustomEncoder -> Time: {custom_encoder_time}")
        print(f"OrdinalEncoder -> Time: {ordinal_encoder_time}")
    return custom_encoder_time, ordinal_encoder_time
Пример #3
0
def test_incremental_validation(X=None, y=None, iterations=10, verbose=1):
    if not X:
        X, y = make_classification(n_samples=500,
                                   n_features=1000,
                                   n_informative=20,
                                   n_redundant=1,
                                   n_repeated=0,
                                   n_classes=2,
                                   n_clusters_per_class=2,
                                   weights=None,
                                   class_sep=1,
                                   hypercube=False,
                                   scale=1.0,
                                   shuffle=True,
                                   random_state=0)
    X //= 10  # --> To be able to evaluate categoricalNB

    # classifiers
    nb_classifier = NaiveBayes(encode_data=True)
    nb_classifier_no_encoding = NaiveBayes(encode_data=False)
    custom_encoder = CustomOrdinalFeatureEncoder()
    cnb = CategoricalNB()

    # accumulators
    categorical_nb = []
    custom_nb_val_1 = []
    custom_nb_val_2 = []
    custom_nb_val_3 = []
    custom_nb_val_4 = []
    for i in range(iterations):
        if verbose:
            print(f"Iteration {i}")
        ts = time()
        X2 = custom_encoder.fit_transform(X)

        ts = time()
        score_2 = nb_classifier.leave_one_out_cross_val(X, y)
        custom_nb_val_1.append(time() - ts)

        ts = time()
        score_4 = cross_leave_one_out(nb_classifier, X, y)
        custom_nb_val_3.append(time() - ts)

        ts = time()
        X2 = custom_encoder.fit_transform(X)
        score_5 = cross_leave_one_out(nb_classifier_no_encoding, X2, y)
        custom_nb_val_4.append(time() - ts)

        if i == 0:
            score_1 = score_2
            scores = [score_1, score_2, score_4, score_5]
            assert all(score == scores[0] for score in scores)
    print("Categorical with scikit loo: ", np.mean(categorical_nb[1:]))
    print("Custom with scikit loo: ", np.mean(custom_nb_val_3[1:]))
    print("Custom with scikit loo (pre-encoding): ",
          np.mean(custom_nb_val_4[1:]))
    print("Custom with first incremental: ", np.mean(custom_nb_val_1[1:]))
Пример #4
0
    def fit(self, X: np.ndarray, y: np.ndarray):
        """ Fits the classifier with trainning data.

        Parameters
        ----------

        X : array-like of shape (n_samples, n_features_)
            Training array that must be encoded unless
            encode_data is set to True

        y : array-like of shape (n_samples,)
            Label of the class associated to each sample.

        Returns
        -------
        self : object
        """
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals, discretize=self.discretize)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)
        if X.dtype != int:
            X = X.astype(int)
        if y.dtype != int:
            y = y.astype(int)
        self.n_samples_, self.n_features_ = X.shape
        self._compute_class_counts(X, y)
        self._compute_feature_counts(X, y)
        self._compute_independent_terms()
        return self
Пример #5
0
def acfs_score_comparison(datasets,
                          seed,
                          base_path,
                          params,
                          n_splits=3,
                          n_repeats=5,
                          n_intervals=5,
                          metric="accuracy",
                          send_email=False,
                          email_data=dict(),
                          verbose=True):
    # List to store results and column names for the csv
    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "ACFCS Score", "ACFCS Score STD", "Configuration", "Nodes",
        "Contruction Matrix", "Selection Matrix", "Selected_attributes",
        "Original"
    ]
    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifier
    acfcs = ACFCS(verbose=0, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue
        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)

        # Update progressbar
        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results
        nb_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_selection_matrix = np.zeros(shape=(len(params),
                                                 n_splits * n_repeats))
        acfcs_construction_matrix = np.zeros(shape=(len(params),
                                                    n_splits * n_repeats))
        acfcs_nodes = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_dummy = np.zeros(shape=(len(params), n_splits * n_repeats))
        acfcs_selected = np.zeros(shape=(len(params), n_splits * n_repeats))

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits * n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'
                         ) if verbose else rskf.split(X, y)

        # Execute experiments
        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers reusing info to speed up evaluation
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)
            acfcs.reset_cache()
            for conf_index, conf in enumerate(params):
                acfcs.set_params(**conf)
                acfcs.fit(X_train, y_train, init_graph=conf_index == 0)

                # score
                acfcs_score_conf = acfcs.score(X_test, y_test)
                if verbose:
                    seed_tqdm.set_postfix({
                        "config": conf_index,
                        "nb_score": naive_bayes_score,
                        "ant_score": acfcs_score_conf
                    })

                # Get data
                n_original_features = len(
                    list(
                        filter(
                            lambda x: isinstance(x, DummyFeatureConstructor),
                            acfcs.best_features)))
                n_selected = len(acfcs.best_features)
                selection_matrix = len(acfcs.afg.pheromone_selection)
                construction_matrix = len(acfcs.afg.pheromone_construction)
                nodes = len(acfcs.afg.nodes)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                acfcs_score[conf_index, i] = acfcs_score_conf
                acfcs_selection_matrix[conf_index, i] = selection_matrix
                acfcs_construction_matrix[conf_index, i] = construction_matrix
                acfcs_nodes[conf_index, i] = nodes
                acfcs_dummy[conf_index, i] = n_original_features
                acfcs_selected[conf_index, i] = n_selected

        # Insert the final result - averaged metrics for this database.
        for conf_index, conf in enumerate(params):
            row = [
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(acfcs_score[conf_index]),
                np.std(acfcs_score[conf_index]), conf,
                np.mean(acfcs_nodes[conf_index]),
                np.mean(acfcs_construction_matrix[conf_index]),
                np.mean(acfcs_selection_matrix[conf_index]),
                np.mean(acfcs_selected[conf_index]),
                np.mean(acfcs_dummy[conf_index])
            ]
            result.append(row)
    result = pd.DataFrame(result, columns=columns)

    if send_email:
        from tfg.utils import send_results
        send_results("ACFCS", email_data, result)
    return result
Пример #6
0
    def fit(self, X, y):
        # Parse input
        if isinstance(y, pd.DataFrame):
            y = y.to_numpy()
        if self.encode_data:
            self.feature_encoder_ = CustomOrdinalFeatureEncoder(
                n_intervals=self.n_intervals)
            self.class_encoder_ = CustomLabelEncoder()
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)

        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        check_X_y(X, y)

        # Reset the stored results for new fit
        self.reset_evaluation()

        # Generate rank
        if self.use_graph:
            # Construct the minimum graph and create rank
            graph = AntFeatureGraphMI(seed=None, connections=1).compute_graph(
                X, y, ("AND", "OR", "XOR"))
            self.all_feature_constructors = graph.get_rank()
        elif self.prune is not None:
            # Construct the rank with pruning by selecting pais that maximise SU(X_iX_j,Y)
            feature_combinations = list(
                combinations(list(range(X.shape[1])),
                             2)) + [(i, i) for i in range(X.shape[1])]
            rank_pairs = [
                symmetrical_uncertainty_two_variables(X[:, i], X[:, j], y)
                for i, j in feature_combinations
            ]
            rank_pairs_index = np.argsort(rank_pairs)[::-1]

            # Create the unsorted list
            self.all_feature_constructors = []
            for index in rank_pairs_index[:self.prune]:
                i, j = feature_combinations[index]
                if i == j:
                    from tfg.feature_construction import create_feature
                    self.all_feature_constructors.extend([
                        create_feature("OR", [(i, n), (i, m)])
                        for n, m in combinations(np.unique(X[:, i]), 2)
                    ])
                else:
                    self.all_feature_constructors.extend(
                        construct_features(X[:, [i, j]],
                                           operators=self.operators,
                                           same_feature=False))
        else:
            # Create the unsorted list of all features
            self.all_feature_constructors = construct_features(
                X, operators=self.operators)
        if self.verbose:
            print(
                f"Total number of constructed features: {len(self.all_feature_constructors)}"
            )
        self.all_feature_constructors.extend(
            [DummyFeatureConstructor(j) for j in range(X.shape[1])])
        self.symmetrical_uncertainty_rank = []

        # Sort the ranking
        for feature_constructor in self.all_feature_constructors:
            feature = feature_constructor.transform(X)
            su = symmetrical_uncertainty(f1=feature.flatten(), f2=y)
            self.symmetrical_uncertainty_rank.append(su)

        # Store the descending order index
        self.rank = np.argsort(self.symmetrical_uncertainty_rank)[::-1]

        # If the initial variables are
        if self.use_initials:
            classifier = NaiveBayes(encode_data=False,
                                    n_intervals=self.n_intervals,
                                    metric=self.metric)
            classifier.fit(X, y)
            current_features = [
                DummyFeatureConstructor(j) for j in range(X.shape[1])
            ]

            # Store the backward result to reuse it for other executions
            self.initial_backward_features = backward_search(
                X, y, current_features, classifier)

        # Feature Subset Selection (FSS) from the rank
        self.filter_features(X, y)
        return self
Пример #7
0
def ranker_score_comparison(datasets,
                            seed,
                            base_path,
                            params,
                            n_splits=3,
                            n_repeats=5,
                            n_intervals=5,
                            metric="accuracy",
                            send_email=False,
                            email_data=dict(),
                            share_rank=True):
    result = []
    columns = ["Database",
               "Number of attributes",
               "NBScore",
               "NBScore STD",
               "Ranker Score",
               "Ranker Score STD",
               "Configuration",
               "Combinations",
               "Selected_attributes",
               "Original"]

    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifier
    r = RankerLogicalFeatureConstructor(n_intervals=n_intervals, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue
        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)

        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results
        nb_score = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_score = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_combinations = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_selected = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_dummy = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_total_constructed = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_total_selected = np.zeros(shape=(len(params), n_splits*n_repeats))
        r_original_selected = np.zeros(shape=(len(params), n_splits*n_repeats))

        rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits*n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')

        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers
            nb.fit(X=X_train, y=y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            for conf_index, conf in enumerate(params):
                seed_tqdm.set_postfix({"config": conf_index})
                r.set_params(**conf)
                # Fit
                if conf_index == 0 or not share_rank:
                    # The rank is computed from scratch
                    r.fit(X_train, y_train)
                else:
                    r.filter_features(r.feature_encoder_.transform(
                        X_train), r.class_encoder_.transform(y_train))

                # score
                ranker_score = r.score(X_test, y_test)

                # Get data
                n_original_features = len(list(filter(lambda x: isinstance(
                    x, DummyFeatureConstructor), r.final_feature_constructors)))
                n_combinations = len(r.all_feature_constructors)
                n_selected = len(r.final_feature_constructors)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                r_score[conf_index, i] = ranker_score
                r_combinations[conf_index, i] = n_combinations
                r_selected[conf_index, i] = n_selected
                r_dummy[conf_index, i] = n_original_features

        # Insert to final result averaged metrics for this dataset
        for conf_index, conf in enumerate(params):
            row = [name,
                   X.shape[1],
                   np.mean(nb_score[conf_index]),
                   np.std(nb_score[conf_index]),
                   np.mean(r_score[conf_index]),
                   np.std(r_score[conf_index]),
                   conf,
                   np.mean(r_combinations[conf_index]),
                   np.mean(r_selected[conf_index]),
                   np.mean(r_dummy[conf_index])]
            result.append(row)
    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results("RANKER", email_data, result)
    return result
Пример #8
0
def genetic_score_comparison(datasets,
                             seed,
                             base_path,
                             params,
                             n_splits=3,
                             n_repeats=5,
                             n_intervals=5,
                             metric="accuracy",
                             send_email=False,
                             email_data=dict(),
                             verbose=True,
                             version=1):
    result = []
    columns = [
        "Database", "Number of attributes", "NBScore", "NBScore STD",
        "Genetic Score", "Genetic Score STD", "Configuration",
        "Selected_attributes", "Original"
    ]

    dataset_tqdm = tqdm(datasets)

    # Instantiate the classifier
    if version == 1:
        # First Version - No flexibility in the number of attributes (bad performance)
        # clf = GeneticProgramming(seed=seed, metric=metric)
        clf = GeneticProgrammingFlexibleLogic(seed=seed, metric=metric)
    elif version == 2:
        # Version with flexibility
        clf = GeneticProgrammingFlexibleLogic(seed=seed, metric=metric)
    else:
        # Guided mutation based on SU
        clf = GeneticProgrammingRankMutation(seed=seed, metric=metric)
    nb = NaiveBayes(encode_data=False, n_intervals=n_intervals, metric=metric)

    # Execute algorithm on datasets
    for database in dataset_tqdm:
        name, label = database
        if not os.path.exists(base_path + name):
            print(f"{name} doesnt' exist")
            continue
        # Assume UCI REPO like data
        test = f"{name}.test.csv"
        data = f"{name}.data.csv"
        X, y = get_X_y_from_database(base_path, name, data, test, label)

        dataset_tqdm.set_postfix({"DATABASE": name})

        # Set up data structures to store results
        nb_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        clf_score = np.zeros(shape=(len(params), n_splits * n_repeats))
        clf_selected = np.zeros(shape=(len(params), n_splits * n_repeats))
        clf_dummy = np.zeros(shape=(len(params), n_splits * n_repeats))

        # Create splits for the experiments
        rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=seed)
        seed_tqdm = tqdm(rskf.split(X, y),
                         leave=False,
                         total=n_splits * n_repeats,
                         bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'
                         ) if verbose else rskf.split(X, y)

        # Execute experiments
        for i, data in enumerate(seed_tqdm):
            train_index, test_index = data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encode the data
            c = CustomOrdinalFeatureEncoder(n_intervals=n_intervals)
            X_train = c.fit_transform(X_train)
            X_test = c.transform(X_test)
            l = CustomLabelEncoder()
            y_train = l.fit_transform(y_train)
            y_test = l.transform(y_test)

            # Assess the classifiers reusing info to speed up evaluation
            nb.fit(X_train, y_train)
            naive_bayes_score = nb.score(X_test, y_test)

            # Reset evaluation-cache for new split
            clf.reset_evaluation()
            for conf_index, conf in enumerate(params):
                if verbose:
                    seed_tqdm.set_postfix({"config": conf_index})
                clf.set_params(**conf)
                clf.fit(X_train, y_train)

                # score
                genetic_score = clf.score(X_test, y_test)

                # Get data
                n_original_features = len(
                    list(
                        filter(
                            lambda x: isinstance(x, DummyFeatureConstructor),
                            clf.best_features)))
                n_selected = len(clf.best_features)

                # Update
                nb_score[conf_index, i] = naive_bayes_score
                clf_score[conf_index, i] = genetic_score
                clf_selected[conf_index, i] = n_selected
                clf_dummy[conf_index, i] = n_original_features

        # Insert to final result averaged metrics for this database
        for conf_index, conf in enumerate(params):
            row = [
                name, X.shape[1],
                np.mean(nb_score[conf_index]),
                np.std(nb_score[conf_index]),
                np.mean(clf_score[conf_index]),
                np.std(clf_score[conf_index]), conf,
                np.mean(clf_selected[conf_index]),
                np.mean(clf_dummy[conf_index])
            ]
            result.append(row)

    result = pd.DataFrame(result, columns=columns)
    if send_email:
        from tfg.utils import send_results
        send_results(f"GENETIC_{version}", email_data, result)
    return result
Пример #9
0
    def fit(self, X, y, init_graph=True):
        self.feature_encoder_ = CustomOrdinalFeatureEncoder()
        self.class_encoder_ = CustomLabelEncoder()

        self.categories_ = None
        if isinstance(X, pd.DataFrame):
            self.categories_ = X.columns
        if self.encode_data:
            X = self.feature_encoder_.fit_transform(X)
            y = self.class_encoder_.fit_transform(y)
        if init_graph:
            if self.graph_strategy == "full":
                #Full graph
                self.afg = AntFeatureGraph(seed=self.seed).compute_graph(
                    X, y, ("XOR", "OR", "AND"))
            else:
                #Pruned graph
                self.afg = AntFeatureGraphMI(
                    seed=self.seed,
                    connections=self.connections).compute_graph(
                        X, y, ("XOR", "OR", "AND"))
        else:
            self.afg.reset_pheromones()
        if self.verbose:
            print(f"Number of nodes: {len(self.afg.nodes)}")

        random.seed(self.seed)
        best_score = 0
        self.best_features = []
        iterations_without_improvement = 0
        iterator = tqdm(range(self.iterations)) if self.verbose else range(
            self.iterations)
        beta = self.beta
        distance_from_best = -1
        for iteration in iterator:
            if self.verbose:
                iterator.set_postfix({
                    "best_score":
                    best_score,
                    "n_features":
                    len(self.best_features),
                    "p_matrix_c":
                    len(self.afg.pheromone_construction),
                    "p_matrix_s":
                    len(self.afg.pheromone_selection),
                    "distance_from_best":
                    distance_from_best
                })
            ants = [
                Ant(ant_id=i,
                    alpha=self.alpha,
                    beta=beta,
                    metric=self.metric,
                    use_initials=self.use_initials,
                    cache_loo=self.cache_loo,
                    cache_heuristic=self.cache_heuristic,
                    step=self.step) for i in range(self.ants)
            ]
            beta *= (1 - self.beta_evaporation_rate)
            results = []
            for ant in ants:
                results.append(
                    ant.run(X=X,
                            y=y,
                            graph=self.afg,
                            random_generator=random,
                            parallel=self.parallel,
                            max_errors=self.max_errors))
            results = np.array(results)

            self.afg.update_pheromone_matrix_evaporation(self.evaporation_rate)
            distance_from_best = np.mean(np.abs(results - best_score))
            best_ant = np.argmax(results)
            if self.update_strategy == "best":
                ant = ants[best_ant]
                self.afg.intensify(ant.current_features,
                                   self.intensification_factor, 1,
                                   self.use_initials)
            else:
                for ant_score, ant in zip(results, ants):
                    self.afg.intensify(ant.current_features,
                                       self.intensification_factor, ant_score,
                                       self.use_initials)

            if results[best_ant] >= best_score:
                iterations_without_improvement = 0
                ant = ants[best_ant]
                best_score = results[best_ant]
                self.best_features = ant.current_features
            else:
                iterations_without_improvement += 1
                if iterations_without_improvement > self.early_stopping:
                    break

        self.classifier_ = NaiveBayes(encode_data=False, metric=self.metric)
        if self.final_selection == "BEST":
            pass
        else:
            #An ant traverses the graph deterministically to obtain the features
            final_ant = FinalAnt(ant_id=0,
                                 alpha=self.alpha,
                                 beta=beta,
                                 metric=self.metric,
                                 use_initials=self.use_initials,
                                 cache_loo=self.cache_loo,
                                 cache_heuristic=self.cache_heuristic,
                                 step=self.step)
            final_ant.run(X=X,
                          y=y,
                          graph=self.afg,
                          random_generator=random,
                          parallel=self.parallel)
            self.best_features = final_ant.current_features
        #Train model with final features
        self.classifier_.fit(
            np.concatenate(
                [feature.transform(X) for feature in self.best_features],
                axis=1), y)

        if self.save_features:
            #Save to features to dict
            translate_features(features=self.best_features,
                               feature_encoder=self.feature_encoder_,
                               categories=self.categories_,
                               path=self.path,
                               filename=self.filename)
        return self
Пример #10
0
def scoring_comparison(base_path,datasets,verbose=1,test_size=0.3,seed=None,n_iterations=30):
    column_names = ["dataset",
                    "custom_training_score",
                    "custom_test_score",
                    "categorical_training_score",
                    "categorical_test_score"]
    data =[]
    clf_no_encoding = NaiveBayes(encode_data=True)
    clf_categorical_sklearn = CategoricalNB()
    
    datasets_iter = tqdm(datasets, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')
    c = CustomOrdinalFeatureEncoder()
    l = CustomLabelEncoder()
    
    for dataset in datasets_iter:
        dataset_name, label = dataset
        data_filename = f"{dataset_name}.data.csv"
        test_filename = f"{dataset_name}.test.csv"
        X, y = get_X_y_from_database(base_path=base_path,
                                     name = dataset_name,
                                     data = data_filename, 
                                     test = test_filename, 
                                     label = label)
        custom_train = []
        custom_test = []

        sklearn_train = []
        sklearn_test = []


        X  = c.fit_transform(X)
        y  = l.fit_transform(y)
        for iteration in range(n_iterations):
            if verbose:
                datasets_iter.set_postfix({"Dataset": dataset_name, "seed":iteration})
                datasets_iter.refresh()
            try:
                X_train,X_test,y_train,y_test = train_test_split(X,y,
                                                             test_size=test_size,
                                                             random_state=seed+iteration,
                                                             shuffle=True,
                                                             stratify=y)
            except:
                #Not enough values to stratify y
                X_train,X_test,y_train,y_test = train_test_split(X,y,
                                                                test_size=test_size,
                                                                random_state=seed+iteration,
                                                                shuffle=True
                                                                )
            #Fit
            clf_no_encoding.fit(X_train,y_train)
            clf_categorical_sklearn.min_categories = [1+np.max(np.concatenate([X_train[:,j],X_test[:,j]])) for j in range(X_train.shape[1])]
            clf_categorical_sklearn.fit(X_train,y_train)
            
            
            #Predict
            custom_train.append(clf_no_encoding.score(X_train,y_train))
            custom_test.append(clf_no_encoding.score(X_test,y_test))
            sklearn_train.append(clf_categorical_sklearn.score(X_train,y_train))
            sklearn_test.append(clf_categorical_sklearn.score(X_test,y_test))
        data.append([dataset_name,np.mean(custom_train),np.mean(custom_test),np.mean(sklearn_train),np.mean(sklearn_test)])
    return pd.DataFrame(data,columns = column_names)