예제 #1
0
def test_opf_accuracy():
    """Checks the OPF accuracy of a constant prediction over two equally sized classes."""
    # Two samples per class, with every sample predicted as class 1
    expected = [1, 1, 2, 2]
    predicted = [1] * 4

    assert general.opf_accuracy(expected, predicted) == 0.5
예제 #2
0
    def prune(self, X_train, Y_train, X_val, Y_val, n_iterations=10):
        """Refits the classifier repeatedly while discarding irrelevant training nodes.

        After an initial fit and a prediction over the validation set, each
        iteration keeps only the training samples whose subgraph node is not
        flagged as irrelevant, refits on them and logs the validation accuracy.
        The fraction of removed nodes is logged once all iterations are done.

        Args:
            X_train (np.array): Array of training features.
            Y_train (np.array): Array of training labels.
            X_val (np.array): Array of validation features.
            Y_val (np.array): Array of validation labels.
            n_iterations (int): Number of pruning iterations to run.

        """

        logger.info('Pruning classifier ...')

        # Initial fit followed by a pass over the validation set
        # (presumably the prediction is what flags node relevance -- confirm against `predict`)
        self.fit(X_train, Y_train)
        self.predict(X_val)

        # Node count before any sample is discarded
        n_before = self.subgraph.n_nodes

        for step in range(1, n_iterations + 1):
            logger.info('Running iteration %d/%d ...', step, n_iterations)

            # Positions of the training samples whose nodes are still relevant
            kept = [idx for idx, node in enumerate(self.subgraph.nodes)
                    if node.relevant != c.IRRELEVANT]

            # Rebuilds the training set out of the surviving samples only
            X_train = np.asarray([X_train[idx, :] for idx in kept])
            Y_train = np.asarray([Y_train[idx] for idx in kept])

            # Refits on the reduced set and scores it over the validation set
            self.fit(X_train, Y_train)
            val_acc = g.opf_accuracy(Y_val, self.predict(X_val))

            logger.info('Current accuracy: %s.', val_acc)

        # Node count once pruning has finished
        n_after = self.subgraph.n_nodes

        logger.info('Prune ratio: %s.', 1 - n_after / n_before)
예제 #3
0
def supervised_opf_feature_selection(opytimizer):
    """Scores a feature subset with a supervised OPF and returns its validation error.

    Relies on the module-level `X_train`, `Y_train`, `X_val` and `Y_val`.

    Args:
        opytimizer: Two-dimensional array whose first column, cast to boolean,
            flags which features are selected.

    Returns:
        One minus the OPF accuracy obtained over the validation set.

    """
    # Boolean mask of the selected features
    mask = opytimizer[:, 0].astype(bool)

    # Restricts both sets to the selected columns
    train_subset, val_subset = X_train[:, mask], X_val[:, mask]

    # Trains a fresh supervised classifier on the reduced training set
    clf = SupervisedOPF(distance='log_squared_euclidean', pre_computed_distance=None)
    clf.fit(train_subset, Y_train)

    # Scores its predictions over the reduced validation set
    val_preds = clf.predict(val_subset)

    return 1 - g.opf_accuracy(Y_val, val_preds)
예제 #4
0
def unsupervised_opf_clustering(opytimizer):
    """Scores a `max_k` candidate with an unsupervised OPF and returns its test error.

    Relies on the module-level `X_train`, `Y_train`, `X_test` and `Y_test`.

    Args:
        opytimizer: Nested sequence of decision variables; `opytimizer[0][0]`
            is truncated to an integer and used as `max_k`.

    Returns:
        One minus the OPF accuracy obtained over the testing set.

    """
    # Decision variables are read positionally, so their declaration order
    # must match the order of their bounds
    k_upper = int(opytimizer[0][0])

    # Builds and fits the clustering model
    clusterer = UnsupervisedOPF(max_k=k_upper,
                                distance='log_squared_euclidean',
                                pre_computed_distance=None)
    clusterer.fit(X_train, Y_train)

    # With labeled data, predicted labels can be propagated instead of bare cluster identifiers
    clusterer.propagate_labels()

    # `predict` returns a pair; only its first element is scored
    predicted, _unused = clusterer.predict(X_test)

    return 1 - g.opf_accuracy(Y_test, predicted)
예제 #5
0
# Splits the data into training and validation sets through `s.split`
X_train, X_val, Y_train, Y_val = s.split(X, Y, percentage=0.5, random_state=1)

# Loops until the classifier makes no mistakes over the validation set
while True:
    # Creates a fresh SupervisedOPF instance for this round
    opf = SupervisedOPF(distance='log_squared_euclidean',
                        pre_computed_distance=None)

    # Fits training data into the classifier
    opf.fit(X_train, Y_train)

    # Predicts the validation samples
    preds = opf.predict(X_val)

    # Calculates the accuracy over the validation set
    acc = g.opf_accuracy(Y_val, preds)

    print(f'Accuracy: {acc}')

    # Gathers the positions of the misclassified validation samples
    errors = np.argwhere(Y_val != preds)

    # If there are no misclassified samples
    if len(errors) == 0:
        # Stops the process
        break

    # For every misclassified sample
    for e in errors:
        # Stacks the sample's features onto the training set
        # NOTE(review): in the visible lines only `X_train` grows; `Y_train` is
        # not extended and the sample is not removed from the validation set.
        # The snippet looks truncated here -- confirm against the full script.
        X_train = np.vstack((X_train, X_val[e, :]))
예제 #6
0
    def _learn(self, X_train, Y_train, I_train, X_val, Y_val, I_val):
        """Learns the best `k` value over the validation set.

        Every `k` in `[1, max_k]` is tried: arcs and the p.d.f. are computed,
        the subgraph is clustered and the validation accuracy is measured.
        The `k` with the strictly highest accuracy is stored in
        `self.subgraph.best_k`. When no `k` reaches an accuracy above zero
        (or `max_k` is smaller than 1), `k = 1` is used.

        Args:
            X_train (np.array): Array of training features.
            Y_train (np.array): Array of training labels.
            I_train (np.array): Array of training indexes.
            X_val (np.array): Array of validation features.
            Y_val (np.array): Array of validation labels.
            I_val (np.array): Array of validation indexes.

        Raises:
            BuildError: If pre-computed distances are enabled and the matrix
                is not `n_nodes x n_nodes`.

        """

        logger.info('Learning best `k` value ...')

        # Creating a subgraph
        self.subgraph = KNNSubgraph(X_train, Y_train, I_train)

        # Checks if it is supposed to use pre-computed distances
        if self.pre_computed_distance:
            n_nodes = self.subgraph.n_nodes
            shape = self.pre_distances.shape

            # The matrix must be square and match the subgraph's amount of nodes
            if shape[0] != n_nodes or shape[1] != n_nodes:
                raise e.BuildError(
                    'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
                )

        # Defining initial maximum accuracy as 0
        max_acc = 0.0

        # Fallback `k`, so the final assignment never reads an unbound name
        # when no candidate improves on the initial accuracy
        best_k = 1

        # For every possible `k` value
        for k in range(1, self.max_k + 1):
            # Gathers current `k` as subgraph's best `k`
            self.subgraph.best_k = k

            # Calculate the arcs using the current `k` value
            self.subgraph.create_arcs(k, self.distance_fn,
                                      self.pre_computed_distance,
                                      self.pre_distances)

            # Calculate the p.d.f. using the current `k` value
            self.subgraph.calculate_pdf(k, self.distance_fn,
                                        self.pre_computed_distance,
                                        self.pre_distances)

            # Clusters the subgraph
            self._clustering()

            # Calculate the predictions over the validation set
            preds = self.predict(X_val, I_val)

            # Calculating the accuracy
            acc = g.opf_accuracy(Y_val, preds)

            # Keeps the current `k` only if it strictly improves the accuracy
            if acc > max_acc:
                max_acc = acc
                best_k = k

            logger.info('Accuracy over k = %d: %s', k, acc)

            # Destroy the arcs before trying the next `k`
            self.subgraph.destroy_arcs()

        # Applying the best k to the subgraph's property
        self.subgraph.best_k = best_k
            y_val_opf = y_val_opf + 1
            y_test_opf = y_test + 1

            clf.learn(X_train_opf,
                      y_train_opf,
                      X_val_opf,
                      y_val_opf,
                      n_iterations=20)

        else:
            clf.fit(X_train, y_train)

        if name == "OPF":
            preds = clf.predict(X_test)

            acc = g.opf_accuracy(y_test_opf, preds)
            score = acc
            print(score)
            print(accuracy_score(y_test_opf, preds))
        else:
            score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].

        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(Xfull)
        elif name == "OPF":
            predsopf = clf.predict(Xfull)
            predsopf = np.asarray(predsopf)
            predsopf = predsopf - 1
예제 #8
0
# Extracts features and labels from the pre-loaded numpy array
X, Y = p.parse_loader(txt)

# First split: separates a testing set from the data used for training
X_train, X_test, Y_train, Y_test = s.split(X, Y, percentage=0.8, random_state=1)

# Second split: divides the training data into a labeled part and a part
# that is handed to the classifier without its labels
X_train, X_unlabeled, Y_train, Y_unlabeled = s.split(
    X_train, Y_train, percentage=0.25, random_state=1)

# Builds the semi-supervised classifier
opf = SemiSupervisedOPF(distance='log_squared_euclidean', pre_computed_distance=None)

# Fits on the labeled samples together with the unlabeled features
opf.fit(X_train, Y_train, X_unlabeled)

# Predicts the testing samples
preds = opf.predict(X_test)

# Measures the OPF accuracy over the testing set
acc = g.opf_accuracy(Y_test, preds)

print(f'Accuracy: {acc}')
예제 #9
0
    def learn(self, X_train, Y_train, X_val, Y_val, n_iterations=10):
        """Learns the best classifier over a validation set.

        Each iteration fits the classifier on the training set, scores it on
        the validation set, snapshots it when the accuracy improves, and then
        swaps every misclassified validation sample with a randomly drawn
        non-prototype training sample. The loop stops when the accuracy
        changes by less than 1e-4 between iterations or when `n_iterations`
        is reached; the subgraph of the best snapshot is then restored into
        this instance.

        Note that `X_train`, `Y_train`, `X_val` and `Y_val` are modified in
        place by the swaps.

        Args:
            X_train (np.array): Array of training features.
            Y_train (np.array): Array of training labels.
            X_val (np.array): Array of validation features.
            Y_val (np.array): Array of validation labels.
            n_iterations (int): Number of iterations.

        """

        logger.info('Learning the best classifier ...')

        # Defines the maximum accuracy
        max_acc = 0

        # Defines the previous accuracy
        previous_acc = 0

        # Defines the iterations counter
        t = 0

        # Snapshot of the best classifier so far and the iteration it came from
        best_opf = None
        best_t = 0

        while True:
            logger.info('Running iteration %d/%d ...', t + 1, n_iterations)

            # Fits training data into the classifier
            self.fit(X_train, Y_train)

            # Predicts new data
            preds = self.predict(X_val)

            # Calculating accuracy
            acc = g.opf_accuracy(Y_val, preds)

            # The first iteration always yields a snapshot (even at zero
            # accuracy), so there is always a classifier to restore
            if best_opf is None or acc > max_acc:
                max_acc = acc

                best_opf = copy.deepcopy(self)

                # Saves the iteration number
                best_t = t

            # Gathers which samples were misclassified
            errors = np.argwhere(Y_val != preds)

            # Counts the nodes that are not prototypes
            non_prototypes = sum(1 for n in self.subgraph.nodes
                                 if n.status != c.PROTOTYPE)

            for err in errors:
                # Counter will receive the number of non-prototypes
                ctr = non_prototypes

                # While the counter is bigger than zero
                while ctr > 0:
                    # Generates a random index
                    j = int(r.generate_uniform_random_number(0, len(X_train)))

                    # If the node on that particular index is not a prototype
                    if self.subgraph.nodes[j].status != c.PROTOTYPE:
                        # `X_train[j, :]` is a view: it must be copied before
                        # being overwritten, otherwise the validation slot
                        # would receive its own value back instead of the
                        # training sample
                        train_sample = np.copy(X_train[j, :])
                        X_train[j, :] = X_val[err, :]
                        X_val[err, :] = train_sample

                        # Swaps the targets the same way
                        train_label = np.copy(Y_train[j])
                        Y_train[j] = Y_val[err]
                        Y_val[err] = train_label

                        # Decrements the number of non-prototypes
                        non_prototypes -= 1

                        # Resets the counter
                        ctr = 0

                    # If the node on that particular index is a prototype
                    else:
                        # Decrements the counter
                        ctr -= 1

            # Calculating difference between current accuracy and previous one
            delta = np.fabs(acc - previous_acc)

            # Replacing the previous accuracy as current accuracy
            previous_acc = acc

            # Incrementing the counter
            t += 1

            logger.info('Accuracy: %s | Delta: %s | Maximum Accuracy: %s', acc, delta, max_acc)

            # If the difference is smaller than 1e-4 or iterations are finished
            # (`>=` also terminates when `n_iterations` is not positive)
            if delta < 0.0001 or t >= n_iterations:
                # Rebinding `self` would only change a local name; the best
                # subgraph has to be assigned onto this instance instead
                self.subgraph = best_opf.subgraph

                logger.info('Best classifier has been learned over iteration %d.', best_t + 1)

                break
예제 #10
0
    def _learn(self, X_train, Y_train, X_val, Y_val):
        """Learns the best `k` value over the validation set.

        Every `k` in `[1, max_k]` is tried: arcs and the p.d.f. are computed,
        the subgraph is clustered and the validation accuracy is measured.
        The `k` with the strictly highest accuracy is stored in
        `self.subgraph.best_k`. When no `k` reaches an accuracy above zero
        (or `max_k` is smaller than 1), `k = 1` is used.

        Args:
            X_train (np.array): Array of training features.
            Y_train (np.array): Array of training labels.
            X_val (np.array): Array of validation features.
            Y_val (np.array): Array of validation labels.

        Raises:
            BuildError: If pre-computed distances are enabled and the matrix
                is not `n_nodes x n_nodes`.

        """

        logger.info('Learning best `k` value ...')

        # Creating a subgraph
        self.subgraph = KNNSubgraph(X_train, Y_train)

        # Checks if it is supposed to use pre-computed distances
        if self.pre_computed_distance:
            n_nodes = self.subgraph.n_nodes
            shape = self.pre_distances.shape

            # The matrix must be square and match the subgraph's amount of nodes
            if shape[0] != n_nodes or shape[1] != n_nodes:
                # If not, raises an error
                raise e.BuildError(
                    'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
                )

        # Defining initial maximum accuracy as 0
        max_acc = 0.0

        # Fallback `k`, so the final assignment never reads an unbound name
        # when no candidate improves on the initial accuracy
        best_k = 1

        # For every possible `k` value
        for k in range(1, self.max_k + 1):
            # Gathers current `k` as subgraph's best `k`
            self.subgraph.best_k = k

            # Calculate the arcs using the current `k` value
            self.subgraph.create_arcs(k, self.distance_fn,
                                      self.pre_computed_distance,
                                      self.pre_distances)

            # Calculate the p.d.f. using the current `k` value
            self.subgraph.calculate_pdf(k, self.distance_fn,
                                        self.pre_computed_distance,
                                        self.pre_distances)

            # Clusters the subgraph
            self._clustering()

            # Calculate the predictions over the validation set
            preds = self.predict(X_val)

            # Calculating the accuracy
            acc = g.opf_accuracy(Y_val, preds)

            # If accuracy is strictly better than maximum accuracy
            if acc > max_acc:
                # Replaces the maximum accuracy value
                max_acc = acc

                # Defines current `k` as the best `k` value
                best_k = k

            # Lazy %-style arguments defer formatting to the logging framework
            logger.info('Accuracy over k = %d: %s', k, acc)

            # Destroy the arcs before trying the next `k`
            self.subgraph.destroy_arcs()

        # Applying the best k to the subgraph's property
        self.subgraph.best_k = best_k
예제 #11
0
import numpy as np

import opfython.math.general as g

# Sample data: an array to normalize, plus ground-truth labels and their predictions
array = np.array([1.5, 2, 0.5, 1.25, 1.75, 3])
labels = [0] * 3 + [1] * 3 + [2]
preds = [0, 0, 1, 1, 0, 1, 2]

# Normalized version of the array
norm_array = g.normalize(array)
print(norm_array)

# Confusion matrix between labels and predictions
c_matrix = g.confusion_matrix(labels, preds)
print(c_matrix)

# OPF-like accuracy over all samples
opf_acc = g.opf_accuracy(labels, preds)
print(opf_acc)

# OPF-like accuracy broken down per label
opf_acc_per_label = g.opf_accuracy_per_label(labels, preds)
print(opf_acc_per_label)

# Purity measure
purity = g.purity(labels, preds)
print(purity)
예제 #12
0
    def _learn(self, X_train, Y_train, I_train, X_val, Y_val, I_val):
        """Searches for the `k` value that maximizes accuracy over the validation set.

        For each `k` in `[1, max_k]` a new approximate-nearest-neighbors index
        is built over the training features, the subgraph arcs and p.d.f. are
        computed from it, the subgraph is clustered and the validation
        accuracy is measured. The `k` with the strictly highest accuracy
        (1 when none improves on zero) ends up in `self.subgraph.best_k`.

        Args:
            X_train (np.array): Array of training features.
            Y_train (np.array): Array of training labels.
            I_train (np.array): Array of training indexes.
            X_val (np.array): Array of validation features.
            Y_val (np.array): Array of validation labels.
            I_val (np.array): Array of validation indexes.

        """

        logger.info('Learning best `k` value ...')

        # Subgraph holding the training samples
        self.subgraph = ANNSubgraph(X_train, Y_train, I_train)

        # Best accuracy seen so far and the `k` that produced it
        top_acc, top_k = 0., 1

        for n_neighbors in range(1, self.max_k + 1):
            # The subgraph works with the candidate `k` while it is evaluated
            self.subgraph.best_k = n_neighbors

            # For the 'hnsw' method, the `ef` parameter follows the candidate `k`
            if self.ann_params.get('name') == 'hnsw':
                self.ann_params['ef'] = n_neighbors

            # Creates the ANN searcher and builds its index over the training features
            self.ann_search = self.ann_class(self.ann_params)
            self.ann_search.fit(X_train)

            # Arcs and p.d.f. for the candidate `k`
            self.subgraph.build_arcs(n_neighbors, self.ann_search)
            self.subgraph.calc_pdf(n_neighbors, self.distance_fn)

            # Clusters the subgraph
            self._clustering()

            # Accuracy of the predictions over the validation set
            val_acc = g.opf_accuracy(Y_val, self.predict(X_val, I_val))

            # Only a strict improvement replaces the current best
            if val_acc > top_acc:
                top_acc, top_k = val_acc, n_neighbors

            logger.info('Accuracy over k = %d: %s', n_neighbors, val_acc)

            # Arcs are discarded before the next candidate is tried
            self.subgraph.destroy_arcs()

        # Stores the winning `k` in the subgraph
        self.subgraph.best_k = top_k
예제 #13
0
    def run(self, technique, easy_X, easy_Y, hard_X, hard_Y, test_X, test_Y,
            k_hardSamples, k_easySamples, iteration):
        """Runs the active-learning pipeline over two scenarios and collects metrics.

        Both scenarios select from the hard pool with `technique`; the easy
        pool is selected with "Random" in the first scenario and with
        `technique` in the second. In every iteration the samples selected
        from the hard pool are accumulated as labeled data, the samples
        selected from the easy pool are accumulated as unlabeled data, a
        `SemiSupervisedOPF` is fitted on both, a `SupervisedOPF` is fitted on
        their union with true labels, and both are scored on the test set.

        Args:
            technique: Name of the selection method, forwarded to
                `selectSamples` and `createLearner`.
            easy_X: Features of the easy pool.
            easy_Y: Labels of the easy pool.
            hard_X: Features of the hard pool.
            hard_Y: Labels of the hard pool.
            test_X: Features of the test set.
            test_Y: Labels of the test set (flattened and cast to int before scoring).
            k_hardSamples: Samples requested from the hard pool per iteration
                (doubled on the first iteration).
            k_easySamples: Samples requested from the easy pool per iteration
                (doubled on the first iteration).
            iteration: Number of iterations per scenario.

        Returns:
            tuple: Nine lists, each with one per-iteration list per scenario,
            in this order: semi-supervised scores, semi-supervised corrected
            labels, fully-supervised scores, fully-supervised corrected
            labels, hard-pool selection times, easy-pool selection times,
            known classes in the labeled data, known classes in the joined
            data, and wrong-label percentages.

        """
        # ---------- PIPELINE ----------
        print("")
        print("Iniciando Pipeline..")
        print("")

        def test_score(model):
            # Percentage accuracy of `model` over the test set, rounded to two
            # decimals; computed once per model and reused for print and storage
            return round(
                g.opf_accuracy(test_Y.flatten().astype("int"),
                               model.predict(test_X)) * 100, 2)

        # Results accumulated across scenarios
        ssmodel_accuracy = []
        ssmodel_corrected = []
        fullmodel_accuracy = []
        fullmodel_corrected = []
        hard_time_to_select = []
        easy_time_to_select = []
        ssmodel_knowClass = []
        fullmodel_knowClass = []
        wrong_percentage = []

        # References to the complete pools, restored at the start of each scenario
        easy_X_bkp = easy_X
        easy_Y_bkp = easy_Y
        hard_X_bkp = hard_X
        hard_Y_bkp = hard_Y

        # (hard-pool method, easy-pool method) for each scenario
        scenarios = [(technique, "Random"), (technique, technique)]

        for method_hard, method_easy in scenarios:

            # Results accumulated across the iterations of this scenario
            ss_model_score = []
            ss_model_corrected = []
            full_model_score = []
            full_model_corrected = []
            hardTimeToSelect = []
            easyTimeToSelect = []
            ss_know_class = []
            full_know_class = []
            wrongPercentages = []

            # Recover complete dataset
            easy_X = easy_X_bkp
            easy_Y = easy_Y_bkp
            hard_X = hard_X_bkp
            hard_Y = hard_Y_bkp
            print(
                "Métodos de Aprendizado Ativo: Dataset Hard {} / Dataset Easy {}"
                .format(method_hard, method_easy))
            print("")

            for i in range(iteration):
                first = i == 0
                print("===== Iteração {} =====".format(i + 1))
                print("")

                # Model handed to the hard-pool selection: none on the first
                # iteration, afterwards the hard learner, or the previous
                # semi-supervised model when the method is "Random"
                if first:
                    hard_model = "none"
                elif method_hard != "Random":
                    hard_model = learner_hard
                else:
                    hard_model = ssmodel

                # Selecting samples with Active Learning from Hard Dataset
                (timeToSelect_hard, selected_hard_X, selected_hard_Y, hard_X,
                 hard_Y, hard_correctedLabels) = self.selectSamples(
                     method_hard,
                     hard_X,
                     hard_Y,
                     k_hardSamples * 2 if first else k_hardSamples,
                     first,
                     hard_model,
                 )

                # Append True Labeled Data to the Labeled Data
                if first:
                    labeled_X = selected_hard_X
                    labeled_Y = selected_hard_Y
                else:
                    labeled_X = np.vstack((labeled_X, selected_hard_X))
                    labeled_Y = np.vstack((labeled_Y, selected_hard_Y))
                print(
                    "Samples Labeled: {} - Time to Select: {} - Corrected: {}".
                    format(len(labeled_Y), timeToSelect_hard,
                           hard_correctedLabels))

                # Learners are created on the first iteration and passed back
                # to `createLearner` on the following ones
                learner_hard = self.createLearner(
                    method_hard, labeled_X, labeled_Y,
                    "none" if first else learner_hard, first)
                learner_easy = self.createLearner(
                    method_easy, labeled_X, labeled_Y,
                    "none" if first else learner_easy, first)

                # Flag and model handed to the easy-pool selection: "Random"
                # mirrors the hard-pool logic with the semi-supervised model,
                # any other method always uses the easy learner
                if method_easy == "Random":
                    easy_flag = first
                    easy_model = "none" if first else ssmodel
                else:
                    easy_flag = False
                    easy_model = learner_easy

                # Selecting samples with Active Learning from Easy Dataset
                (timeToSelect_easy, selected_easy_X, selected_easy_Y, easy_X,
                 easy_Y, easy_correctedLabels) = self.selectSamples(
                     method_easy, easy_X, easy_Y,
                     k_easySamples * 2 if first else k_easySamples,
                     easy_flag, easy_model)

                # Append the selected easy samples to the Unlabeled Data
                if first:
                    unlabeled_X = selected_easy_X
                    unlabeled_Y = selected_easy_Y
                else:
                    unlabeled_X = np.vstack((unlabeled_X, selected_easy_X))
                    unlabeled_Y = np.vstack((unlabeled_Y, selected_easy_Y))
                print(
                    "Samples Unlabeled: {} - Time to Select: {} - Corrected: {}"
                    .format(len(unlabeled_Y), timeToSelect_easy,
                            easy_correctedLabels))

                # Semi-supervised classification with OPF Semi Supervised
                t = time.time()
                ssmodel = SemiSupervisedOPF(distance='log_squared_euclidean',
                                            pre_computed_distance=None)
                ssmodel.fit(labeled_X,
                            labeled_Y.flatten().astype("int"), unlabeled_X)
                ss_score = test_score(ssmodel)
                print("Semi Supervised Score: {}% - Time: {}".format(
                    ss_score, round((time.time() - t), 3)))
                ss_model_score.append(ss_score)

                # Join labeled data with unlabeled
                Z_dataset_X = np.vstack((labeled_X, unlabeled_X))
                Z_dataset_Y = np.hstack(
                    (labeled_Y.flatten(), unlabeled_Y.flatten()))

                # Full supervised classification
                # NOTE(review): the timer `t` is not restarted here, so the
                # time printed below also covers the semi-supervised step --
                # confirm whether that is intended
                fullmodel = SupervisedOPF(distance='log_squared_euclidean',
                                          pre_computed_distance=None)
                fullmodel.fit(Z_dataset_X, Z_dataset_Y.flatten().astype("int"))
                full_score = test_score(fullmodel)
                print("Full Supervised Score: {}% - Time: {}".format(
                    full_score, round((time.time() - t), 3)))
                full_model_score.append(full_score)

                # Predict Semi-Supervised Labels to see how many errors are propagating
                ss_predict = ssmodel.predict(unlabeled_X)
                wrongPercentage = self.calcWrongPercentage(
                    ss_predict, unlabeled_Y)

                # List of corrected Labels by methods
                ss_model_corrected.append(hard_correctedLabels)
                full_model_corrected.append(hard_correctedLabels +
                                            easy_correctedLabels)
                # List of times to select
                hardTimeToSelect.append(timeToSelect_hard)
                easyTimeToSelect.append(timeToSelect_easy)
                # List of known classes
                ss_know_class.append(len(np.unique(labeled_Y)))
                full_know_class.append(len(np.unique(Z_dataset_Y)))
                # List of wrong percentages
                wrongPercentages.append(wrongPercentage)
                print("")

            # Append Results
            ssmodel_accuracy.append(ss_model_score)
            ssmodel_corrected.append(ss_model_corrected)
            fullmodel_accuracy.append(full_model_score)
            fullmodel_corrected.append(full_model_corrected)
            hard_time_to_select.append(hardTimeToSelect)
            easy_time_to_select.append(easyTimeToSelect)
            ssmodel_knowClass.append(ss_know_class)
            fullmodel_knowClass.append(full_know_class)
            wrong_percentage.append(wrongPercentages)

            print("===" * 25)
            print("")

        return ssmodel_accuracy, ssmodel_corrected, fullmodel_accuracy, fullmodel_corrected, \
               hard_time_to_select, easy_time_to_select, ssmodel_knowClass, fullmodel_knowClass, wrong_percentage