Example #1
    def test_accuracy(self):
        performance_metrics = [Accuracy()]

        experiment = HoldOutExperiment(
            client=self.__client,
            X=self.__X_class,
            Y=self.__y_class,
            scenario_type=PoolBasedSamplingScenario,
            ml_technique=self.__ml_technique_class,
            performance_metrics=performance_metrics,
            query_strategy=self.__query_strategy,
            oracle=SimulatedOracle(labels=self.__y_class),
            stopping_criteria=MaxIteration(value=10),
            self_partition=True,
            test_ratio=0.3,
            initial_label_rate=0.05,
            all_class=True
        )

        start_time = time.time()
        result = experiment.evaluate(client=self.__client, verbose=True)
        print()
        print("---Active Learning experiment %s seconds ---" % (time.time() - start_time))

        query_analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=[metric.metric_name for metric in performance_metrics],
            method_name=self.__query_strategy.query_function_name,
            method_results=result,
            type="queries"
        )

        # plot the learning curves of the experiment
        query_analyser.plot_learning_curves(title='Active Learning experiment results')
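These test methods reference fixtures (self.__client, self.__X_class, self.__y_class, self.__ml_technique_class, self.__query_strategy, ...) that the snippets never define. Below is a minimal sketch of what the missing setUp could look like, assuming scikit-learn's iris data (the variable name iris_X_test in Example #5 hints at it); the real fixture is not shown in the source, and the class name here is hypothetical:

    import unittest
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    class ActiveLearningTests(unittest.TestCase):
        def setUp(self):
            # no distributed client by default; some examples pass one in
            self.__client = None
            # classification data and model (attribute names match the snippets)
            self.__X_class, self.__y_class = load_iris(return_X_y=True)
            self.__X, self.__y = self.__X_class, self.__y_class
            self.__ml_technique_class = LogisticRegression(solver='liblinear')
            # QueryInstanceRandom comes from the active learning library itself;
            # its import path is not shown in the source
            self.__query_strategy = QueryInstanceRandom()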
Example #2
    def test_kullback_leibler_divergence(self):

        query_strategy = QueryKullbackLeiblerDivergence(n_jobs=5)

        # init the ALExperiment
        experiment = HoldOutExperiment(
            client=self.__client,
            X=self.__X,
            Y=self.__y,
            scenario_type=PoolBasedSamplingScenario,
            ml_technique=self.__ml_technique,
            performance_metrics=self.__performance_metrics,
            query_strategy=query_strategy,
            oracle=SimulatedOracle(labels=self.__y),
            stopping_criteria=MaxIteration(5),
            self_partition=True,
            test_ratio=0.3,
            initial_label_rate=0.05,
            all_class=True
        )

        start_time = time.time()
        result = experiment.evaluate(client=self.__client, verbose=True)
        print()
        print("---Active Learning experiment %s seconds ---" % (time.time() - start_time))

        query_analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=[metric.metric_name for metric in self.__performance_metrics],
            method_name=query_strategy.query_function_name,
            method_results=result,
            type="queries"
        )

        # plot the learning curves of the experiment
        query_analyser.plot_learning_curves(title='Active Learning experiment results')
Example #3
    def test_cross_validation_randomQuery_MaxIteration(self):

        ml_technique = LogisticRegression()
        # ml_technique = BernoulliNB()
        # ml_technique = svm.SVC(kernel='rbf', probability=True)
        # ml_technique = svm.NuSVC(gamma='auto', probability=True)
        # stopping_criteria = PercentOfUnlabel(70)
        stopping_criteria = MaxIteration(25)
        # stopping_criteria = TimeLimit(2)
        query_strategy = QueryInstanceRandom()

        performance_metrics = [
            Accuracy(), F1(average='weighted'),
            HammingLoss()
        ]

        # init the ALExperiment
        experiment = CrossValidationExperiment(
            client=self.__client,
            X=self.__X,
            Y=self.__y,
            scenario_type=PoolBasedSamplingScenario,
            ml_technique=ml_technique,
            performance_metrics=performance_metrics,
            query_strategy=query_strategy,
            oracle=SimulatedOracle(labels=self.__y),
            stopping_criteria=stopping_criteria,
            self_partition=True,
            kfolds=10,
            oracle_name='SimulatedOracle',
            test_ratio=0.3,
            initial_label_rate=0.05,
            all_class=True,
            rebalance=True)

        results = experiment.evaluate(verbose=True,
                                      multithread=True,
                                      max_threads=10,
                                      client=self.__client)

        for result in results:
            query_analyser = ExperimentAnalyserFactory.experiment_analyser(
                performance_metrics=[
                    metric.metric_name for metric in performance_metrics
                ],
                method_name=query_strategy.query_function_name,
                method_results=result,
                type="queries")

            # plot the learning curves of the experiment
            query_analyser.plot_learning_curves(
                title='Active Learning experiment results')
Example #4
    def test_mse(self):
        performance_metrics = [Mse(squared=False)]

        experiment = HoldOutExperiment(
            client=self.__client,
            X=self.__X_reg,
            Y=self.__y_reg,
            scenario_type=PoolBasedSamplingScenario,
            ml_technique=self.__ml_technique_reg,
            performance_metrics=performance_metrics,
            query_strategy=self.__query_strategy,
            oracle=SimulatedOracle(labels=self.__y_reg),
            stopping_criteria=MaxIteration(value=20),
            self_partition=True,
            test_ratio=0.3,
            initial_label_rate=0.05,
            all_class=True
        )

        start_time = time.time()
        result = experiment.evaluate(client=self.__client, verbose=True)
        print()
        print("---Active Learning experiment %s seconds ---" % (time.time() - start_time))

        query_analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=[metric.metric_name for metric in performance_metrics],
            method_name=self.__query_strategy.query_function_name,
            method_results=result,
            type="queries"
        )

        # plot the learning curves of the experiment
        query_analyser.plot_learning_curves(title='Active Learning experiment results')

        regressor = result[0].ml_technique

        # plot the estimation of the trained regressor
        with plt.style.context('seaborn-white'):
            plt.figure(figsize=(14, 7))
            x = np.linspace(0, 20, 1000)
            pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
            plt.plot(x, pred)
            plt.fill_between(x, pred.reshape(-1, ) - std, pred.reshape(-1, ) + std, alpha=0.2)
            plt.scatter(self.__X_reg, self.__y_reg, c='k')
            plt.title('Trained model estimation')
            plt.show()
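Because the plot calls predict(..., return_std=True), self.__ml_technique_reg has to be a regressor that reports predictive uncertainty. A minimal sketch of such a regression fixture, assuming scikit-learn's GaussianProcessRegressor on a 1-D toy problem over the same [0, 20] range used by np.linspace above (hypothetical; the source never shows it):

    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF, WhiteKernel

    rng = np.random.RandomState(0)
    X_reg = rng.uniform(0, 20, 100).reshape(-1, 1)  # 1-D inputs in [0, 20]
    y_reg = np.sin(X_reg.ravel()) + rng.normal(scale=0.3, size=100)
    # a GP regressor supports predict(..., return_std=True)
    ml_technique_reg = GaussianProcessRegressor(kernel=RBF() + WhiteKernel())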
Example #5
    def test_hold_out_marginSamplingQuery_unlabelSetEmpty(self):

        ml_technique = LogisticRegression()
        stopping_criteria = UnlabelSetEmpty()
        query_strategy = QueryMarginSampling()
        performance_metrics = [
            Accuracy(), F1(average='weighted'),
            HammingLoss()
        ]
        # performance_metrics = [Mse(squared=False), Mse(squared=True)]

        # init the ALExperiment
        experiment = HoldOutExperiment(client=self.__client,
                                       X=self.__X,
                                       Y=self.__y,
                                       scenario_type=PoolBasedSamplingScenario,
                                       ml_technique=ml_technique,
                                       performance_metrics=performance_metrics,
                                       query_strategy=query_strategy,
                                       oracle=SimulatedOracle(labels=self.__y),
                                       stopping_criteria=stopping_criteria,
                                       self_partition=True,
                                       test_ratio=0.3,
                                       initial_label_rate=0.05,
                                       all_class=False)

        result = experiment.evaluate(client=self.__client, verbose=True)

        query_analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=[
                metric.metric_name for metric in performance_metrics
            ],
            method_name=query_strategy.query_function_name,
            method_results=result,
            type="queries")
        # plot the learning curves of the experiment
        query_analyser.plot_learning_curves(
            title='Active Learning experiment results')

        np.random.seed(0)
        indices = np.random.permutation(len(self.__X))
        iris_X_test = self.__X[indices[-10:]]
        print(result[0].ml_technique.predict(iris_X_test))
Example #6
    def test_cross_validation_randomQuery_unlabelSetEmpty_singleThread(self):

        ml_technique = LogisticRegression(solver='liblinear')
        stopping_criteria = MaxIteration(50)
        query_strategy = QueryInstanceRandom()
        performance_metrics = [
            Accuracy(),
            F1(average='macro'),
            HammingLoss(),
            Precision(average='macro'),
            Recall(average='macro')
        ]

        # init the ALExperiment
        experiment = CrossValidationExperiment(
            self.__X,
            self.__y,
            scenario_type=PoolBasedSamplingScenario,
            ml_technique=ml_technique,
            performance_metrics=performance_metrics,
            query_strategy=query_strategy,
            oracle=SimulatedOracle(labels=self.__y),
            stopping_criteria=stopping_criteria,
            self_partition=True,
            kfolds=10,
            test_ratio=0.3,
            initial_label_rate=0.05,
            all_class=True)

        results = experiment.evaluate(verbose=False)

        for result in results:
            query_analyser = ExperimentAnalyserFactory.experiment_analyser(
                performance_metrics=[
                    metric.metric_name for metric in performance_metrics
                ],
                method_name=query_strategy.query_function_name,
                method_results=result,
                type="queries")

            # plot the learning curves of the experiment
            query_analyser.plot_learning_curves(
                title='Active Learning experiment results')
Example #7
    def test_hold_out_randomQuery_unlabelSetEmpty_ConsoleHumanOracle(self):
        ml_technique = LogisticRegression(solver='sag')
        stopping_criteria = MaxIteration(5)
        query_strategy = QueryInstanceRandom()
        performance_metrics = [
            Accuracy(), F1(average='weighted'),
            HammingLoss()
        ]

        # init the ALExperiment
        experiment = HoldOutExperiment(
            client=self.__client,
            X=self.__X,
            Y=self.__y,
            scenario_type=PoolBasedSamplingScenario,
            ml_technique=ml_technique,
            performance_metrics=performance_metrics,
            query_strategy=query_strategy,
            oracle=ConsoleHumanOracle(labels=self.__y),
            stopping_criteria=stopping_criteria,
            self_partition=True,
            test_ratio=0.3,
            initial_label_rate=0.05,
            all_class=True)

        start_time = time.time()
        result = experiment.evaluate(client=self.__client, verbose=True)
        print()
        print("---Active Learning experiment %s seconds ---" %
              (time.time() - start_time))

        query_analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=[
                metric.metric_name for metric in performance_metrics
            ],
            method_name=query_strategy.query_function_name,
            method_results=result,
            type="queries")

        # plot the learning curves of the experiment
        query_analyser.plot_learning_curves(
            title='Active Learning experiment results')
Example #8
    def test_hold_out_marginSamplingQuery_unlabelSetEmpty(self):

        ml_technique = LogisticRegression(solver='liblinear')
        stopping_criteria = MaxIteration(50)
        query_strategy = QueryMarginSampling()
        performance_metrics = [
            Accuracy(),
            F1(average='macro'),
            HammingLoss(),
            Precision(average='macro'),
            Recall(average='macro')
        ]

        # init the ALExperiment
        experiment = HoldOutExperiment(client=None,
                                       X=self.__X.to_numpy(),
                                       Y=self.__y.to_numpy(),
                                       scenario_type=PoolBasedSamplingScenario,
                                       ml_technique=ml_technique,
                                       performance_metrics=performance_metrics,
                                       query_strategy=query_strategy,
                                       oracle=SimulatedOracle(labels=self.__y),
                                       stopping_criteria=stopping_criteria,
                                       self_partition=True,
                                       test_ratio=0.3,
                                       initial_label_rate=0.05,
                                       all_class=False)

        result = experiment.evaluate(verbose=False)

        query_analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=[
                metric.metric_name for metric in performance_metrics
            ],
            method_name=query_strategy.query_function_name,
            method_results=result,
            type="queries")
        # plot the learning curves of the experiment
        query_analyser.plot_learning_curves(
            title='Active Learning experiment results')
Example #9
    def test_ActiveLearning_HoldOut(self):

        # INIT the ALExperiment ----------------------------------------------------------------------------------------
        al_ml_technique = LogisticRegression(solver='sag')
        stopping_criteria = MaxIteration(10)
        query_strategy = QueryMarginSampling()
        performance_metrics = [
            Accuracy(),
            F1(average='macro'),
            HammingLoss(),
            Precision(average='macro'),
            Recall(average='macro')
        ]

        experiment = HoldOutExperiment(
            client=self.__client,
            X=self.__X.to_numpy(),
            Y=self.__y['BAD'].to_numpy(),
            scenario_type=PoolBasedSamplingScenario,
            train_idx=self.__train_idx,
            test_idx=self.__test_idx,
            label_idx=self.__label_idx,
            unlabel_idx=self.__unlabel_idx,
            ml_technique=al_ml_technique,
            performance_metrics=performance_metrics,
            query_strategy=query_strategy,
            oracle=SimulatedOracle(labels=self.__y['BAD'].to_numpy()),
            stopping_criteria=stopping_criteria,
            self_partition=False,
            rebalance=True,
            batch_size=50)

        print("")
        start_time = time.time()
        result = experiment.evaluate(verbose=True)
        print("---Active Learning experiment %s seconds ---" %
              (time.time() - start_time))

        query_analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=[
                metric.metric_name for metric in performance_metrics
            ],
            method_name=query_strategy.query_function_name,
            method_results=result,
            type="queries")
        # plot the learning curves of the experiment
        query_analyser.plot_learning_curves(
            title='Active Learning experiment results')

        foldIndex = 0
        train_x = self.__X.iloc[self.__train_idx[foldIndex], :]
        train_y = self.__y.iloc[self.__train_idx[foldIndex], :]
        test_x = self.__X.iloc[self.__test_idx[foldIndex], :]
        test_y = self.__y.iloc[self.__test_idx[foldIndex], :]

        active_y_pred = result[0].ml_technique.predict(test_x)

        print("Active Learning Accuracy score : ",
              accuracy_score(test_y, active_y_pred))
        print(
            "Active Learning F1 score: ",
            f1_score(test_y, active_y_pred, average='macro', zero_division=0))
        print("Active Learning Hamming Loss",
              hamming_loss(test_y, active_y_pred))
        print(
            "Active Learning Precision score : ",
            precision_score(test_y,
                            active_y_pred,
                            average='macro',
                            zero_division=0))
        print(
            "Active Learning Recall score : ",
            recall_score(test_y,
                         active_y_pred,
                         average='macro',
                         zero_division=0))

        # END the ALExperiment -----------------------------------------------------------------------------------------

        # INIT the PLExperiment ----------------------------------------------------------------------------------------
        pl_ml_technique = LogisticRegression(solver='liblinear')

        print("")
        start_time = time.time()
        pl_ml_technique.fit(train_x, train_y)
        print("---Passive Learning experiment %s seconds ---" %
              (time.time() - start_time))

        passive_y_pred = pl_ml_technique.predict(test_x)

        print("Pasive Learning Accuracy score : ",
              accuracy_score(test_y, passive_y_pred))
        print(
            "Pasive Learning F1 score: ",
            f1_score(test_y, passive_y_pred, average='macro', zero_division=0))
        print("Pasive Learning Hamming Loss",
              hamming_loss(test_y, passive_y_pred))
        print(
            "Pasive Learning Precision score : ",
            precision_score(test_y,
                            passive_y_pred,
                            average='macro',
                            zero_division=0))
        print(
            "Pasive Learning Recall score : ",
            recall_score(test_y,
                         passive_y_pred,
                         average='macro',
                         zero_division=0))
Example #10
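This example relies on a number of symbols the snippet never imports. The block below is a sketch of the assumed Keras/TensorFlow imports (paths shown for standalone Keras 2.x; under tf.keras the prefixes differ). Library-specific names such as split, IndexCollection, State, StateItem and the query strategies are omitted because the source does not show their module paths:

    import time
    import numpy as np
    import matplotlib.pyplot as plt
    import keras
    from keras import backend as K
    from keras.datasets import mnist
    from keras.models import Sequential
    from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense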
    def test_keras_digits_recognition_active_learning(self):

        # load the data - it returns 2 tuples of digits & labels - one for training and one for testing
        (x_train, y_train), (x_test, y_test) = mnist.load_data()

        batch_size = 1024
        num_classes = 10
        epochs = 3

        # input image dimensions
        img_rows, img_cols = 28, 28

        # display 14 random images from the training set
        np.random.seed(123)

        rand_14 = np.random.randint(0, x_train.shape[0], 14)
        sample_digits = x_train[rand_14]
        sample_labels = y_train[rand_14]
        num_rows, num_cols = 2, 7
        f, ax = plt.subplots(num_rows,
                             num_cols,
                             figsize=(12, 5),
                             gridspec_kw={
                                 'wspace': 0.03,
                                 'hspace': 0.01
                             },
                             squeeze=True)

        for r in range(num_rows):
            for c in range(num_cols):
                image_index = r * 7 + c
                ax[r, c].axis("off")
                ax[r, c].imshow(sample_digits[image_index], cmap='gray')
                ax[r, c].set_title('No. %d' % sample_labels[image_index])
        plt.show()
        plt.close()

        if K.image_data_format() == 'channels_first':
            x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
            x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
            input_shape = (1, img_rows, img_cols)
        else:
            x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
            x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
            input_shape = (img_rows, img_cols, 1)

        x_train = x_train.astype('float32')
        x_test = x_test.astype('float32')
        x_train /= 255
        x_test /= 255

        ml_technique = Sequential()
        ml_technique.add(
            Conv2D(32,
                   kernel_size=(3, 3),
                   activation='relu',
                   input_shape=input_shape))
        ml_technique.add(Conv2D(64, (3, 3), activation='relu'))
        ml_technique.add(MaxPooling2D(pool_size=(2, 2)))
        ml_technique.add(Dropout(0.25))
        ml_technique.add(Flatten())
        ml_technique.add(Dense(128, activation='relu'))
        ml_technique.add(Dropout(0.5))
        ml_technique.add(Dense(num_classes, activation='softmax'))
        ml_technique.compile(optimizer='Adam',
                             loss='categorical_crossentropy',
                             metrics=['accuracy'])

        # convert class vectors to binary class matrices
        y_train = keras.utils.to_categorical(y_train, num_classes)
        y_test = keras.utils.to_categorical(y_test, num_classes)

        X = np.concatenate((x_train, x_test))
        y = np.concatenate((y_train, y_test))

        train_idx, test_idx, label_idx, unlabel_idx = split(
            X=X,
            y=y,
            test_ratio=0.3,
            initial_label_rate=0.05,
            split_count=1,
            all_class=True)

        # convert to indexed collection
        train_idx = IndexCollection(train_idx[0])
        test_idx = IndexCollection(test_idx[0])
        label_idx = IndexCollection(label_idx[0])
        unlabel_idx = IndexCollection(unlabel_idx[0])

        # Define the active learning components
        stopping_criteria = MaxIteration(10)
        query_strategy = QueryLeastConfidentSampling()
        oracle = SimulatedOracle(labels=y)

        start_time = time.time()
        experimentState = State(
            round=0,
            train_idx=train_idx,
            test_idx=test_idx,
            init_L=label_idx,
            init_U=unlabel_idx,
            performance_metrics=["loss", "accuracy"],
            verbose=True)

        while not stopping_criteria.is_stop() and len(unlabel_idx) > 0:
            label_x = X[label_idx.index, :]
            label_y = y[label_idx.index]
            test_x = X[test_idx, :]
            test_y = y[test_idx]

            # Train and evaluate Model over the labeled instances
            ml_technique.fit(label_x,
                             label_y,
                             batch_size=batch_size,
                             epochs=epochs,
                             verbose=True,
                             validation_data=(test_x, test_y))

            # predict the classes of the test instances
            label_pred = ml_technique.predict_classes(test_x)

            # performance calc for all metrics
            label_perf = []
            score = ml_technique.evaluate(x_test, y_test, verbose=1)

            label_perf.append({"name": "loss", "value": score[0]})
            label_perf.append({"name": "accuracy", "value": score[1]})

            # use the query strategy for selecting the indexes
            select_ind = query_strategy.select(X=X,
                                               y=y,
                                               label_index=label_idx,
                                               unlabel_index=unlabel_idx,
                                               batch_size=batch_size,
                                               model=ml_technique,
                                               client=self.__client)

            # query the oracle for the labels of the selected instances
            oracle.query(instances=X[select_ind], indexes=select_ind)

            # update the labeled and unlabeled index sets
            label_idx.update(select_ind)
            unlabel_idx.difference_update(select_ind)

            # save intermediate results
            experimentState.add_state(
                StateItem(select_index=select_ind,
                          performance_metrics=[
                              metric['name'] for metric in label_perf
                          ],
                          performance=label_perf))

            # update stopping_criteria
            stopping_criteria.update_information(experimentState)

        end_time = time.time() - start_time
        print(end_time)

        query_analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=["loss", "accuracy"],
            method_name=query_strategy.query_function_name,
            method_results=[experimentState],
            type="queries")

        # plot the learning curves of the experiment
        query_analyser.plot_learning_curves(
            title='Active Learning experiment results')
Example #11
    def test_custom_activeLearning_keras(self):
        batch_size = 5
        epochs = 20

        # partition the data
        train_idx, test_idx, label_idx, unlabel_idx = split(
            X=self.__X,
            y=self.__y,
            test_ratio=0.3,
            initial_label_rate=0.05,
            split_count=1,
            all_class=True)

        # convert to indexed collection
        train_idx = IndexCollection(train_idx[0])
        test_idx = IndexCollection(test_idx[0])
        label_idx = IndexCollection(label_idx[0])
        unlabel_idx = IndexCollection(unlabel_idx[0])

        # Create the model
        ml_technique = Sequential()
        ml_technique.add(Dense(units=30, input_dim=30))
        ml_technique.add(Dense(units=30))
        ml_technique.add(Dense(units=2))
        ml_technique.add(Activation('softmax'))
        ml_technique.compile(loss='sparse_categorical_crossentropy',
                             optimizer='adam',
                             metrics=['accuracy'])

        # Define the active learning components
        stopping_criteria = MaxIteration(10)
        query_strategy = QueryLeastConfidentSampling()
        performance_metrics = [
            Accuracy(), F1(average='weighted'),
            HammingLoss()
        ]
        oracle = SimulatedOracle(labels=self.__y)

        start_time = time.time()
        experimentState = State(round=0,
                                train_idx=train_idx,
                                test_idx=test_idx,
                                init_L=label_idx,
                                init_U=unlabel_idx,
                                performance_metrics=[
                                    metric.metric_name
                                    for metric in performance_metrics
                                ],
                                verbose=True)

        while not stopping_criteria.is_stop() and len(unlabel_idx) > 0:

            label_x = self.__X[label_idx.index, :]
            label_y = self.__y[label_idx.index]
            test_x = self.__X[test_idx, :]
            test_y = self.__y[test_idx]

            # Train and evaluate Model over the labeled instances
            ml_technique.fit(label_x,
                             label_y,
                             batch_size=batch_size,
                             epochs=epochs,
                             verbose=True)

            # predict the classes of the test instances
            label_pred = ml_technique.predict_classes(test_x)

            # performance calc for all metrics
            label_perf = []
            for metric in performance_metrics:
                value = metric.compute(y_true=test_y, y_pred=label_pred)
                label_perf.append({"name": metric.metric_name, "value": value})

            # use the query strategy for selecting the indexes
            select_ind = query_strategy.select(X=self.__X,
                                               y=self.__y,
                                               label_index=label_idx,
                                               unlabel_index=unlabel_idx,
                                               batch_size=batch_size,
                                               model=ml_technique,
                                               client=self.__client)

            # query the oracle for the labels of the selected instances
            oracle.query(instances=self.__X[select_ind], indexes=select_ind)

            # update the labeled and unlabeled index sets
            label_idx.update(select_ind)
            unlabel_idx.difference_update(select_ind)

            # save intermediate results
            experimentState.add_state(
                StateItem(select_index=select_ind,
                          performance_metrics=[
                              metric['name'] for metric in label_perf
                          ],
                          performance=label_perf))

            # update stopping_criteria
            stopping_criteria.update_information(experimentState)

        end_time = time.time() - start_time
        print(end_time)

        query_analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=[
                metric.metric_name for metric in performance_metrics
            ],
            method_name=query_strategy.query_function_name,
            method_results=[experimentState],
            type="queries")

        # plot the learning curves of the experiment
        query_analyser.plot_learning_curves(
            title='Active Learning experiment results')
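The network in this last example takes 30 input features and emits 2 softmax classes trained with sparse_categorical_crossentropy, which matches a binary dataset with 30 features such as scikit-learn's breast-cancer data. A hypothetical fixture consistent with those shapes (the source does not show it):

    from sklearn.datasets import load_breast_cancer

    # 569 samples x 30 features, binary labels - fits input_dim=30, units=2
    X, y = load_breast_cancer(return_X_y=True)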