Example #1
# Imports assumed by this example (paths follow the secml layout used
# elsewhere in this file; newer secml releases use `secml.ml.kernels`).
import numpy as np
import sklearn.metrics as skm

from secml.array import CArray
from secml.data.loader import CDLRandom
from secml.data.splitter import CDataSplitter
from secml.ml.classifiers import CClassifierSVM
from secml.ml.classifiers.multiclass import CClassifierMulticlassOVA
from secml.ml.kernel import CKernel
from secml.ml.peval import CPerfEvaluatorXVal
from secml.ml.peval.metrics import CMetric
from secml.testing import CUnitTest

# CMetricFirstNan and CMetricAllNan are custom test metrics defined in the
# original secml test module (they return nan scores); their definitions are
# not included in this excerpt.

class TestCPerfEvaluator(CUnitTest):
    """Unit test for CKernel."""
    def setUp(self):

        # Create a dummy dataset (the test set should differ from the training set)
        loader = CDLRandom(random_state=50000)
        self.training_dataset = loader.load()
        self.test_dataset = loader.load()

        # Create the classifier (SVM with an RBF kernel)
        kernel = CKernel.create('rbf')
        self.svm = CClassifierSVM(kernel=kernel)
        self.svm.verbose = 1

        self.logger.info("Using kernel {:}".format(self.svm.kernel.class_type))

    def test_parameters_setting(self):

        # Change the default parameters to make sure they are not used
        self.svm.set_params({'C': 25, 'kernel.gamma': 1e-1, 'n_jobs': 2})

        xval_parameters = {'C': [1, 10, 100], 'kernel.gamma': [1, 50]}

        # Run cross-validation to choose the best parameters
        xval_splitter = CDataSplitter.create('kfold',
                                             num_folds=5,
                                             random_state=50000)

        # Set the best parameters inside the classifier
        self.svm.estimate_parameters(self.training_dataset, xval_parameters,
                                     xval_splitter, 'accuracy')

        self.logger.info("SVM has now the following parameters: {:}".format(
            self.svm.get_params()))

        self.assertEqual(self.svm.get_params()['C'], 1)
        self.assertEqual(self.svm.get_params()['kernel.gamma'], 50)

        # Now we compare the parameters chosen before with a new evaluator
        perf_eval = CPerfEvaluatorXVal(xval_splitter,
                                       CMetric.create('accuracy'))
        perf_eval.verbose = 1

        best_params, best_score = perf_eval.evaluate_params(
            self.svm, self.training_dataset, xval_parameters)

        for param in xval_parameters:
            self.logger.info("Best '{:}' is: {:}".format(
                param, best_params[param]))
            self.assertEqual(best_params[param], self.svm.get_params()[param])

        self.svm.verbose = 0

        parameters_combination = [[1, 1], [1, 50], [10, 1], [10, 50], [100, 1],
                                  [100, 50]]
        par_comb_score = CArray.zeros(len(parameters_combination))
        for comb in range(len(parameters_combination)):

            this_fold_score = []
            num_xval_fold = len(xval_splitter.tr_idx)

            for f in range(num_xval_fold):
                self.svm.set("C", parameters_combination[comb][0])
                self.svm.kernel.gamma = parameters_combination[comb][1]

                self.svm.fit(
                    self.training_dataset[xval_splitter.tr_idx[f], :].X,
                    self.training_dataset[xval_splitter.tr_idx[f], :].Y)

                this_fold_predicted = self.svm.predict(
                    self.training_dataset[xval_splitter.ts_idx[f], :].X)

                this_fold_accuracy = skm.accuracy_score(
                    self.training_dataset[
                        xval_splitter.ts_idx[f], :].Y.get_data(),
                    this_fold_predicted.get_data())
                this_fold_score.append(this_fold_accuracy)

            par_comb_score[comb] = np.mean(this_fold_score)
            self.logger.info("Mean score across folds: {:}".format(
                par_comb_score[comb]))

        max_combination_score = par_comb_score.max()
        best_param_comb = parameters_combination[par_comb_score.argmax()]
        self.logger.info("Maximum combination score found here: {:}".format(
            max_combination_score))
        self.logger.info(
            "Best score found during xval: {:}".format(best_score))

        self.assertEqual(max_combination_score, best_score)

        # Set the parameters found by xval and check they match those chosen here
        self.logger.info("The parameters selected by our own xval are:")
        self.svm.set_params(best_params)
        self.logger.info("C: {:}".format(self.svm.C))
        self.logger.info("kernel.gamma: {:}".format(self.svm.kernel.gamma))
        # Check C
        self.assertEqual(best_param_comb[0], self.svm.C)
        # Check gamma
        self.assertEqual(best_param_comb[1], self.svm.kernel.gamma)

    def test_nan_metric_value(self):

        # Change the default parameters to make sure they are not used
        self.svm.set_params({'C': 25, 'kernel.gamma': 1e-1})
        xval_parameters = {'C': [1, 10, 100], 'kernel.gamma': [1, 50]}

        # Run cross-validation to choose the best parameters
        xval_splitter = CDataSplitter.create('kfold',
                                             num_folds=5,
                                             random_state=50000)

        self.logger.info("Testing metric with some nan")

        some_nan_metric = CMetricFirstNan()

        # Evaluate the parameters with a metric that returns some nan values
        perf_eval = CPerfEvaluatorXVal(xval_splitter, some_nan_metric)
        perf_eval.verbose = 1

        best_params, best_score = perf_eval.evaluate_params(
            self.svm, self.training_dataset, xval_parameters, pick='last')

        self.logger.info("best score : {:}".format(best_score))

        # The xval should select the only actual value (the others are nan)
        self.assertEqual(best_score, 1.)

        self.logger.info("Testing metric with all nan")

        # This test case involves an all-nan slice
        self.logger.filterwarnings(action="ignore",
                                   message="All-NaN slice encountered",
                                   category=RuntimeWarning)

        all_nan_metric = CMetricAllNan()

        # Evaluate the parameters with a metric that returns only nan values
        perf_eval = CPerfEvaluatorXVal(xval_splitter, all_nan_metric)
        perf_eval.verbose = 1

        with self.assertRaises(ValueError):
            perf_eval.evaluate_params(self.svm,
                                      self.training_dataset,
                                      xval_parameters,
                                      pick='last')

    def _run_multiclass(self, tr, multiclass, xval_params, expected_best):

        xval_splitter = CDataSplitter.create('kfold',
                                             num_folds=3,
                                             random_state=50000)

        # Set the best parameters inside the classifier
        best_params = multiclass.estimate_parameters(tr, xval_params,
                                                     xval_splitter, 'accuracy')

        self.logger.info(
            "The multiclass SVM now has the following parameters: {:}".format(
                multiclass.get_params()))

        for clf in multiclass._binary_classifiers:
            self.assertEqual(clf.C, expected_best['C'])
            self.assertEqual(clf.kernel.gamma, expected_best['kernel.gamma'])

        # Final test: fit using best parameters
        multiclass.fit(tr.X, tr.Y)

        for clf in multiclass._binary_classifiers:
            for param in best_params:
                self.assertEqual(clf.get_params()[param], best_params[param])

    def test_params_multiclass(self):
        """Parameter estimation for multiclass classifiers."""
        # Create a dummy multiclass dataset
        tr = CDLRandom(n_classes=4, n_clusters_per_class=1,
                       random_state=50000).load()

        kernel = CKernel.create('rbf')
        multiclass = CClassifierMulticlassOVA(CClassifierSVM,
                                              C=1,
                                              kernel=kernel)
        multiclass.verbose = 1

        xval_parameters = {'C': [1, 10, 100], 'kernel.gamma': [0.1, 1]}

        expected = {'C': 10.0, 'kernel.gamma': 0.1}

        self._run_multiclass(tr, multiclass, xval_parameters, expected)

        self.logger.info("Testing with preprocessor")

        kernel = CKernel.create('rbf')
        multiclass = CClassifierMulticlassOVA(CClassifierSVM,
                                              C=1,
                                              kernel=kernel,
                                              preprocess='min-max')
        multiclass.verbose = 1

        xval_parameters = {'C': [1, 10, 100], 'kernel.gamma': [0.1, 1]}

        expected = {'C': 10.0, 'kernel.gamma': 0.1}

        self._run_multiclass(tr, multiclass, xval_parameters, expected)
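
# To run the test class above standalone, following secml's test-module
# convention (a sketch; assumes CUnitTest exposes unittest's main):
if __name__ == '__main__':
    CUnitTest.main()
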
# Train a classifier (SVM with a linear kernel)
from secml.ml.classifiers import CClassifierSVM
clf_lin = CClassifierSVM()

# The problem seems linearly separable -> also try a logistic regression
# classifier, without any parameter estimation
from secml.ml.classifiers import CClassifierLogistic
#clf_l = CClassifierLogistic()

xval_lin_params = {'C': [0.01, 0.1, 1, 10, 100]}
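
# NOTE (assumption): `tr_set` and `xval_splitter` are defined in earlier
# cells of the original notebook. A minimal stand-in so this cell runs on
# its own (hypothetical data, not the author's):
from secml.data.splitter import CDataSplitterKFold
tr_set = CDLRandom(n_samples=300, random_state=0).load()
xval_splitter = CDataSplitterKFold(num_folds=3, random_state=0)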

# Select and set the best training parameters for the linear classifier
print("Estimating the best training parameters for linear kernel...")
best_lin_params = clf_lin.estimate_parameters(dataset=tr_set,
                                              parameters=xval_lin_params,
                                              splitter=xval_splitter,
                                              metric='accuracy',
                                              perf_evaluator='xval')
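
# `estimate_parameters` sets the chosen values inside `clf_lin` and also
# returns them, so the fit below already uses the tuned parameters.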

clf_lin.fit(tr_set)

## Select and set the best training parameters for the linear classifier
#print("Estimating the best training parameters for linear kernel...")
#best_lin_params = clf_l.estimate_parameters(
#    dataset=tr_set,
#    parameters=xval_lin_params,
#    splitter=xval_splitter,
#    metric='accuracy',
#    perf_evaluator='xval'
#)
# Second cell: repeat the linear-SVM parameter estimation on the encoded
# sample dataset
clf_lin = CClassifierSVM()

#from secml.ml.kernel import CKernelRBF
#clf_rbf = CClassifierSVM(kernel=CKernelRBF())

xval_lin_params = {'C': [0.01, 0.1, 1, 10, 100]}
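
# NOTE (assumption): `data_smp_encoded_secML` below is a CDataset prepared in
# an earlier cell of the original notebook (the encoded sample data); it is
# left as-is here.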

# Select and set the best training parameters for the linear classifier
print("Estimating the best training parameters for linear kernel...")
best_lin_params = clf_lin.estimate_parameters(dataset=data_smp_encoded_secML,
                                              parameters=xval_lin_params,
                                              splitter=xval_splitter,
                                              metric='accuracy',
                                              perf_evaluator='xval')

# Select and set the best training parameters for the RBF classifier
#print("Estimating the best training parameters for RBF kernel...")
#best_rbf_params = clf_rbf.estimate_parameters(
#    dataset=tr,
#    parameters=xval_rbf_params,
#    splitter=xval_splitter,
#    metric='accuracy',
#    perf_evaluator='xval'
#)
print(best_lin_params)
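
# NOTE (assumption): `random_state`, `tr`, `ts` and `metric` come from
# earlier cells of the original notebook. A minimal stand-in so the next
# cell runs on its own (hypothetical split and metric):
from secml.data.splitter import CTrainTestSplit
from secml.ml.peval.metrics import CMetricAccuracy

random_state = 999
ds_all = CDLRandom(n_samples=300, random_state=random_state).load()
tr, ts = CTrainTestSplit(train_size=0.7,
                         random_state=random_state).split(ds_all)
metric = CMetricAccuracy()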

# Train an SVM with an RBF kernel, tuning C and gamma via cross-validation
from secml.ml.kernel import CKernelRBF

clf = CClassifierSVM(kernel=CKernelRBF())

# Parameters for the Cross-Validation procedure
xval_params = {'C': [1e-2, 0.1, 1], 'kernel.gamma': [10, 100, 1e3]}

# Let's create a 3-Fold data splitter
from secml.data.splitter import CDataSplitterKFold
xval_splitter = CDataSplitterKFold(num_folds=3, random_state=random_state)

# Select and set the best training parameters for the classifier
print("Estimating the best training parameters...")
best_params = clf.estimate_parameters(dataset=tr,
                                      parameters=xval_params,
                                      splitter=xval_splitter,
                                      metric='accuracy',
                                      perf_evaluator='xval')
print("The best training parameters are: ", best_params)

# We can now fit the classifier
clf.fit(tr)

# Compute predictions on a test set
y_pred = clf.predict(ts.X)

# Evaluate the accuracy of the classifier
acc = metric.performance_score(y_true=ts.Y, y_pred=y_pred)

print("Accuracy on test set: {:.2%}".format(acc))