def test_classification_pandas_support(tmp_dir, dask_client):
    """Fit AutoSklearnClassifier directly on pandas input and sanity-check it.

    Downloads OpenML dataset 2 as a DataFrame/Series pair, drops the columns
    containing NaN, then checks that both the fitted and the refitted
    estimator score above 0.555 on the training data and that the run
    statistics exposed by the estimator pass the helper checks.

    Args:
        tmp_dir: Directory handed to the estimator as ``tmp_folder``.
        dask_client: Dask client handed to the estimator.
    """

    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop every column that contains NaN. The axis has to be given by
    # keyword: pandas deprecated positional arguments to dropna() and made
    # them keyword-only in 2.0, where dropna('columns') raises TypeError.
    X = X.dropna(axis='columns')

    # This test only makes sense if the input is a DataFrame / Series.
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude={'classifier': ['libsvm_svc']},
        dask_client=dask_client,
        seed=5,
        tmp_folder=tmp_dir,
    )

    automl.fit(X, y)

    # Make sure we are at least better than random.
    # X_train == X_test on purpose: this exercises the code path, it does not
    # measure generalisation.
    assert automl.score(X, y) > 0.555, print_debug_information(automl)

    automl.refit(X, y)

    # Make sure we are at least better than random after refitting as well.
    # The threshold is 0.555 rather than 0.5 because the dataset is
    # unbalanced.
    prediction = automl.predict(X)
    assert accuracy(y, prediction) > 0.555
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(
        automl.performance_over_time_) is True
Пример #2
0
def test_autosklearn_classification_methods_returns_self(dask_client):
    """
    Check that fit(), fit_ensemble() and refit() of AutoSklearnClassifier
    each return the very estimator instance they were called on.
    """
    # Only the training split of iris is needed; the test split is discarded.
    features, targets, _, _ = putil.get_dataset('iris')

    settings = dict(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        ensemble_size=0,
        dask_client=dask_client,
        exclude_preprocessors=['fast_ica'],
    )
    estimator = AutoSklearnClassifier(**settings)

    returned = estimator.fit(features, targets)
    assert estimator is returned

    returned = estimator.fit_ensemble(targets, ensemble_size=5)
    assert estimator is returned

    returned = estimator.refit(features.copy(), targets.copy())
    assert estimator is returned
Пример #3
0
    def classification(self, metric="accuracy"):
        """
        Perform auto_classification.

        Fits an AutoSklearnClassifier on copies of the training data, prints
        the selected models together with the training/test scores, appends
        the same report to ``best_auto_sklearn_output.log`` and pickles the
        classifier to ``automl_classification.dump.pkl``. Both files are
        placed in ``self.auto_sklearn_kwargs['output_folder']``.

        Args:
            metric (str): The evaluation metric of classification.
                 This will be mapped by AutoSklearnML.get_classification_metric
                 to an instance of :class:`autosklearn.metrics.Scorer` as
                 created by :meth:`autosklearn.metrics.make_scorer`.
                 Default metric: "accuracy".
                 Other supported metrics: "balanced_accuracy", "f1",
                                          "roc_auc", "average_precision",
                                          "precision", "recall"

        Returns:
            AutoSklearnClassifier: The fitted classifier (additionally
            refitted on the training data when the configured
            ``resampling_strategy`` is ``"cv"``).

        Raises:
            KeyError: If ``self.auto_sklearn_kwargs`` has no
                ``'output_folder'`` entry.
        """
        auto_classifier = AutoSklearnClassifier(**self.auto_sklearn_kwargs)
        classification_metric = AutoSklearnML.get_classification_metric(metric)
        auto_classifier.fit(self._X_train.copy(),
                            self._y_train.copy(),
                            metric=classification_metric,
                            dataset_name=self.dataset_name)

        # Queried once and reused for both the console and the log file.
        models_description = auto_classifier.show_models()
        print(models_description)

        # .get(): an unset resampling strategy must not raise KeyError after
        # the (expensive) fit has already completed.
        # NOTE(review): refit before predict is presumably needed for "cv"
        # runs - confirm against the auto-sklearn docs for the pinned version.
        if self.auto_sklearn_kwargs.get("resampling_strategy") == "cv":
            auto_classifier.refit(self._X_train.copy(), self._y_train.copy())

        # Each score is computed exactly once and shared by print and log.
        prediction_train = auto_classifier.predict(self._X_train)
        train_line = "training set {} score: {}".format(
            metric,
            classification_metric._score_func(self._y_train,
                                              prediction_train))
        print(train_line)

        prediction_test = auto_classifier.predict(self._X_test)
        test_line = "test set {} score: {}".format(
            metric,
            classification_metric._score_func(self._y_test, prediction_test))
        print(test_line)

        output_folder = self.auto_sklearn_kwargs['output_folder']

        with open(os.path.join(output_folder,
                               'best_auto_sklearn_output.log'), 'a+') as wf:
            wf.write('The best model is : \n')
            wf.write(models_description)
            wf.write("\n" + train_line + "\n")
            wf.write('\n')
            # The file is opened in append mode: end the record with a
            # newline so the next run does not continue on this line.
            wf.write(test_line + "\n")

        dump_file = os.path.join(output_folder,
                                 'automl_classification.dump.pkl')

        with open(dump_file, 'wb') as f:
            pickle.dump(auto_classifier, f)

        return auto_classifier