def test_classification_pandas_support(tmp_dir, dask_client):
    """Check that AutoSklearnClassifier can fit, score, refit and predict
    when the input is a pandas DataFrame / Series.
    """
    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop every column that contains NaN. `axis` is passed by keyword:
    # DataFrame.dropna() no longer accepts positional arguments in
    # pandas >= 2.0.
    X = X.dropna(axis='columns')

    # This test only makes sense if the input is a dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)

    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude={'classifier': ['libsvm_svc']},
        dask_client=dask_client,
        seed=5,
        tmp_folder=tmp_dir,
    )
    automl.fit(X, y)

    # Make sure that at least better than random.
    # We use same X_train==X_test to test code quality
    assert automl.score(X, y) > 0.555, print_debug_information(automl)

    automl.refit(X, y)

    # Make sure that at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    prediction = automl.predict(X)
    assert accuracy(y, prediction) > 0.555
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(
        automl.performance_over_time_) is True
def test_autosklearn_classification_methods_returns_self(dask_client):
    """Verify that fit(), fit_ensemble() and refit() of
    AutoSklearnClassifier each hand back the very estimator they were
    called on.
    """
    # Only the training split is needed; the test split is discarded.
    features, targets, _, _ = putil.get_dataset('iris')

    classifier = AutoSklearnClassifier(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        ensemble_size=0,
        dask_client=dask_client,
        exclude_preprocessors=['fast_ica'],
    )

    # Each call must return the same object, not a copy.
    assert classifier.fit(features, targets) is classifier
    assert classifier.fit_ensemble(targets, ensemble_size=5) is classifier
    assert classifier.refit(features.copy(), targets.copy()) is classifier
def classification(self, metric="accuracy"):
    """
    Perform auto_classification.

    Fits an AutoSklearnClassifier on the training data, prints the
    selected models and the train/test scores, appends the same report
    to ``best_auto_sklearn_output.log`` and pickles the classifier to
    ``automl_classification.dump.pkl`` (both inside the configured
    ``output_folder``).

    Args:
        metric (str): The evaluation metric of classification.
            This will be mapped by AutoSklearnML.get_classification_metric
            to an instance of :class:`autosklearn.metrics.Scorer` as
            created by :meth:`autosklearn.metrics.make_scorer`.
            Default metric: "accuracy".
            Other supported metrics: "balanced_accuracy", "f1",
                                     "roc_auc", "average_precision",
                                     "precision", "recall"

    Returns:
        AutoSklearnClassifier: the fitted classifier (additionally
        refitted on the training data when ``resampling_strategy`` is
        ``"cv"``).

    Raises:
        KeyError: if ``output_folder`` is missing from
            ``self.auto_sklearn_kwargs``.
    """
    auto_classifier = AutoSklearnClassifier(**self.auto_sklearn_kwargs)
    classification_metric = AutoSklearnML.get_classification_metric(metric)
    auto_classifier.fit(self._X_train.copy(),
                        self._y_train.copy(),
                        metric=classification_metric,
                        dataset_name=self.dataset_name)
    print(auto_classifier.show_models())

    # Use .get(): a configuration that does not set "resampling_strategy"
    # must skip the refit instead of failing with KeyError.
    if self.auto_sklearn_kwargs.get("resampling_strategy") == "cv":
        auto_classifier.refit(self._X_train.copy(), self._y_train.copy())

    score_func = classification_metric._score_func

    # Each score is computed once; the formatted line is reused for both
    # the console and the log file.
    prediction_train = auto_classifier.predict(self._X_train)
    train_report = "training set {} score: {}".format(
        metric, score_func(self._y_train, prediction_train))
    print(train_report)

    prediction_test = auto_classifier.predict(self._X_test)
    test_report = "test set {} score: {}".format(
        metric, score_func(self._y_test, prediction_test))
    print(test_report)

    output_folder = self.auto_sklearn_kwargs['output_folder']

    # Append so that reports from earlier runs are kept.
    with open(os.path.join(output_folder, 'best_auto_sklearn_output.log'),
              'a+') as wf:
        wf.write('The best model is : \n')
        wf.write(auto_classifier.show_models())
        wf.write("\n" + train_report + "\n")
        wf.write('\n')
        wf.write(test_report)

    dump_file = os.path.join(output_folder,
                             'automl_classification.dump.pkl')
    with open(dump_file, 'wb') as f:
        pickle.dump(auto_classifier, f)

    return auto_classifier