def test_classification_pandas_support(self):
    """Fit AutoSklearnClassifier on a pandas DataFrame/Series pair and check
    that score, refit, and predict all work end-to-end on DataFrame input.

    Uses OpenML data_id=2 (a mixed categorical/numerical dataset) fetched as a
    frame so the DataFrame code path is actually exercised.
    """
    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )
    # Drop columns containing NaN values. FIX: pass `axis` as a keyword —
    # positional `axis` to DataFrame.dropna was deprecated in pandas 1.5 and
    # removed in pandas 2.0 (raises TypeError there).
    X = X.dropna(axis='columns')

    # This test only makes sense if the input really is a dataframe
    self.assertTrue(isinstance(X, pd.DataFrame))
    self.assertTrue(isinstance(y, pd.Series))

    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        seed=5,
    )
    automl.fit(X, y)

    # Make sure that at least better than random.
    # We use same X_train==X_test to test code quality
    self.assertTrue(automl.score(X, y) > 0.555)

    automl.refit(X, y)

    # Make sure that at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    y = automl._automl[0].InputValidator.encode_target(y)
    prediction = automl._automl[0].InputValidator.encode_target(automl.predict(X))
    self.assertTrue(accuracy(y, prediction) > 0.555)
def test_binary(self):
    """Fit on a binarized iris dataset and check prediction shape, accuracy,
    and that the expected test-prediction file is written to the output folder.
    """
    # FIX: the folder suffixes were swapped — `tmp` previously pointed at
    # '.out_binary_fit' and `output` at '.tmp_binary_fit'. Each name now
    # matches its role.
    tmp = os.path.join(self.test_dir, '..', '.tmp_binary_fit')
    output = os.path.join(self.test_dir, '..', '.out_binary_fit')
    self._setUp(output)
    self._setUp(tmp)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris', make_binary=True)
    automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                   per_run_time_limit=5,
                                   tmp_folder=tmp,
                                   output_folder=output)
    automl.fit(X_train, Y_train, X_test=X_test, y_test=Y_test,
               dataset_name='binary_test_dataset')

    predictions = automl.predict(X_test)
    # Binarized iris keeps all 150 samples; the default split leaves 50 for test.
    self.assertEqual(predictions.shape, (50, ))

    score = accuracy(Y_test, predictions)
    self.assertGreaterEqual(score, 0.9)

    output_files = os.listdir(output)
    self.assertIn('binary_test_dataset_test_1.predict', output_files)
def test_binary(tmp_dir, output_dir, dask_client):
    """Fit on a binarized iris dataset; verify prediction shape, accuracy,
    at least one successful run, and that test-prediction files were written.
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris', make_binary=True)

    classifier = AutoSklearnClassifier(
        time_left_for_this_task=40,
        per_run_time_limit=10,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    classifier.fit(
        X_train,
        Y_train,
        X_test=X_test,
        y_test=Y_test,
        dataset_name='binary_test_dataset',
    )

    preds = classifier.predict(X_test)
    debug = print_debug_information(classifier)
    assert preds.shape == (50, ), debug

    assert accuracy(Y_test, preds) > 0.9, debug
    assert count_succeses(classifier.cv_results_) > 0, debug

    # The estimator should have dumped its test predictions to disk.
    pattern = os.path.join(output_dir, 'binary_test_dataset_test_*.predict')
    written = glob.glob(pattern)
    assert len(written) > 0, (written, debug)
def test_binary(tmp_dir, dask_client):
    """Fit on a binarized iris dataset; verify prediction shape, accuracy,
    successful runs, and a plausible performance-over-time record.
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris', make_binary=True)

    classifier = AutoSklearnClassifier(
        time_left_for_this_task=40,
        delete_tmp_folder_after_terminate=False,
        per_run_time_limit=10,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
    )
    classifier.fit(
        X_train,
        Y_train,
        X_test=X_test,
        y_test=Y_test,
        dataset_name='binary_test_dataset',
    )

    preds = classifier.predict(X_test)
    debug = print_debug_information(classifier)
    assert preds.shape == (50, ), debug

    assert accuracy(Y_test, preds) > 0.9, debug
    assert count_succeses(classifier.cv_results_) > 0, debug

    # Performance-over-time bookkeeping should contain every expected score
    # column and be internally consistent.
    assert includes_all_scores(classifier.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(classifier.performance_over_time_) is True
def test_classification_pandas_support(tmp_dir, output_dir, dask_client):
    """Fit AutoSklearnClassifier on a pandas DataFrame/Series pair and check
    that score, refit, and predict all work end-to-end on DataFrame input,
    along with run bookkeeping (cv_results_, performance_over_time_).
    """
    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )
    # Drop columns containing NaN values. FIX: pass `axis` as a keyword —
    # positional `axis` to DataFrame.dropna was deprecated in pandas 1.5 and
    # removed in pandas 2.0 (raises TypeError there).
    X = X.dropna(axis='columns')

    # This test only makes sense if the input really is a dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)

    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        dask_client=dask_client,
        seed=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )
    automl.fit(X, y)

    # Make sure that at least better than random.
    # We use same X_train==X_test to test code quality
    assert automl.score(X, y) > 0.555, print_debug_information(automl)

    automl.refit(X, y)

    # Make sure that at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    prediction = automl.predict(X)
    assert accuracy(y, prediction) > 0.555
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(automl.performance_over_time_) is True
def test_classification_pandas_support(tmp_dir, output_dir, dask_client):
    """Fit AutoSklearnClassifier on a pandas DataFrame/Series pair and check
    that score, refit, and predict all work end-to-end on DataFrame input.

    On failure the AutoML log file is attached to the assertion message to aid
    debugging.
    """
    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )
    # Drop columns containing NaN values. FIX: pass `axis` as a keyword —
    # positional `axis` to DataFrame.dropna was deprecated in pandas 1.5 and
    # removed in pandas 2.0 (raises TypeError there).
    X = X.dropna(axis='columns')

    # This test only makes sense if the input really is a dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)

    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        dask_client=dask_client,
        seed=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )
    automl.fit(X, y)

    # Locate the AutoML log so failures can report its contents.
    log_file_path = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))[0]

    # Make sure that at least better than random.
    # We use same X_train==X_test to test code quality
    assert automl.score(X, y) > 0.555, extract_msg_from_log(log_file_path)

    automl.refit(X, y)

    # Make sure that at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    y = automl.automl_.InputValidator.encode_target(y)
    prediction = automl.automl_.InputValidator.encode_target(automl.predict(X))
    assert accuracy(y, prediction) > 0.555
    assert count_succeses(automl.cv_results_) > 0
def test_binary(self):
    """Fit on a binarized iris dataset and check prediction shape, accuracy,
    and that the expected test-prediction file is written to the output folder.
    """
    # FIX: the folder suffixes were swapped — `tmp` previously pointed at
    # '.out_binary_fit' and `output` at '.tmp_binary_fit'. Each name now
    # matches its role.
    tmp = os.path.join(self.test_dir, '..', '.tmp_binary_fit')
    output = os.path.join(self.test_dir, '..', '.out_binary_fit')
    self._setUp(output)
    self._setUp(tmp)

    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'iris', make_binary=True)
    automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                   per_run_time_limit=5,
                                   tmp_folder=tmp,
                                   output_folder=output)
    automl.fit(X_train, Y_train, X_test=X_test, y_test=Y_test,
               dataset_name='binary_test_dataset')

    predictions = automl.predict(X_test)
    # Binarized iris keeps all 150 samples; the default split leaves 50 for test.
    self.assertEqual(predictions.shape, (50, ))

    score = accuracy(Y_test, predictions)
    self.assertGreaterEqual(score, 0.9)

    output_files = os.listdir(output)
    self.assertIn('binary_test_dataset_test_1.predict', output_files)