Exemplo n.º 1
0
def test_regression_pandas_support(tmp_dir, output_dir, dask_client):
    """Fit, score, refit and predict a regressor on pandas input.

    The OpenML dataset is fetched as a DataFrame/Series pair and handed to
    ``AutoSklearnRegressor`` as-is. The same data is used for training and
    evaluation, so the thresholds only guard against a broken pipeline.
    """
    features, target = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )

    # The test is only meaningful when the inputs really are pandas objects.
    assert isinstance(features, pd.DataFrame)
    assert isinstance(target, pd.Series)

    regressor = AutoSklearnRegressor(
        time_left_for_this_task=40,
        per_run_time_limit=5,
        dask_client=dask_client,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )

    # Fit directly on the DataFrame/Series pair.
    regressor.fit(features, target)

    # The log excerpt is attached to failing assertions below.
    log_pattern = os.path.join(tmp_dir, 'AutoML*.log')
    log_file_path = glob.glob(log_pattern)[0]

    # Scored on the training data itself: a sanity check, not a
    # generalisation estimate.
    train_score = regressor.score(features, target)
    assert train_score >= 0.5, extract_msg_from_log(log_file_path)

    regressor.refit(features, target)

    # After the refit the predictions must still beat the same baseline.
    refit_score = r2(target, regressor.predict(features))
    assert refit_score > 0.5, extract_msg_from_log(log_file_path)

    n_successes = count_succeses(regressor.cv_results_)
    assert n_successes > 0, extract_msg_from_log(log_file_path)
Exemplo n.º 2
0
def test_cv_regression(tmp_dir, output_dir, dask_client):
    """
    Check that a regressor can be fitted with the 'cv' resampling strategy
    and then used to predict on the held-out split.
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston',
        train_size_maximum=300,
    )

    regressor = AutoSklearnRegressor(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    regressor.fit(X_train, Y_train)

    # The log excerpt is attached to failing assertions below.
    log_candidates = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))
    log_file_path = log_candidates[0]

    y_pred = regressor.predict(X_test)
    assert y_pred.shape == (206, )

    test_r2 = r2(Y_test, y_pred)
    assert test_r2 >= 0.1, extract_msg_from_log(log_file_path)

    n_successes = count_succeses(regressor.cv_results_)
    assert n_successes > 0, extract_msg_from_log(log_file_path)
Exemplo n.º 3
0
def test_binary(tmp_dir, output_dir, dask_client):
    """Fit a classifier on the binarised iris data and check its outputs.

    Besides prediction shape and accuracy, the test verifies that passing
    ``X_test``/``y_test`` together with ``dataset_name`` to ``fit`` produces
    at least one ``binary_test_dataset_test_*.predict`` file in the output
    directory.
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'iris',
        make_binary=True,
    )

    classifier = AutoSklearnClassifier(
        time_left_for_this_task=40,
        per_run_time_limit=10,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    classifier.fit(
        X_train,
        Y_train,
        X_test=X_test,
        y_test=Y_test,
        dataset_name='binary_test_dataset',
    )

    # The log excerpt is attached to failing assertions below.
    log_pattern = os.path.join(tmp_dir, 'AutoML*.log')
    log_file_path = glob.glob(log_pattern)[0]

    y_pred = classifier.predict(X_test)
    assert y_pred.shape == (50, ), extract_msg_from_log(log_file_path)

    test_accuracy = accuracy(Y_test, y_pred)
    assert test_accuracy > 0.9, extract_msg_from_log(log_file_path)

    n_successes = count_succeses(classifier.cv_results_)
    assert n_successes > 0, extract_msg_from_log(log_file_path)

    # Test-set predictions written under the given dataset name.
    predict_pattern = os.path.join(
        output_dir, 'binary_test_dataset_test_*.predict')
    output_files = glob.glob(predict_pattern)
    assert len(output_files) > 0, (
        output_files,
        extract_msg_from_log(log_file_path),
    )
Exemplo n.º 4
0
def test_multilabel(tmp_dir, output_dir, dask_client):
    """Fit a classifier on the multilabel variant of iris.

    Checks the shape of the predictions, that at least one run succeeded,
    the macro F1 score on the test split, and that the mean of the predicted
    probabilities on the training split is approximately 0.33.
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'iris',
        make_multilabel=True,
    )

    classifier = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    classifier.fit(X_train, Y_train)

    # The log excerpt is attached to failing assertions below.
    log_candidates = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))
    log_file_path = log_candidates[0]

    y_pred = classifier.predict(X_test)
    assert y_pred.shape == (50, 3), extract_msg_from_log(log_file_path)

    n_successes = count_succeses(classifier.cv_results_)
    assert n_successes > 0, extract_msg_from_log(log_file_path)

    macro_f1 = f1_macro(Y_test, y_pred)
    assert macro_f1 >= 0.9, extract_msg_from_log(log_file_path)

    train_probabilities = classifier.predict_proba(X_train)
    mean_probability = np.mean(train_probabilities)
    assert mean_probability == pytest.approx(0.33, rel=1e-1)
Exemplo n.º 5
0
def test_classification_pandas_support(tmp_dir, output_dir, dask_client):
    """Fit, score, refit and predict a classifier on pandas input.

    The OpenML dataset is fetched as a DataFrame/Series pair, columns that
    contain NaN are dropped, and the result is handed to
    ``AutoSklearnClassifier`` as-is. The same data is used for training and
    evaluation, so the thresholds only guard against a broken pipeline.
    """
    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop every column that contains a NaN. ``axis`` must be passed by
    # keyword: positional arguments to ``DataFrame.dropna`` were deprecated
    # and are rejected with a TypeError by pandas >= 2.0.
    X = X.dropna(axis='columns')

    # This test only makes sense if the input is a dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        dask_client=dask_client,
        seed=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )

    automl.fit(X, y)

    # The log excerpt is attached to failing assertions below.
    log_file_path = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))[0]

    # Make sure that we are at least better than random.
    # We use the same data for training and scoring to test code quality.
    assert automl.score(X, y) > 0.555, extract_msg_from_log(log_file_path)

    automl.refit(X, y)

    # Make sure that we are at least better than random after the refit.
    # The accuracy metric needs valid (encoded) data, so both the targets and
    # the predictions go through the fitted input validator first.
    # The baseline is 0.555 because the dataset is unbalanced.
    y = automl.automl_.InputValidator.encode_target(y)
    prediction = automl.automl_.InputValidator.encode_target(automl.predict(X))
    assert accuracy(y, prediction) > 0.555, extract_msg_from_log(log_file_path)
    assert count_succeses(
        automl.cv_results_) > 0, extract_msg_from_log(log_file_path)
Exemplo n.º 6
0
def test_regression(tmp_dir, output_dir, dask_client):
    """Fit a regressor on the boston data and bound its test-set score.

    Checks the shape of the predictions, the ``mean_squared_error`` score on
    the held-out split, and that at least one run succeeded.
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')

    regressor = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    regressor.fit(X_train, Y_train)

    # The log excerpt is attached to the failing score assertion below.
    log_pattern = os.path.join(tmp_dir, 'AutoML*.log')
    log_file_path = glob.glob(log_pattern)[0]

    y_pred = regressor.predict(X_test)
    assert y_pred.shape == (356, )

    mse_score = mean_squared_error(Y_test, y_pred)

    # On average np.sqrt(30) away from the target -> ~5.5 on average.
    # With select rates the average score drops to between -32.40 and -37
    # under the 30 second budget; a larger time_left_for_this_task avoids
    # this.
    assert mse_score >= -37, extract_msg_from_log(log_file_path)

    n_successes = count_succeses(regressor.cv_results_)
    assert n_successes > 0
Exemplo n.º 7
0
def test_automl_outputs(backend, dask_client):
    """Run ``AutoML`` on iris and inspect the files it leaves behind.

    Verifies the pickled data manager, the exact listing of the
    ``.auto-sklearn`` directory, the presence of per-run ensemble predictions
    and models, the initial ensemble file and the recorded start time.
    """

    def internals(*parts):
        # Path below the '.auto-sklearn' directory of the backend's
        # temporary directory; the backend attribute is re-read on each call.
        return os.path.join(
            backend.temporary_directory, '.auto-sklearn', *parts)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    name = 'iris'
    data_manager_file = internals('datamanager.pkl')

    auto = autosklearn.automl.AutoML(
        backend,
        30,
        5,
        initial_configurations_via_metalearning=0,
        seed=100,
        metric=accuracy,
        dask_client=dask_client,
    )
    setup_logger()
    auto._logger = get_logger('test_automl_outputs')
    auto.fit(
        X=X_train,
        y=Y_train,
        X_test=X_test,
        y_test=Y_test,
        dataset_name=name,
        task=MULTICLASS_CLASSIFICATION,
    )

    # The log excerpt is attached to the start-time assertion at the end.
    log_pattern = os.path.join(backend.temporary_directory, 'AutoML*.log')
    log_file_path = glob.glob(log_pattern)[0]

    # Pickled data manager (without one hot encoding!). The file is written
    # by the fit call above, so it is not external data.
    with open(data_manager_file, 'rb') as fh:
        data_manager = pickle.load(fh)
    assert np.allclose(data_manager.data['X_train'], X_train)

    # The '.auto-sklearn' directory must contain exactly these entries.
    expected_entries = [
        'true_targets_ensemble.npy',
        'start_time_100',
        'datamanager.pkl',
        'ensemble_read_preds.pkl',
        'ensemble_read_scores.pkl',
        'runs',
        'ensembles',
    ]
    actual_entries = os.listdir(internals())
    assert sorted(actual_entries) == sorted(expected_entries)

    # At least one ensemble prediction file in some run directory ...
    ensemble_predictions = glob.glob(
        internals('runs', '*', 'predictions_ensemble*npy'))
    assert len(ensemble_predictions) > 0

    # ... at least one model stored for seed 100 ...
    model_files = glob.glob(internals('runs', '*', '100.*.model'))
    assert len(model_files) > 0

    # ... and the first ensemble for seed 100.
    ensemble_files = os.listdir(internals('ensembles'))
    assert '100.0000000000.ensemble' in ensemble_files

    # Start time: the recorded timestamp must be at least 10 seconds old.
    start_time_file_path = internals("start_time_100")
    with open(start_time_file_path, 'r') as fh:
        start_time = float(fh.read())
    elapsed = time.time() - start_time
    assert elapsed >= 10, extract_msg_from_log(log_file_path)

    del auto