Example No. 1
    def test_cv_regression(self):
        """
        Makes sure that when using a cv strategy, we are able to fit
        a regressor
        """
        tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit')
        output = os.path.join(self.test_dir, '..', '.out_regression_fit')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
        automl = AutoSklearnRegressor(time_left_for_this_task=30,
                                      per_run_time_limit=5,
                                      resampling_strategy='cv',
                                      tmp_folder=tmp,
                                      output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (356,))
        score = mean_squared_error(Y_test, predictions)
        # On average np.sqrt(30) away from the target -> ~5.5 on average
        # With select rates, the average score drops to a range of -32.40 to -37 under the 30 second
        # constraint. With more time_left_for_this_task this is no longer an issue.
        self.assertGreaterEqual(score, -37)

        self._tearDown(tmp)
        self._tearDown(output)
Example No. 2
def test_regression_pandas_support(tmp_dir, output_dir, dask_client):

    X, y = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )
    # This test only makes sense if the input is a dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    automl = AutoSklearnRegressor(
        time_left_for_this_task=40,
        per_run_time_limit=5,
        dask_client=dask_client,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )

    # y is a raw (non-encoded) pandas Series; fit should handle it directly
    automl.fit(X, y)

    # Make sure the fitted model is at least better than random.
    # We use the same X for training and testing here purely as a sanity check of the pipeline
    assert automl.score(X, y) >= 0.5, print_debug_information(automl)

    automl.refit(X, y)

    # Make sure the refitted model is also at least better than random.
    assert r2(y, automl.predict(X)) > 0.5, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
Example No. 3
    def test_cv_regression(self):
        """
        Makes sure that when using a cv strategy, we are able to fit
        a regressor
        """
        tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit_cv')
        output = os.path.join(self.test_dir, '..', '.out_regression_fit_cv')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset(
            'boston', train_size_maximum=300)
        automl = AutoSklearnRegressor(time_left_for_this_task=60,
                                      per_run_time_limit=10,
                                      resampling_strategy='cv',
                                      tmp_folder=tmp,
                                      output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (206, ))
        score = r2(Y_test, predictions)
        print(Y_test)
        print(predictions)
        self.assertGreaterEqual(score, 0.1)
        self.assertGreater(self._count_succeses(automl.cv_results_), 0)

        self._tearDown(tmp)
        self._tearDown(output)
Example No. 4
    def test_regression_pandas_support(self):
        X, y = sklearn.datasets.fetch_openml(
            data_id=41514,  # diabetes
            return_X_y=True,
            as_frame=True,
        )
        # This test only makes sense if the input is a dataframe
        self.assertTrue(isinstance(X, pd.DataFrame))
        self.assertTrue(isinstance(y, pd.Series))
        automl = AutoSklearnRegressor(
            time_left_for_this_task=30,
            per_run_time_limit=5,
        )

        # y is a raw (non-encoded) pandas Series; fit should handle it directly
        automl.fit(X, y)

        # Make sure the fitted model is at least better than random.
        # We use the same X for training and testing here purely as a sanity check of the pipeline
        self.assertTrue(automl.score(X, y) > 0.5)

        automl.refit(X, y)

        # Make sure the refitted model is also at least better than random.
        self.assertTrue(r2(y, automl.predict(X)) > 0.5)
Example No. 5
def test_cv_regression(tmp_dir, output_dir, dask_client):
    """
    Makes sure that when using a cv strategy, we are able to fit
    a regressor
    """

    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)
    automl = AutoSklearnRegressor(time_left_for_this_task=60,
                                  per_run_time_limit=10,
                                  resampling_strategy='cv',
                                  tmp_folder=tmp_dir,
                                  dask_client=dask_client,
                                  output_folder=output_dir)

    automl.fit(X_train, Y_train)

    # Log file path
    log_file_path = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))[0]

    predictions = automl.predict(X_test)
    assert predictions.shape == (206, )
    score = r2(Y_test, predictions)
    assert score >= 0.1, extract_msg_from_log(log_file_path)
    assert count_succeses(
        automl.cv_results_) > 0, extract_msg_from_log(log_file_path)
Example No. 6
def train_autosklearn(l=None):
    if l is None:
        l = get_data()
    ensemble_size = 1  # 50 ... 1 for vanilla
    initial_configurations_via_metalearning = 0  # 25 ... 0 for vanilla
    model = AutoSklearnRegressor(
        delete_output_folder_after_terminate=True,
        delete_tmp_folder_after_terminate=True,
        disable_evaluator_output=False,
        ensemble_nbest=50,
        ensemble_size=ensemble_size,
        exclude_estimators=None,
        exclude_preprocessors=None,
        get_smac_object_callback=None,
        include_estimators=None,
        include_preprocessors=None,
        initial_configurations_via_metalearning=initial_configurations_via_metalearning,
        logging_config=None,
        ml_memory_limit=3072,
        output_folder=None,
        per_run_time_limit=360,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5},
        # resampling_strategy='holdout',
        # resampling_strategy_arguments=None,
        seed=1,
        shared_mode=False,
        smac_scenario_args=None,
        time_left_for_this_task=3600,
        tmp_folder=None)
    model.fit(l.X_train.values.copy(), l.y_train.values.squeeze().copy())
    model.refit(l.X_train.values.copy(), l.y_train.values.squeeze().copy())
    print(model.show_models())
    return attributedict_from_locals('model')
Example No. 7
    def spawn_regressor(
            seed,
            time,
            search_space,
            prep_space,
            metric,
            dataset_name=None):
        """Spawn a subprocess.

        auto-sklearn does not take care of spawning worker processes. This
        function, which is called several times from the main block, runs one
        instance of auto-sklearn in a new process.
        """

        # Use the initial configurations from meta-learning only in one out of
        # the four processes spawned. This prevents auto-sklearn from evaluating
        # the same configurations in four processes.
        if seed == 0:
            initial_configurations_via_metalearning = 25
            smac_scenario_args = {}
        else:
            initial_configurations_via_metalearning = 0
            smac_scenario_args = {'initial_incumbent': 'RANDOM'}

        # Arguments which differ from other runs of auto-sklearn:
        # 1. all regressors write to the same output directory
        # 2. shared_mode is set to True, which enables sharing of data between
        #    models.
        # 3. all instances of the AutoSklearnRegressor must have a different
        #    seed!
        automl = AutoSklearnRegressor(
            time_left_for_this_task=time,
            # sec., how long should this seed fit process run
            per_run_time_limit=15,
            # sec., each model may only take this long before it's killed
            ml_memory_limit=1024,
            # MB, memory limit imposed on each call to a ML algorithm
            shared_mode=True,  # tmp folder will be shared between seeds
            tmp_folder=tmp_folder,
            output_folder=output_folder,
            delete_tmp_folder_after_terminate=False,
            ensemble_size=0,
            include_estimators=search_space, exclude_estimators=None,
            include_preprocessors=prep_space, exclude_preprocessors=None,
            # ensembles will be built when all optimization runs are finished
            initial_configurations_via_metalearning=(
                initial_configurations_via_metalearning
            ),
            seed=seed,
            smac_scenario_args=smac_scenario_args,
        )
        automl.fit(X_train, y_train, X_test=X_test, y_test=y_test,
                   metric=metric, dataset_name=dataset_name)
        # print(automl.cv_results_)
        return automl.cv_results_
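The snippet above only defines spawn_regressor; the "main block" it refers to is not shown. Below is a minimal driver sketch, not part of the original: it assumes spawn_regressor, tmp_folder, output_folder and the train/test arrays are reachable from the child processes, and the seed count, time budget, search space, preprocessor list, metric and dataset name are illustrative placeholders.

# Hypothetical driver for spawn_regressor (a sketch, not part of the original snippet).
import multiprocessing

from autosklearn.metrics import r2

if __name__ == '__main__':
    processes = []
    for seed in range(4):  # one auto-sklearn instance per seed, all sharing tmp_folder
        p = multiprocessing.Process(
            target=spawn_regressor,
            args=(seed, 60, ['random_forest'], ['no_preprocessing'], r2),
            kwargs={'dataset_name': 'boston'},
        )
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    # Once every run has finished, an ensemble can be built separately
    # (e.g. via fit_ensemble on an estimator pointed at the shared tmp_folder).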
Example No. 8
 def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
     automl = AutoSklearnRegressor()
     X = [[1], [2], [3]]
     y = [1, 2, 3]
     automl.fit(X, y)
     self.assertEqual(fit.call_count, 1)
     self.assertIsInstance(fit.call_args[0][0], np.ndarray)
     self.assertIsInstance(fit.call_args[0][1], np.ndarray)
     automl.refit(X, y)
     self.assertEqual(refit.call_count, 1)
     self.assertIsInstance(refit.call_args[0][0], np.ndarray)
     self.assertIsInstance(refit.call_args[0][1], np.ndarray)
     automl.fit_ensemble(y)
     self.assertEqual(fit_ensemble.call_count, 1)
     self.assertIsInstance(fit_ensemble.call_args[0][0], np.ndarray)
Example No. 10
    def test_regression(self):
        output = os.path.join(self.test_dir, '..', '.tmp_regression_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
        automl = AutoSklearnRegressor(time_left_for_this_task=20,
                                      per_run_time_limit=5,
                                      tmp_folder=output,
                                      output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (356, ))
        score = mean_squared_error(Y_test, predictions)
        # On average np.sqrt(30) away from the target -> ~5.5 on average
        self.assertGreaterEqual(score, -30)
Example No. 11
    def test_regression(self):
        tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit')
        output = os.path.join(self.test_dir, '..', '.out_regression_fit')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
        automl = AutoSklearnRegressor(time_left_for_this_task=20,
                                      per_run_time_limit=5,
                                      tmp_folder=tmp,
                                      output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (356,))
        score = mean_squared_error(Y_test, predictions)
        # On average np.sqrt(30) away from the target -> ~5.5 on average
        self.assertGreaterEqual(score, -30)
Example No. 12
def train_regression():
    dump_file = os.path.join(
        AUTO_ML_MODELS_PATH,
        'auto_sklearn_regressor' + str(datetime.datetime.now()) + '.dump')

    features, outcome_slave, _ = file_loader('c99temp_train.snappy.csv')

    features = features.values
    outcome_slave = outcome_slave['tempBoardSLAVE'].values

    model = AutoSklearnRegressor(
        time_left_for_this_task=3600,
        per_run_time_limit=600,
    )
    model.fit(features, outcome_slave)

    with open(dump_file, 'wb') as f:
        pickle.dump(model, f)
Example No. 13
class AutoML(AbstractModel):
    def __init__(self):
        super().__init__()
        self.model = AutoSklearnRegressor
        self.m = None  # ensure the attribute checked in fit() exists before param_search runs

    def fit(self, x, y, modeldict=None):
        if not self.m:
            self.param_search(x, y)
        self.m.refit(x, y)

    def param_search(self, x, y, time_per_sample=3.5, **kwargs):
        time = int(len(y) * time_per_sample)
        self.m = AutoSklearnRegressor(
            time_left_for_this_task=time,
            resampling_strategy="cv",
            resampling_strategy_arguments={'folds': 10})

        self.m.fit(x,
                   y,
                   metric=mean_squared_error,
                   dataset_name="Land Use Regression")
        # print(self.m.sprint_statistics())
        # score = self.score_function(y, self.m.predict(x))
        # print("Reached a score of {}.".format(score))

        kf = KFold(n_splits=10, shuffle=True)
        rmse = []
        mae = []
        r2 = []
        for train_index, test_index in kf.split(x, y):
            X_train, X_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]
            self.m.refit(X_train, y_train)
            predictions = self.m.predict(X_test)
            rmse_iter, mae_iter, r2_iter = self.score_function(
                y_test, predictions)
            rmse.append(rmse_iter)
            mae.append(mae_iter)
            r2.append(r2_iter)

        # print("Reached a RMSE of {}, MAE of {} and R2 of {}.".format(np.mean(rmse), np.mean(mae), np.mean(r2)))

        return self.concat_results(np.mean(rmse), np.mean(mae), np.mean(r2))
Example No. 14
def test_regression(tmp_dir, output_dir, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
    automl = AutoSklearnRegressor(time_left_for_this_task=30,
                                  per_run_time_limit=5,
                                  tmp_folder=tmp_dir,
                                  dask_client=dask_client,
                                  output_folder=output_dir)

    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (356, )
    score = mean_squared_error(Y_test, predictions)

    # On average np.sqrt(30) away from the target -> ~5.5 on average
    # With select rates, the average score drops to a range of -32.40 to -37 under the 30 second
    # constraint. With more time_left_for_this_task this is no longer an issue.
    assert score >= -37, print_debug_information(automl)
    assert count_succeses(automl.cv_results_) > 0
Example No. 15
    def test_regression_methods_returns_self(self):
        X_train, y_train, X_test, y_test = putil.get_dataset('boston')
        automl = AutoSklearnRegressor(time_left_for_this_task=20,
                                      per_run_time_limit=5,
                                      ensemble_size=0)

        automl_fitted = automl.fit(X_train, y_train)
        self.assertIs(automl, automl_fitted)

        automl_ensemble_fitted = automl.fit_ensemble(y_train, ensemble_size=5)
        self.assertIs(automl, automl_ensemble_fitted)

        automl_refitted = automl.refit(X_train.copy(), y_train.copy())
        self.assertIs(automl, automl_refitted)
Example No. 16
def test_cv_regression(tmp_dir, output_dir, dask_client):
    """
    Makes sure that when using a cv strategy, we are able to fit
    a regressor
    """

    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)
    automl = AutoSklearnRegressor(time_left_for_this_task=60,
                                  per_run_time_limit=10,
                                  resampling_strategy='cv',
                                  tmp_folder=tmp_dir,
                                  dask_client=dask_client,
                                  output_folder=output_dir)

    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (206, )
    score = r2(Y_test, predictions)
    assert score >= 0.1, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
Example No. 18
def test_autosklearn_regression_methods_returns_self(dask_client):
    X_train, y_train, X_test, y_test = putil.get_dataset('boston')
    automl = AutoSklearnRegressor(time_left_for_this_task=30,
                                  per_run_time_limit=5,
                                  dask_client=dask_client,
                                  ensemble_size=0)

    automl_fitted = automl.fit(X_train, y_train)
    assert automl is automl_fitted

    automl_ensemble_fitted = automl.fit_ensemble(y_train, ensemble_size=5)
    assert automl is automl_ensemble_fitted

    automl_refitted = automl.refit(X_train.copy(), y_train.copy())
    assert automl is automl_refitted
Example No. 19
def test_type_of_target(mock_estimator):
    # Test that classifier raises error for illegal target types.
    X = np.array([
        [1, 2],
        [2, 3],
        [3, 4],
        [4, 5],
    ])
    # Possible target types
    y_binary = np.array([0, 0, 1, 1])
    y_continuous = np.array([0.1, 1.3, 2.1, 4.0])
    y_multiclass = np.array([0, 1, 2, 0])
    y_multilabel = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [0, 0],
    ])
    y_multiclass_multioutput = np.array([
        [0, 1],
        [1, 3],
        [2, 2],
        [5, 3],
    ])
    y_continuous_multioutput = np.array([
        [0.1, 1.5],
        [1.2, 3.5],
        [2.7, 2.7],
        [5.5, 3.9],
    ])

    cls = AutoSklearnClassifier(ensemble_size=0)
    cls.automl_ = unittest.mock.Mock()
    cls.automl_.InputValidator = unittest.mock.Mock()
    cls.automl_.InputValidator.target_validator = unittest.mock.Mock()

    # Illegal target types for classification: continuous,
    # multiclass-multioutput, continuous-multioutput.
    expected_msg = r".*Classification with data of type"
    " multiclass-multioutput is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_multiclass_multioutput)

    expected_msg = r".*Classification with data of type"
    " continuous is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_continuous)

    expected_msg = r".*Classification with data of type"
    " continuous-multioutput is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_continuous_multioutput)

    # Legal target types for classification: binary, multiclass,
    # multilabel-indicator.
    try:
        cls.fit(X, y_binary)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "binary targets")

    try:
        cls.fit(X, y_multiclass)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "multiclass targets")

    try:
        cls.fit(X, y_multilabel)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "multilabel-indicator targets")

    # Test that regressor raises error for illegal target types.
    reg = AutoSklearnRegressor(ensemble_size=0)
    # Illegal target types for regression: multilabel-indicator,
    # multiclass-multioutput.
    expected_msg = (r".*Regression with data of type"
                    " multilabel-indicator is not supported.*")
    with pytest.raises(ValueError, match=expected_msg):
        reg.fit(
            X=X,
            y=y_multilabel,
        )

    expected_msg = r".*Regression with data of type"
    " multiclass-multioutput is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        reg.fit(
            X=X,
            y=y_multiclass_multioutput,
        )

    # Legal target types: continuous, multiclass,
    # continuous-multioutput,
    # binary
    try:
        reg.fit(X, y_continuous)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "continuous targets")

    try:
        reg.fit(X, y_multiclass)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "multiclass targets")

    try:
        reg.fit(X, y_continuous_multioutput)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "continuous_multioutput targets")

    try:
        reg.fit(X, y_binary)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "binary targets")
Example No. 20
#                               ensemble_size=1, initial_configurations_via_metalearning=0,
#                               resampling_strategy_arguments={'folds': 5})

#-----REGRESSION-----
automl = AutoSklearnRegressor(
    per_run_time_limit=360,
    ml_memory_limit=1024 * 8,
    time_left_for_this_task=3600,
    resampling_strategy='cv',
    #                              ensemble_size=1,
    #                              initial_configurations_via_metalearning=0,
    resampling_strategy_arguments={'folds': 5})
start = time.time()

#X_train = X_train.astype('float') # when?
automl.fit(X_train, y_train,
           dataset_name='boston_housing')  #change dataset name accordingly
automl.refit(X_train.copy(), y_train.copy())
print(
    '[INFO] Elapsed time finding best model: {} seconds.'.format(time.time() -
                                                                 start))

predictions = automl.predict(X_test)
#print('--- CLASSIFICATION REPORT: ---')        #not for regression
#print(classification_report(y_test, predictions, digits=5))
print('\n\n--- MODELS: ---')
print(automl.show_models())
print('\n\n--- STATISTICS: ---')
print(automl.sprint_statistics())

#-----CLASSIFIER-----
#print('\n\n--- SCORE: ---')
Example No. 21
 dataframe = read_csv(address)
 print(time.strftime("Start time is %Y-%m-%d %H:%M:%S", time.localtime()))
 # split into input and output elements
 data = dataframe.values
 data = data.astype('int')
 X, y = data[:, :-1], data[:, -1]
 print(X.shape, y.shape)
 # split into train and test sets
 X_train, X_test, y_train, y_test = train_test_split(X,
                                                     y,
                                                     test_size=0.33,
                                                     random_state=1)
 # define search
 model = AutoSklearnRegressor(time_left_for_this_task=5 * 60,
                              per_run_time_limit=30,
                              n_jobs=8)
 # perform the search
 model.fit(X_train, y_train)
 # summarize
 # print(model.sprint_statistics())
 # evaluate best model
 y_hat = model.predict(X_test)
 mae = mean_absolute_error(y_test, y_hat)
 r2Score = r2_score(y_test, y_hat)
 mape = mean_absolute_percentage_error(y_test, y_hat)
 mse = mean_squared_error(y_test, y_hat)
 print(time.strftime("End time is %Y-%m-%d %H:%M:%S", time.localtime()))
 print("MAE: %.3f" % mae)
 print("R2_score: %.3f" % r2Score)
 print("MAPE: %.3f" % mape)
 print("MSE: %.3f" % mse)
Example No. 22
if __name__ == "__main__":
    import numpy as np
    from autosklearn.regression import AutoSklearnRegressor
    from autosklearn.metrics import mean_squared_error
    import pickle

    #load X, y
    _file = open('data_BA.pkl', 'rb')
    X, y = pickle.load(_file)
    _file.close()

    #autosklearn
    regr = AutoSklearnRegressor(
        time_left_for_this_task=172800,
        per_run_time_limit=600,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 4},
        metric=mean_squared_error,
        n_jobs=2,
    )
    regr.fit(X, y)

    #pickle best regressor
    _file = open('Autoskl_bestmodel.pkl', "wb")
    pickle.dump(regr, _file)
    _file.close()
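    # Hypothetical continuation (not in the original snippet): reload the pickled
    # best regressor and predict on the same features that were loaded above.
    _file = open('Autoskl_bestmodel.pkl', 'rb')
    best_regr = pickle.load(_file)
    _file.close()
    print(best_regr.predict(X)[:5])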
Example No. 23
    def test_type_of_target(self, mock_estimator):
        # Test that classifier raises error for illegal target types.
        X = np.array([
            [1, 2],
            [2, 3],
            [3, 4],
            [4, 5],
        ])
        # Possible target types
        y_binary = np.array([0, 0, 1, 1])
        y_continuous = np.array([0.1, 1.3, 2.1, 4.0])
        y_multiclass = np.array([0, 1, 2, 0])
        y_multilabel = np.array([
            [0, 1],
            [1, 1],
            [1, 0],
            [0, 0],
        ])
        y_multiclass_multioutput = np.array([
            [0, 1],
            [1, 3],
            [2, 2],
            [5, 3],
        ])
        y_continuous_multioutput = np.array([
            [0.1, 1.5],
            [1.2, 3.5],
            [2.7, 2.7],
            [5.5, 3.9],
        ])

        cls = AutoSklearnClassifier()
        # Illegal target types for classification: continuous,
        # multiclass-multioutput, continuous-multioutput.
        self.assertRaisesRegex(
            ValueError,
            "classification with data of type"
            " multiclass-multioutput is not supported",
            cls.fit,
            X=X,
            y=y_multiclass_multioutput,
        )

        self.assertRaisesRegex(
            ValueError,
            "classification with data of type"
            " continuous is not supported",
            cls.fit,
            X=X,
            y=y_continuous,
        )

        self.assertRaisesRegex(
            ValueError,
            "classification with data of type"
            " continuous-multioutput is not supported",
            cls.fit,
            X=X,
            y=y_continuous_multioutput,
        )

        # Legal target types for classification: binary, multiclass,
        # multilabel-indicator.
        try:
            cls.fit(X, y_binary)
        except ValueError:
            self.fail("cls.fit() raised ValueError while fitting "
                      "binary targets")

        try:
            cls.fit(X, y_multiclass)
        except ValueError:
            self.fail("cls.fit() raised ValueError while fitting "
                      "multiclass targets")

        try:
            cls.fit(X, y_multilabel)
        except ValueError:
            self.fail("cls.fit() raised ValueError while fitting "
                      "multilabel-indicator targets")

        # Test that regressor raises error for illegal target types.
        reg = AutoSklearnRegressor()
        # Illegal target types for regression: multiclass-multioutput,
        # multilabel-indicator, continuous-multioutput.
        self.assertRaisesRegex(
            ValueError,
            "regression with data of type"
            " multiclass-multioutput is not supported",
            reg.fit,
            X=X,
            y=y_multiclass_multioutput,
        )

        self.assertRaisesRegex(
            ValueError,
            "regression with data of type"
            " multilabel-indicator is not supported",
            reg.fit,
            X=X,
            y=y_multilabel,
        )

        self.assertRaisesRegex(
            ValueError,
            "regression with data of type"
            " continuous-multioutput is not supported",
            reg.fit,
            X=X,
            y=y_continuous_multioutput,
        )
        # Legal target types: continuous, binary, multiclass
        try:
            reg.fit(X, y_continuous)
        except ValueError:
            self.fail("reg.fit() raised ValueError while fitting "
                      "continuous targets")

        try:
            reg.fit(X, y_binary)
        except ValueError:
            self.fail("reg.fit() raised ValueError while fitting "
                      "binary targets")

        try:
            reg.fit(X, y_multiclass)
        except ValueError:
            self.fail("reg.fit() raised ValueError while fitting "
                      "multiclass targets")
Example No. 24
    # (the top of this snippet is truncated in the source; only the tail of the
    #  regression branch's AutoSklearnRegressor(...) call survives)
            output_folder=outpath + 'output_folder',
        )
    elif ml_type == 'classification':
        model = AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,
            per_run_time_limit=per_run_time_limit,
            n_jobs=1,
            memory_limit=1000000,
            tmp_folder=outpath + 'log_folder',
            output_folder=outpath + 'output_folder',
        )
    print("start searching")

    # perform the search
    model.fit(X_train,
              y_train,
              dataset_name=ml_type + '_t' + str(time_left_for_this_task) +
              '_lead' + str(l))

    # summarize
    file = open(
        'log_files/' + ml_type + '_t' + str(time_left_for_this_task) +
        '_lead' + str(l) + '.txt', 'w')

    file.write(model.sprint_statistics())
    file.write('\n')
    file.write(model.show_models())
    file.close()

    print(model.sprint_statistics())
    print(model.show_models())
    # evaluate best model
Example No. 25

def offset_col_x_days(df, col, days):
    for x in range(1, days):
        df[f'{col}_prev_{x}'] = df[col].shift(x)
    df = df.dropna().reset_index(drop=True)
    return df


for i in range(len(stations)):
    stations[i] = offset_col_x_days(stations[i], 'energy', 7)
stations[0]

# In[67]:

X = stations[0]
y = X['energy']
X = X.drop(['name', 'date', 'energy'], axis=1)
train_test_split = int(len(X) * 0.8)
X_train, X_test = X[:train_test_split], X[train_test_split:]
y_train, y_test = y[:train_test_split], y[train_test_split:]

from autosklearn.regression import AutoSklearnRegressor

automl = AutoSklearnRegressor()
automl.fit(X_train, y_train)

print(automl.show_models())
predictions = automl.predict(X_test)
print("R2 score:", sklearn.metrics.r2_score(y_test, predictions))

# --- A new, separate snippet begins here; its opening lines are cut off in the
# --- source. It appears to generate synthetic multioutput data with a call
# --- along these lines (everything above the surviving arguments is an assumption):
X, y = sklearn.datasets.make_regression(n_features=10,
                                        n_informative=5,
                                        n_targets=3)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

############################################################################
# Build and fit a regressor
# =========================

automl = AutoSklearnRegressor(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    tmp_folder='/tmp/autosklearn_multioutput_regression_example_tmp',
)
automl.fit(X_train, y_train, dataset_name='synthetic')

############################################################################
# View the models found by auto-sklearn
# =====================================

print(automl.leaderboard())

############################################################################
# Print the final ensemble constructed by auto-sklearn
# ====================================================

print(automl.show_models())

###########################################################################
# Get the Score of the final ensemble
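# (The scoring code that followed this header is cut off in the source;
#  presumably something along the lines of:)
# predictions = automl.predict(X_test)
# print("R2 score:", sklearn.metrics.r2_score(y_test, predictions))


# --- A new, separate snippet begins below: a solver class that wraps the
# --- auto-sklearn estimators behind a common fit/predict/save/load interface.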
class AutoSklearnSolver:
    """ Model implementing through auto-sklearn.
    https://github.com/automl/auto-sklearn
    Класс реализует работу модели через функциональность auto-sklearn.

    Args:
        model_dir: Путь к директории модели
        time_limit: Временной лимит на обучение модели (с)
        memory_limit: Лимит на объем используемой памяти (Мб)

    Attributes:
        model_dir (str): Путь к каталогу модели
        config (Config): Параметры модели
        model ([AutoSklearnClassifier, AutoSklearnRegressor]): Объект модели auto-sklearn
        per_run_time_limit (int): Временной лимит на обучение модели
        metrics_object (autosklearn.metrics): Объект метрики качества содели
        procesed_data_path (str): Путь сохранения обработанных данных
    """
    def __init__(self,
                 model_dir: str,
                 time_limit: int = 0,
                 memory_limit: int = 0) -> None:
        os.makedirs(model_dir, exist_ok=True)
        self.model_dir = model_dir
        self.config = Config(model_dir, time_limit, memory_limit)
        self.model = None
        self.per_run_time_limit = min(360, time_limit // 2)

    @time_logging
    def fit(self, train_csv: str, mode: str, metrics_name: str,
            save_processed_data: bool) -> None:
        """Start model fitting
        Запуск процесса обучения модели

        Args:
            train_csv: Путь к обучающему датасету
            mode: Режим работы (классификация или регрессия)
            metrics_name: Имя объекта метрики качества в модуле autosklearn.metrics
            save_processed_data: Флаг сохранения датасета с обработанными данными
        """
        if not os.path.exists(train_csv):
            log('Data file {} does not exist!'.format(train_csv))
            return

        # get the metric object
        try:
            self.metrics_object = getattr(metrics, metrics_name)
        except AttributeError as error:
            self.metrics_object = None
            log('Can\'t get the metrics object!')
            log('{}: {}'.format(type(error).__name__, error))
            return

        # prepare the directory for saving the processed data
        if save_processed_data:
            self.procesed_data_path = os.path.join(self.model_dir,
                                                   'processed_data')
            os.makedirs(self.procesed_data_path, exist_ok=True)

        self.config['task'] = 'fit'
        self.config['mode'] = mode
        self.config['tmp_dir'] = self.config['model_dir'] + '/tmp'

        # delete the temporary directory
        # (auto-sklearn complains before starting if this is not done)
        shutil.rmtree(self.config['tmp_dir'], ignore_errors=True)

        # initial analysis: read the data and split it into the X matrix and y vector
        df = read_df(train_csv, self.config)
        y = df['target']
        X = df.drop('target', axis=1)

        # process the data
        process_dataframe(X, self.config)

        if save_processed_data:
            log('Saving processed data')
            X.to_csv(os.path.join(self.procesed_data_path, 'X.csv'))
            y.to_csv(os.path.join(self.procesed_data_path, 'y.csv'))

        # parameters of the auto-sklearn model to be created
        # (preprocessing is disabled because it has already been done)
        model_params = {
            'time_left_for_this_task': self.config.time_left(),
            'per_run_time_limit': self.per_run_time_limit,
            'ml_memory_limit': self.config['memory_limit'],
            'tmp_folder': self.config['tmp_dir'],
            'include_preprocessors': ['no_preprocessing'],
            'delete_tmp_folder_after_terminate': True
        }

        # initialize the model object
        self.model_init(model_params)

        # fit the model
        self.model_fit(X, y, self.metrics_object)

        log('model_fitted: {}'.format(type(self.model)))
        log('autosklearn model contains:')
        log(self.model.show_models())

    @time_logging
    def model_init(self, model_params: Dict[str, Any]) -> None:
        """Model initialization
        Инициализация объекта модели в зависимости от типа задачи

        Args:
            model_params: Словарь параметров модели
        """
        if self.config['mode'] == 'classification':
            self.model = AutoSklearnClassifier(**model_params)
        elif self.config['mode'] == 'regression':
            self.model = AutoSklearnRegressor(**model_params)

    @time_logging
    def model_fit(self, X: pd.DataFrame, y: pd.Series,
                  metrics: Callable) -> None:
        """Model fitting wrapper
        Обертка для вызова fit (для учета времени в логе)

        Args:
            X: Матрица признаков
            y: Вектор ответов
            metrics: Объект метрики качества
        """
        # suppress warning output in the log
        warnings.filterwarnings('ignore', category=FutureWarning)
        warnings.filterwarnings('ignore', category=RuntimeWarning)

        self.model.fit(X, y, metric=metrics)

        warnings.resetwarnings()

    @time_logging
    def predict(self, test_csv: str, prediction_csv: str, validation_csv: str,
                need_proba: bool) -> pd.DataFrame:
        """Start model prediction
        Запуск процесса предсказывания целевого признака на новых данных

        Args:
            test_csv: Путь к тестовому датасету
            prediction_csv: Путь для записи ответов модели
            validation_csv: Путь к датасету правильных ответов на тестовой выборке (для подсчета метрики)
            need_proba: Флаг необходимости выдавать вероятностные предсказания

        Returns:
            Датасет с ответами модели
        """
        if not os.path.exists(test_csv):
            log('Data file {} does not exist!'.format(test_csv))
            return

        self.config['task'] = 'predict'

        df = read_df(test_csv, self.config)
        process_dataframe(df, self.config)

        predictions_df = self.model_predict(df, prediction_csv, need_proba)

        if validation_csv != 'None':
            self.model_validate(predictions_df, validation_csv)

        return predictions_df

    @time_logging
    def model_predict(self, X: pd.DataFrame, prediction_csv: str,
                      need_proba: bool) -> pd.DataFrame:
        """Model predict wrapper
        Обертка для вызова predict

        Args:
            X: Матрица признаков
            prediction_csv: Путь для записи ответов модели
            need_proba: Флаг необходимости выдавать вероятностные предсказания
        """
        if (self.config['mode'] == 'classification') and need_proba:
            predictions = self.model.predict_proba(X, n_jobs=-1)
            df_columns = ['target_0', 'target_1']
        else:
            predictions = self.model.predict(X, n_jobs=-1)
            df_columns = ['target']

        # prepare the output directory for the predictions
        output_dir = '/'.join(prediction_csv.split('/')[:-1])
        os.makedirs(output_dir, exist_ok=True)

        # write the dataframe with the predictions
        predictions_df = pd.DataFrame(predictions,
                                      index=X.index,
                                      columns=df_columns)
        predictions_df.to_csv(prediction_csv)

        return predictions_df

    @time_logging
    def model_validate(self, predictions_df: pd.DataFrame,
                       validation_csv: str) -> None:
        """Model validate
        Валидирование модели по известным правильным ответам

        Args:
            prediction_csv: Путь для записи ответов модели
            validation_csv: Путь к датасету правильных ответов на тестовой выборке
        """
        if self.metrics_object is None:
            log('Can\'t get the metrics object!')
            return

        if not os.path.exists(validation_csv):
            log('Validation file {} does not exist!'.format(validation_csv))
            return

        # read the dataset with the ground-truth answers
        validation_df = pd.read_csv(validation_csv, encoding='utf-8', sep=',')

        # merge the true and predicted answers so that they are aligned by index
        compare_df = pd.merge(validation_df, predictions_df, on="line_id")

        # compute the score
        # the merged dataframe has the following column layout:
        # 0: index, 1: true values, 2-...: predicted values
        score = self.metrics_object(compare_df.iloc[:, 1].values,
                                    compare_df.iloc[:, 2:].values)
        log('Metrics: {}'.format(self.metrics_object))
        log('Score: {}'.format(score))

        return score

    @time_logging
    def save(self) -> None:
        """Save model, parameters and metrics object
        Сохранение на диск модели, параметров и объекта метрики
        """
        self.config.save()
        with open(os.path.join(self.config['model_dir'], 'model.pkl'),
                  'wb') as f:
            pickle.dump(self.model, f, protocol=pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(self.config['model_dir'], 'metrics_object.pkl'),
                  'wb') as f:
            pickle.dump(self.metrics_object,
                        f,
                        protocol=pickle.HIGHEST_PROTOCOL)

    @time_logging
    def load(self) -> None:
        """Load model, parameters and metrics object
        Загрузка с диска модели, параметров и объекта метрики
        """
        self.config.load()
        with open(os.path.join(self.config['model_dir'], 'model.pkl'),
                  'rb') as f:
            self.model = pickle.load(f)
        with open(os.path.join(self.config['model_dir'], 'metrics_object.pkl'),
                  'rb') as f:
            self.metrics_object = pickle.load(f)

    def __repr__(self) -> str:
        repr_string = 'AutoSklearnSolver\n'
        repr_string += '-----------------\n'
        repr_string += str(self.config)
        return repr_string
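A minimal usage sketch of AutoSklearnSolver, inferred only from the method signatures above. The directory, CSV paths and metric name are placeholders, and the surrounding module is assumed to provide Config, read_df, process_dataframe, metrics and log as used by the class.

# Hypothetical usage of AutoSklearnSolver (a sketch; paths and metric name are placeholders).
solver = AutoSklearnSolver(model_dir='model_dir', time_limit=600, memory_limit=2048)
solver.fit(train_csv='train.csv', mode='regression',
           metrics_name='mean_squared_error', save_processed_data=False)
solver.save()

# Later, in a prediction run:
solver = AutoSklearnSolver(model_dir='model_dir')
solver.load()
solver.predict(test_csv='test.csv', prediction_csv='predictions/prediction.csv',
               validation_csv='None', need_proba=False)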
Example No. 28
 
print(f'[INFO] Train shape: {X_train.shape}')
print(f'[INFO] Test shape: {X_test.shape}')
 
print('[INFO] Finding best model...')
# for vanilla auto-sklearn, add: ensemble_size=1, initial_configurations_via_metalearning=0
#-----CLASSIFIER-----
#automl = AutoSklearnClassifier(per_run_time_limit=300, ml_memory_limit=1024 * 6, time_left_for_this_task=3600, resampling_strategy='cv',
#        resampling_strategy_arguments={'folds': 5})
#-----REGRESSION-----
automl = AutoSklearnRegressor(per_run_time_limit=300, ml_memory_limit=1024 * 6, time_left_for_this_task=3600, resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5})
start = time.time()
 
#X_train = X_train.astype('float')
automl.fit(X_train, y_train, dataset_name='linnerud')   #change dataset name accordingly
automl.refit(X_train.copy(), y_train.copy())
print(f'[INFO] Elapsed time finding best model: {time.time() - start} seconds.') 

predictions = automl.predict(X_test)
#print('--- CLASSIFICATION REPORT: ---')        #not for regression
#print(classification_report(y_test, predictions, digits=5))
print('\n\n--- MODELS: ---')
print(automl.show_models())
print('\n\n--- STATISTICS: ---')
print(automl.sprint_statistics()) 

#-----CLASSIFIER-----
#print('\n\n--- SCORE: ---')
#print("Balanced error score", 1 - balanced_accuracy_score(y_test, predictions))
#-----REGRESSION-----
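# (snippet cut off in the source; the regression score line that followed
#  presumably printed something like sklearn.metrics.r2_score(y_test, predictions))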
Example No. 29
]
preprocessing_to_use = ["no_preprocessing"]

# Init auto-sklearn
auto_sklearn = AutoSklearnRegressor(time_left_for_this_task=60 * 5,
                                    per_run_time_limit=360,
                                    include_estimators=estimators_to_use,
                                    exclude_estimators=None,
                                    include_preprocessors=preprocessing_to_use,
                                    exclude_preprocessors=None,
                                    ml_memory_limit=6156,
                                    resampling_strategy="cv",
                                    resampling_strategy_arguments={"folds": 5})

# Train models
auto_sklearn.fit(X=X_train.copy(), y=y_train.copy(), metric=mean_squared_error)
it_fits = auto_sklearn.refit(X=X_train.copy(), y=y_train.copy())

# Predict
y_hat = auto_sklearn.predict(X_test)

# Show results
auto_sklearn.cv_results_
auto_sklearn.sprint_statistics()
auto_sklearn.show_models()
auto_sklearn.get_models_with_weights()

# TPOT

from tpot import TPOTRegressor