Example #1
    def test_regression_pandas_support(self):
        X, y = sklearn.datasets.fetch_openml(
            data_id=41514,  # diabetes
            return_X_y=True,
            as_frame=True,
        )
        # This test only makes sense if the input is a dataframe
        self.assertTrue(isinstance(X, pd.DataFrame))
        self.assertTrue(isinstance(y, pd.Series))
        automl = AutoSklearnRegressor(
            time_left_for_this_task=30,
            per_run_time_limit=5,
        )

        # Fit works directly on the dataframe input; the regression target needs no encoding
        automl.fit(X, y)

        # Make sure the score is at least better than random.
        # We use the same data for train and test (X_train == X_test) as a sanity check
        self.assertTrue(automl.score(X, y) > 0.5)

        automl.refit(X, y)

        # Make sure the score is at least better than random.
        self.assertTrue(r2(y, automl.predict(X)) > 0.5)
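These test fragments (in this and the later examples) come from larger test modules, so their imports are not shown. Below is a minimal sketch of what they presumably rely on, inferred from how the names are used above; the helper names are assumptions.

import os
import glob

import pandas as pd
import sklearn.datasets

from autosklearn.regression import AutoSklearnRegressor
from autosklearn.metrics import r2, mean_squared_error

# putil.get_dataset, count_succeses, print_debug_information and extract_msg_from_log
# are assumed to be auto-sklearn test helpers that are not shown in these fragments.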
Example #2
    def test_cv_regression(self):
        """
        Makes sure that when using a cv strategy, we are able to fit
        a regressor
        """
        tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit_cv')
        output = os.path.join(self.test_dir, '..', '.out_regression_fit_cv')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset(
            'boston', train_size_maximum=300)
        automl = AutoSklearnRegressor(time_left_for_this_task=60,
                                      per_run_time_limit=10,
                                      resampling_strategy='cv',
                                      tmp_folder=tmp,
                                      output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (206, ))
        score = r2(Y_test, predictions)
        print(Y_test)
        print(predictions)
        self.assertGreaterEqual(score, 0.1)
        self.assertGreater(self._count_succeses(automl.cv_results_), 0)

        self._tearDown(tmp)
        self._tearDown(output)
Example #3
    def test_cv_regression(self):
        """
        Makes sure that when using a cv strategy, we are able to fit
        a regressor
        """
        tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit')
        output = os.path.join(self.test_dir, '..', '.out_regression_fit')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
        automl = AutoSklearnRegressor(time_left_for_this_task=30,
                                      per_run_time_limit=5,
                                      resampling_strategy='cv',
                                      tmp_folder=tmp,
                                      output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (356,))
        score = mean_squared_error(Y_test, predictions)
        # The mean_squared_error metric here is maximized (negated MSE): a score of -30 means
        # the predictions are on average np.sqrt(30) (~5.5) away from the target.
        # Using select rates drops the average score to a range of -32.40 to -37 under the
        # 30 second constraint; with more time_left_for_this_task this is no longer an issue.
        self.assertGreaterEqual(score, -37)

        self._tearDown(tmp)
        self._tearDown(output)
Example #4
class AutoSklearnRegressorEnsemble(AutoSklearnModel, Ensemble):
    """
    Wrapper that exposes an auto-sklearn regressor as an Ensemble model.
    """
    _kind: ModelType = 'regressor'

    def __init__(self, **kwargs) -> None:
        Ensemble.__init__(self)
        client = Client(processes=False,
                        n_workers=kwargs['n_jobs'],
                        threads_per_worker=1,
                        dashboard_address=None)
        self.model = AutoSklearnRegressor(**kwargs, dask_client=client)

    def autosklearn_model(self) -> AutoSklearnRegressor:
        return self.model

    def predict(self, X: np.ndarray) -> np.ndarray:
        """ Get the models prediction """
        return self.model.predict(X)

    def model_predictions(self, X: np.ndarray) -> np.ndarray:
        """ Get the models probability predicitons """
        return np.asarray([m.predict(X) for m in self.models()])

    @classmethod
    def kind(cls) -> ModelType:
        return cls._kind
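A minimal usage sketch for the wrapper above, assuming AutoSklearnModel, Ensemble and ModelType come from the surrounding project and that X_train, y_train and X_test are NumPy arrays:

# Minimal usage sketch (assumes AutoSklearnModel, Ensemble and ModelType are provided
# by the surrounding project; models() is assumed to come from the Ensemble base class).
ensemble = AutoSklearnRegressorEnsemble(
    n_jobs=2,                    # consumed by the dask Client created in __init__
    time_left_for_this_task=60,  # forwarded to AutoSklearnRegressor
    per_run_time_limit=10,
)
ensemble.autosklearn_model().fit(X_train, y_train)
y_pred = ensemble.predict(X_test)               # ensemble prediction
per_model = ensemble.model_predictions(X_test)  # one row per constituent model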
Example #5
def test_cv_regression(tmp_dir, output_dir, dask_client):
    """
    Makes sure that when using a cv strategy, we are able to fit
    a regressor
    """

    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)
    automl = AutoSklearnRegressor(time_left_for_this_task=60,
                                  per_run_time_limit=10,
                                  resampling_strategy='cv',
                                  tmp_folder=tmp_dir,
                                  dask_client=dask_client,
                                  output_folder=output_dir)

    automl.fit(X_train, Y_train)

    # Log file path
    log_file_path = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))[0]

    predictions = automl.predict(X_test)
    assert predictions.shape == (206, )
    score = r2(Y_test, predictions)
    assert score >= 0.1, extract_msg_from_log(log_file_path)
    assert count_succeses(
        automl.cv_results_) > 0, extract_msg_from_log(log_file_path)
Example #6
def test_regression_pandas_support(tmp_dir, output_dir, dask_client):

    X, y = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )
    # This test only makes sense if the input is a dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    automl = AutoSklearnRegressor(
        time_left_for_this_task=40,
        per_run_time_limit=5,
        dask_client=dask_client,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )

    # Fit works directly on the dataframe input; the regression target needs no encoding
    automl.fit(X, y)

    # Make sure the score is at least better than random.
    # We use the same data for train and test (X_train == X_test) as a sanity check
    assert automl.score(X, y) >= 0.5, print_debug_information(automl)

    automl.refit(X, y)

    # Make sure the score is at least better than random.
    assert r2(y, automl.predict(X)) > 0.5, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
Example #7
    def test_regression(self):
        output = os.path.join(self.test_dir, '..', '.tmp_regression_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
        automl = AutoSklearnRegressor(time_left_for_this_task=20,
                                      per_run_time_limit=5,
                                      tmp_folder=output,
                                      output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (356, ))
        score = mean_squared_error(Y_test, predictions)
        # The score is the negated MSE: -30 means the predictions are on average np.sqrt(30) (~5.5) away from the target.
        self.assertGreaterEqual(score, -30)
Example #8
    def test_regression(self):
        tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit')
        output = os.path.join(self.test_dir, '..', '.out_regression_fit')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
        automl = AutoSklearnRegressor(time_left_for_this_task=20,
                                      per_run_time_limit=5,
                                      tmp_folder=tmp,
                                      output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (356,))
        score = mean_squared_error(Y_test, predictions)
        # The score is the negated MSE: -30 means the predictions are on average np.sqrt(30) (~5.5) away from the target.
        self.assertGreaterEqual(score, -30)
Example #9
class AutoML(AbstractModel):
    def __init__(self):
        super().__init__()
        self.model = AutoSklearnRegressor
        self.m = None  # estimator instance, created lazily in param_search

    def fit(self, x, y, modeldict=None):
        if not self.m:
            self.param_search(x, y)
        self.m.refit(x, y)

    def param_search(self, x, y, time_per_sample=3.5, **kwargs):
        time = int(len(y) * time_per_sample)
        self.m = AutoSklearnRegressor(
            time_left_for_this_task=time,
            resampling_strategy="cv",
            resampling_strategy_arguments={'folds': 10})

        self.m.fit(x,
                   y,
                   metric=mean_squared_error,
                   dataset_name="Land Use Regression")
        # print(self.m.sprint_statistics())
        # score = score_function(y, self.m.predict(x))
        # print("Reached a score of {}.".format(score))

        kf = KFold(n_splits=10, shuffle=True)
        rmse = []
        mae = []
        r2 = []
        for train_index, test_index in kf.split(x, y):
            X_train, X_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]
            self.m.refit(X_train, y_train)
            predictions = self.m.predict(X_test)
            rmse_iter, mae_iter, r2_iter = self.score_function(
                y_test, predictions)
            rmse.append(rmse_iter)
            mae.append(mae_iter)
            r2.append(r2_iter)

        # print("Reached a RMSE of {}, MAE of {} and R2 of {}.".format(np.mean(rmse), np.mean(mae), np.mean(r2)))

        return self.concat_results(np.mean(rmse), np.mean(mae), np.mean(r2))
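A rough usage sketch for the wrapper above, assuming AbstractModel, score_function and concat_results exist in the surrounding project and that mean_squared_error is the autosklearn.metrics scorer:

# Rough usage sketch (x and y are NumPy arrays; AbstractModel, score_function and
# concat_results are assumed to be defined elsewhere in the project).
model = AutoML()
model.fit(x, y)             # first call runs param_search (~3.5 s budget per sample), then refits
y_hat = model.m.predict(x)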
Example #10
def test_regression(tmp_dir, output_dir, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
    automl = AutoSklearnRegressor(time_left_for_this_task=30,
                                  per_run_time_limit=5,
                                  tmp_folder=tmp_dir,
                                  dask_client=dask_client,
                                  output_folder=output_dir)

    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (356, )
    score = mean_squared_error(Y_test, predictions)

    # The mean_squared_error metric here is maximized (negated MSE): a score of -30 means
    # the predictions are on average np.sqrt(30) (~5.5) away from the target.
    # Using select rates drops the average score to a range of -32.40 to -37 under the
    # 30 second constraint; with more time_left_for_this_task this is no longer an issue.
    assert score >= -37, print_debug_information(automl)
    assert count_succeses(automl.cv_results_) > 0
Example #11
def test_cv_regression(tmp_dir, output_dir, dask_client):
    """
    Makes sure that when using a cv strategy, we are able to fit
    a regressor
    """

    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)
    automl = AutoSklearnRegressor(time_left_for_this_task=60,
                                  per_run_time_limit=10,
                                  resampling_strategy='cv',
                                  tmp_folder=tmp_dir,
                                  dask_client=dask_client,
                                  output_folder=output_dir)

    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (206, )
    score = r2(Y_test, predictions)
    assert score >= 0.1, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
Example #12
    ml_memory_limit=1024 * 8,
    time_left_for_this_task=3600,
    resampling_strategy='cv',
    #                              ensemble_size=1,
    #                              initial_configurations_via_metalearning=0,
    resampling_strategy_arguments={'folds': 5})
start = time.time()

#X_train = X_train.astype('float') # when?
automl.fit(X_train, y_train,
           dataset_name='boston_housing')  #change dataset name accordingly
automl.refit(X_train.copy(), y_train.copy())
print(
    '[INFO] Elapsed time finding best model: {} seconds.'.format(time.time() -
                                                                 start))

predictions = automl.predict(X_test)
#print('--- CLASSIFICATION REPORT: ---')        #not for regression
#print(classification_report(y_test, predictions, digits=5))
print('\n\n--- MODELS: ---')
print(automl.show_models())
print('\n\n--- STATISTICS: ---')
print(automl.sprint_statistics())

#-----CLASSIFIER-----
#print('\n\n--- SCORE: ---')
#print("Balanced error score", 1 - balanced_accuracy_score(y_test, predictions))

#-----REGRESSION-----
print('\n\n--- SCORE: ---')
print("R2 score", r2_score(y_test, predictions))
Example #13
 dataframe = read_csv(address)
 print(time.strftime("Start time is %Y-%m-%d %H:%M:%S", time.localtime()))
 # split into input and output elements
 data = dataframe.values
 data = data.astype('int')
 X, y = data[:, :-1], data[:, -1]
 print(X.shape, y.shape)
 # split into train and test sets
 X_train, X_test, y_train, y_test = train_test_split(X,
                                                     y,
                                                     test_size=0.33,
                                                     random_state=1)
 # define search
 model = AutoSklearnRegressor(time_left_for_this_task=5 * 60,
                              per_run_time_limit=30,
                              n_jobs=8)
 # perform the search
 model.fit(X_train, y_train)
 # summarize
 # print(model.sprint_statistics())
 # evaluate best model
 y_hat = model.predict(X_test)
 mae = mean_absolute_error(y_test, y_hat)
 r2Score = r2_score(y_test, y_hat)
 mape = mean_absolute_percentage_error(y_test, y_hat)
 mse = mean_squared_error(y_test, y_hat)
 print(time.strftime("End time is %Y-%m-%d %H:%M:%S", time.localtime()))
 print("MAE: %.3f" % mae)
 print("R2_score: %.3f" % r2Score)
 print("MAPE: %.3f" % mape)
 print("MSE: %.3f" % mse)
Example #14
class AutoSklearnSolver:
    """Model implemented on top of auto-sklearn.
    https://github.com/automl/auto-sklearn

    Args:
        model_dir: Path to the model directory
        time_limit: Time limit for training the model (s)
        memory_limit: Limit on the amount of memory used (MB)

    Attributes:
        model_dir (str): Path to the model directory
        config (Config): Model parameters
        model ([AutoSklearnClassifier, AutoSklearnRegressor]): auto-sklearn model object
        per_run_time_limit (int): Time limit for a single training run
        metrics_object (autosklearn.metrics): Model quality metric object
        procesed_data_path (str): Path where the processed data is saved
    """
    def __init__(self,
                 model_dir: str,
                 time_limit: int = 0,
                 memory_limit: int = 0) -> None:
        os.makedirs(model_dir, exist_ok=True)
        self.model_dir = model_dir
        self.config = Config(model_dir, time_limit, memory_limit)
        self.model = None
        self.per_run_time_limit = min(360, time_limit // 2)

    @time_logging
    def fit(self, train_csv: str, mode: str, metrics_name: str,
            save_processed_data: bool) -> None:
        """Start model fitting
        Запуск процесса обучения модели

        Args:
            train_csv: Путь к обучающему датасету
            mode: Режим работы (классификация или регрессия)
            metrics_name: Имя объекта метрики качества в модуле autosklearn.metrics
            save_processed_data: Флаг сохранения датасета с обработанными данными
        """
        if not os.path.exists(train_csv):
            log('Data file {} does not exist!'.format(train_csv))
            return

        # get the metric object
        try:
            self.metrics_object = getattr(metrics, metrics_name)
        except AttributeError as error:
            self.metrics_object = None
            log('Can\'t get the metrics object!')
            log('{}: {}'.format(type(error).__name__, error))
            return

        # prepare the directory for saving processed data
        if save_processed_data:
            self.procesed_data_path = os.path.join(self.model_dir,
                                                   'processed_data')
            os.makedirs(self.procesed_data_path, exist_ok=True)

        self.config['task'] = 'fit'
        self.config['mode'] = mode
        self.config['tmp_dir'] = self.config['model_dir'] + '/tmp'

        # remove the temporary directory
        # (auto-sklearn complains before starting if this is not done)
        shutil.rmtree(self.config['tmp_dir'], ignore_errors=True)

        # initial analysis: read the data and split it into the X and y matrices
        df = read_df(train_csv, self.config)
        y = df['target']
        X = df.drop('target', axis=1)

        # process the data
        process_dataframe(X, self.config)

        if save_processed_data:
            log('Saving processed data')
            X.to_csv(os.path.join(self.procesed_data_path, 'X.csv'))
            y.to_csv(os.path.join(self.procesed_data_path, 'y.csv'))

        # parameters of the auto-sklearn model to be created
        # (preprocessing is disabled because it has already been done)
        model_params = {
            'time_left_for_this_task': self.config.time_left(),
            'per_run_time_limit': self.per_run_time_limit,
            'ml_memory_limit': self.config['memory_limit'],
            'tmp_folder': self.config['tmp_dir'],
            'include_preprocessors': ['no_preprocessing'],
            'delete_tmp_folder_after_terminate': True
        }

        # initialize the model object
        self.model_init(model_params)

        # fit the model
        self.model_fit(X, y, self.metrics_object)

        log('model_fitted: {}'.format(type(self.model)))
        log('autosklearn model contains:')
        log(self.model.show_models())

    @time_logging
    def model_init(self, model_params: Dict[str, Any]) -> None:
        """Model initialization
        Инициализация объекта модели в зависимости от типа задачи

        Args:
            model_params: Словарь параметров модели
        """
        if self.config['mode'] == 'classification':
            self.model = AutoSklearnClassifier(**model_params)
        elif self.config['mode'] == 'regression':
            self.model = AutoSklearnRegressor(**model_params)

    @time_logging
    def model_fit(self, X: pd.DataFrame, y: pd.Series,
                  metrics: Callable) -> None:
        """Model fitting wrapper
        Обертка для вызова fit (для учета времени в логе)

        Args:
            X: Матрица признаков
            y: Вектор ответов
            metrics: Объект метрики качества
        """
        # подавляем вывод предупреждений в лог
        warnings.filterwarnings('ignore', category=FutureWarning)
        warnings.filterwarnings('ignore', category=RuntimeWarning)

        self.model.fit(X, y, metric=metrics)

        warnings.resetwarnings()

    @time_logging
    def predict(self, test_csv: str, prediction_csv: str, validation_csv: str,
                need_proba: bool) -> pd.DataFrame:
        """Start model prediction
        Запуск процесса предсказывания целевого признака на новых данных

        Args:
            test_csv: Путь к тестовому датасету
            prediction_csv: Путь для записи ответов модели
            validation_csv: Путь к датасету правильных ответов на тестовой выборке (для подсчета метрики)
            need_proba: Флаг необходимости выдавать вероятностные предсказания

        Returns:
            Датасет с ответами модели
        """
        if not os.path.exists(test_csv):
            log('Data file {} does not exist!'.format(test_csv))
            return

        self.config['task'] = 'predict'

        df = read_df(test_csv, self.config)
        process_dataframe(df, self.config)

        predictions_df = self.model_predict(df, prediction_csv, need_proba)

        if validation_csv != 'None':
            self.model_validate(predictions_df, validation_csv)

        return predictions_df

    @time_logging
    def model_predict(self, X: pd.DataFrame, prediction_csv: str,
                      need_proba: bool) -> pd.DataFrame:
        """Model predict wrapper
        Обертка для вызова predict

        Args:
            X: Матрица признаков
            prediction_csv: Путь для записи ответов модели
            need_proba: Флаг необходимости выдавать вероятностные предсказания
        """
        if (self.config['mode'] == 'classification') and need_proba:
            predictions = self.model.predict_proba(X, n_jobs=-1)
            df_columns = ['target_0', 'target_1']
        else:
            predictions = self.model.predict(X, n_jobs=-1)
            df_columns = ['target']

        # prepare the directory for writing the predictions
        output_dir = '/'.join(prediction_csv.split('/')[:-1])
        os.makedirs(output_dir, exist_ok=True)

        # write the dataframe with the predictions
        predictions_df = pd.DataFrame(predictions,
                                      index=X.index,
                                      columns=df_columns)
        predictions_df.to_csv(prediction_csv)

        return predictions_df

    @time_logging
    def model_validate(self, predictions_df: pd.DataFrame,
                       validation_csv: str) -> None:
        """Model validate
        Валидирование модели по известным правильным ответам

        Args:
            prediction_csv: Путь для записи ответов модели
            validation_csv: Путь к датасету правильных ответов на тестовой выборке
        """
        if self.metrics_object is None:
            log('Can\'t get the metrics object!')
            return

        if not os.path.exists(validation_csv):
            log('Validation file {} does not exist!'.format(validation_csv))
            return

        # read the dataset with ground-truth answers
        validation_df = pd.read_csv(validation_csv, encoding='utf-8', sep=',')

        # merge the ground-truth and predicted answers so that they align by index
        compare_df = pd.merge(validation_df, predictions_df, on="line_id")

        # compute the score
        # the merged dataframe has the following column indices:
        # 0: index, 1: true values, 2-...: predicted values
        score = self.metrics_object(compare_df.iloc[:, 1].values,
                                    compare_df.iloc[:, 2:].values)
        log('Metrics: {}'.format(self.metrics_object))
        log('Score: {}'.format(score))

        return score

    @time_logging
    def save(self) -> None:
        """Save model, parameters and metrics object
        Сохранение на диск модели, параметров и объекта метрики
        """
        self.config.save()
        with open(os.path.join(self.config['model_dir'], 'model.pkl'),
                  'wb') as f:
            pickle.dump(self.model, f, protocol=pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(self.config['model_dir'], 'metrics_object.pkl'),
                  'wb') as f:
            pickle.dump(self.metrics_object,
                        f,
                        protocol=pickle.HIGHEST_PROTOCOL)

    @time_logging
    def load(self) -> None:
        """Load model, parameters and metrics object
        Загрузка с диска модели, параметров и объекта метрики
        """
        self.config.load()
        with open(os.path.join(self.config['model_dir'], 'model.pkl'),
                  'rb') as f:
            self.model = pickle.load(f)
        with open(os.path.join(self.config['model_dir'], 'metrics_object.pkl'),
                  'rb') as f:
            self.metrics_object = pickle.load(f)

    def __repr__(self) -> str:
        repr_string = 'AutoSklearnSolver\n'
        repr_string += '-----------------\n'
        repr_string += str(self.config)
        return repr_string
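A hypothetical end-to-end run of the solver above; the paths and the metric name are placeholders, and Config, read_df, process_dataframe and log are helpers from the surrounding project:

# Hypothetical usage of AutoSklearnSolver (paths and metric name are placeholders).
solver = AutoSklearnSolver(model_dir='models/example', time_limit=600, memory_limit=2048)
solver.fit(train_csv='data/train.csv',
           mode='regression',
           metrics_name='mean_squared_error',  # resolved via getattr(autosklearn.metrics, ...)
           save_processed_data=False)
solver.save()

# Later, e.g. in a separate process:
solver = AutoSklearnSolver(model_dir='models/example')
solver.load()
solver.predict(test_csv='data/test.csv',
               prediction_csv='predictions/test.csv',
               validation_csv='None',  # the literal string 'None' skips validation
               need_proba=False)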
Example #15
              '_lead' + str(l))

    # summarize
    file = open(
        'log_files/' + ml_type + '_t' + str(time_left_for_this_task) +
        '_lead' + str(l) + '.txt', 'w')

    file.write(model.sprint_statistics())
    file.write('\n')
    file.write(model.show_models())
    file.close()

    print(model.sprint_statistics())
    print(model.show_models())
    # evaluate best model
    y_hat = model.predict(X_val)
    metric = calc_metrics(y_val, y_hat, ml_type)
    if ml_type == 'regression':
        metrics[l] = metric
    elif ml_type == 'classification':
        metrics[l, :] = metric
    print("************************************")
    print("lead:" + str(l) + ", metric: " + str(metric))
    print("************************************")

print("**************************")
print(metrics)
np.save(
    outpath + "automl_accuracy_detrend_t" + str(time_left_for_this_task) +
    "_" + ml_type + ".npy", metrics)
Example #16
auto_sklearn = AutoSklearnRegressor(time_left_for_this_task=60 * 5,
                                    per_run_time_limit=360,
                                    include_estimators=estimators_to_use,
                                    exclude_estimators=None,
                                    include_preprocessors=preprocessing_to_use,
                                    exclude_preprocessors=None,
                                    ml_memory_limit=6156,
                                    resampling_strategy="cv",
                                    resampling_strategy_arguments={"folds": 5})

# Train models
auto_sklearn.fit(X=X_train.copy(), y=y_train.copy(), metric=mean_squared_error)
it_fits = auto_sklearn.refit(X=X_train.copy(), y=y_train.copy())

# Predict
y_hat = auto_sklearn.predict(X_test)

# Show results
auto_sklearn.cv_results_
auto_sklearn.sprint_statistics()
auto_sklearn.show_models()
auto_sklearn.get_models_with_weights()

# TPOT

from tpot import TPOTRegressor

tpot_config = {
    "sklearn.linear_model.Ridge": {},
    "sklearn.ensemble.RandomForestClassifier": {},
    "sklearn.ensemble.ExtraTreesClassifier": {},