def test_regression_pandas_support(self):
    """Fit and refit a regressor on dataframe input; it must beat random."""
    # Diabetes dataset (OpenML id 41514), fetched as pandas containers.
    X, y = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )
    # The test is only meaningful when the inputs really are pandas objects.
    self.assertIsInstance(X, pd.DataFrame)
    self.assertIsInstance(y, pd.Series)

    automl = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
    )
    # NOTE(review): the original comment said "error out because y is not
    # encoded", yet no error is asserted here — confirm the intended behavior.
    automl.fit(X, y)

    # Must do at least better than random; X doubles as both train and test
    # set purely as a smoke-test of code quality.
    self.assertGreater(automl.score(X, y), 0.5)

    automl.refit(X, y)
    self.assertGreater(r2(y, automl.predict(X)), 0.5)
def test_cv_regression(self):
    """Make sure a regressor can be fitted with the 'cv' resampling strategy."""
    tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit_cv')
    output = os.path.join(self.test_dir, '..', '.out_regression_fit_cv')
    self._setUp(tmp)
    self._setUp(output)

    # Cap training size so the CV fit stays within the time budget.
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)

    automl = AutoSklearnRegressor(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp,
        output_folder=output,
    )
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (206, ))

    score = r2(Y_test, predictions)
    print(Y_test)
    print(predictions)
    self.assertGreaterEqual(score, 0.1)
    self.assertGreater(self._count_succeses(automl.cv_results_), 0)

    self._tearDown(tmp)
    self._tearDown(output)
def test_cv_regression(self):
    """Make sure a regressor can be fitted with the 'cv' resampling strategy."""
    tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit')
    output = os.path.join(self.test_dir, '..', '.out_regression_fit')
    self._setUp(tmp)
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
    automl = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        resampling_strategy='cv',
        tmp_folder=tmp,
        output_folder=output,
    )
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (356,))

    # NOTE(review): the -37 lower bound only makes sense if this
    # `mean_squared_error` is auto-sklearn's negated (greater-is-better)
    # metric, not sklearn's plain MSE — confirm which one is imported.
    score = mean_squared_error(Y_test, predictions)
    # On average np.sqrt(30) away from the target -> ~5.5 on average.
    # Results with select rates drop the avg score to a range of -32.40 to
    # -37 on a 30 second constraint; with more time_left_for_this_task this
    # is no longer an issue.
    self.assertGreaterEqual(score, -37)

    self._tearDown(tmp)
    self._tearDown(output)
class AutoSklearnRegressorEnsemble(AutoSklearnModel, Ensemble):
    """Ensemble wrapper around an auto-sklearn regressor.

    Spawns a local (in-process) dask client sized by the caller-supplied
    ``n_jobs`` and hands it to :class:`AutoSklearnRegressor`.
    """

    # Identifier used to distinguish regressors from classifiers.
    _kind: ModelType = 'regressor'

    def __init__(self, **kwargs) -> None:
        """Create the underlying AutoSklearnRegressor.

        Args:
            **kwargs: Forwarded to ``AutoSklearnRegressor``; must contain
                ``n_jobs``, which sizes the dask worker pool.
        """
        Ensemble.__init__(self)
        # BUGFIX: dask's Client/LocalCluster parameter is spelled
        # `threads_per_worker`; the original `thread_per_worker` was an
        # unrecognized keyword.
        client = Client(processes=False,
                        n_workers=kwargs['n_jobs'],
                        threads_per_worker=1,
                        dashboard_address=None)
        self.model = AutoSklearnRegressor(**kwargs, dask_client=client)

    def autosklearn_model(self) -> AutoSklearnRegressor:
        """Return the wrapped AutoSklearnRegressor instance."""
        return self.model

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Get the model's prediction for ``X``."""
        return self.model.predict(X)

    def model_predictions(self, X: np.ndarray) -> np.ndarray:
        """Get each ensemble member's individual predictions for ``X``."""
        return np.asarray([m.predict(X) for m in self.models()])

    @classmethod
    def kind(cls) -> ModelType:
        """Return the model kind identifier ('regressor')."""
        return cls._kind
def test_cv_regression(tmp_dir, output_dir, dask_client):
    """Make sure a regressor can be fitted with the 'cv' resampling strategy."""
    # Cap training size so the CV fit stays within the time budget.
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)

    automl = AutoSklearnRegressor(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    automl.fit(X_train, Y_train)

    # Path of the AutoML log, used to enrich assertion failure messages.
    log_file_path = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))[0]

    predictions = automl.predict(X_test)
    assert predictions.shape == (206, )

    score = r2(Y_test, predictions)
    assert score >= 0.1, extract_msg_from_log(log_file_path)
    assert count_succeses(
        automl.cv_results_) > 0, extract_msg_from_log(log_file_path)
def test_regression_pandas_support(tmp_dir, output_dir, dask_client):
    """Fit and refit a regressor on dataframe input; it must beat random."""
    # Diabetes dataset (OpenML id 41514), fetched as pandas containers.
    X, y = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )
    # The test is only meaningful when the inputs really are pandas objects.
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)

    automl = AutoSklearnRegressor(
        time_left_for_this_task=40,
        per_run_time_limit=5,
        dask_client=dask_client,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )
    # NOTE(review): the original comment said "error out because y is not
    # encoded", yet no error is asserted here — confirm the intended behavior.
    automl.fit(X, y)

    # Must do at least better than random; X doubles as both train and test
    # set purely as a smoke-test of code quality.
    assert automl.score(X, y) >= 0.5, print_debug_information(automl)

    automl.refit(X, y)
    assert r2(y, automl.predict(X)) > 0.5, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
def test_regression(self):
    """Smoke-test fitting a regressor on the boston dataset."""
    output = os.path.join(self.test_dir, '..', '.tmp_regression_fit')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
    # NOTE(review): the same directory is used for both tmp_folder and
    # output_folder here — confirm this is intentional.
    automl = AutoSklearnRegressor(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        tmp_folder=output,
        output_folder=output,
    )
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (356, ))

    # NOTE(review): the -30 lower bound only makes sense if this
    # `mean_squared_error` is auto-sklearn's negated (greater-is-better)
    # metric, not sklearn's plain MSE — confirm which one is imported.
    score = mean_squared_error(Y_test, predictions)
    # On average np.sqrt(30) away from the target -> ~5.5 on average
    self.assertGreaterEqual(score, -30)
def test_regression(self):
    """Smoke-test fitting a regressor on the boston dataset."""
    tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit')
    output = os.path.join(self.test_dir, '..', '.out_regression_fit')
    self._setUp(tmp)
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
    automl = AutoSklearnRegressor(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        tmp_folder=tmp,
        output_folder=output,
    )
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    self.assertEqual(predictions.shape, (356,))

    # NOTE(review): the -30 lower bound only makes sense if this
    # `mean_squared_error` is auto-sklearn's negated (greater-is-better)
    # metric, not sklearn's plain MSE — confirm which one is imported.
    score = mean_squared_error(Y_test, predictions)
    # On average np.sqrt(30) away from the target -> ~5.5 on average
    self.assertGreaterEqual(score, -30)
class AutoML(AbstractModel):
    """auto-sklearn regressor with a 10-fold CV parameter search."""

    def __init__(self):
        super().__init__()
        # Class reference kept for compatibility with the original interface.
        self.model = AutoSklearnRegressor
        # BUGFIX: `fit` tests `self.m` before any search has run; without
        # this initialisation the very first call raised AttributeError.
        self.m = None

    def fit(self, x, y, modeldict=None):
        """Fit the model, running the parameter search first if needed.

        Args:
            x: Feature matrix (indexable by integer index arrays).
            y: Target vector.
            modeldict: Unused; kept for interface compatibility.
        """
        if not self.m:
            self.param_search(x, y)
        self.m.refit(x, y)

    def param_search(self, x, y, time_per_sample=3.5, **kwargs):
        """Run the auto-sklearn search, then score it by manual 10-fold CV.

        Args:
            x: Feature matrix.
            y: Target vector.
            time_per_sample: Search budget in seconds per training sample.

        Returns:
            The value built by ``self.concat_results`` from the mean RMSE,
            MAE and R2 over the folds.
        """
        # Total search budget scales with the dataset size.
        budget = int(len(y) * time_per_sample)
        self.m = AutoSklearnRegressor(
            time_left_for_this_task=budget,
            resampling_strategy="cv",
            resampling_strategy_arguments={'folds': 10})
        self.m.fit(x, y, metric=mean_squared_error,
                   dataset_name="Land Use Regression")

        # Manual 10-fold CV of the refitted model for held-out metrics.
        kf = KFold(n_splits=10, shuffle=True)
        rmse_scores = []
        mae_scores = []
        r2_scores = []  # renamed from `r2` to avoid shadowing an r2 metric
        for train_index, test_index in kf.split(x, y):
            X_train, X_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]
            self.m.refit(X_train, y_train)
            predictions = self.m.predict(X_test)
            rmse_iter, mae_iter, r2_iter = self.score_function(
                y_test, predictions)
            rmse_scores.append(rmse_iter)
            mae_scores.append(mae_iter)
            r2_scores.append(r2_iter)

        return self.concat_results(
            np.mean(rmse_scores), np.mean(mae_scores), np.mean(r2_scores))
def test_regression(tmp_dir, output_dir, dask_client):
    """Smoke-test fitting a regressor on the boston dataset."""
    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')

    automl = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (356, )

    # NOTE(review): the -37 bound presumes auto-sklearn's negated
    # (greater-is-better) mean_squared_error metric — confirm the import.
    score = mean_squared_error(Y_test, predictions)
    # On average np.sqrt(30) away from the target -> ~5.5 on average.
    # Results with select rates drop the avg score to a range of -32.40 to
    # -37 on a 30 second constraint; with more time_left_for_this_task this
    # is no longer an issue.
    assert score >= -37, print_debug_information(automl)
    assert count_succeses(automl.cv_results_) > 0
def test_cv_regression(tmp_dir, output_dir, dask_client):
    """Make sure a regressor can be fitted with the 'cv' resampling strategy."""
    # Cap training size so the CV fit stays within the time budget.
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)

    automl = AutoSklearnRegressor(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (206, )

    score = r2(Y_test, predictions)
    assert score >= 0.1, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
ml_memory_limit=1024 * 8, time_left_for_this_task=3600, resampling_strategy='cv', # ensemble_size=1, # initial_configurations_via_metalearning=0, resampling_strategy_arguments={'folds': 5}) start = time.time() #X_train = X_train.astype('float') # when? automl.fit(X_train, y_train, dataset_name='boston_housing') #change dataset name accordingly automl.refit(X_train.copy(), y_train.copy()) print( '[INFO] Elapsed time finding best model: {} seconds.'.format(time.time() - start)) predictions = automl.predict(X_test) #print('--- CLASSIFICATION REPORT: ---') #not for regression #print(classification_report(y_test, predictions, digits=5)) print('\n\n--- MODELS: ---') print(automl.show_models()) print('\n\n--- STATISTICS: ---') print(automl.sprint_statistics()) #-----CLASSIFIER----- #print('\n\n--- SCORE: ---') #print("Balanced error score", 1 - balanced_accuracy_score(y_test, predictions)) #-----REGRESSION----- print('\n\n--- SCORE: ---') print("R2 score", r2_score(y_test, predictions))
# Load the dataset and report when the run started.
dataframe = read_csv(address)
print(time.strftime("Start time is %Y-%m-%d %H:%M:%S", time.localtime()))

# Split into input and output elements; last column is the target.
values = dataframe.values
values = values.astype('int')
X, y = values[:, :-1], values[:, -1]
print(X.shape, y.shape)

# Split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1)

# Define and run the auto-sklearn search (5 minutes total, 30s per run).
model = AutoSklearnRegressor(time_left_for_this_task=5 * 60,
                             per_run_time_limit=30,
                             n_jobs=8)
model.fit(X_train, y_train)
# summarize
# print(model.sprint_statistics())

# Evaluate the best model on the held-out split.
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
r2Score = r2_score(y_test, predictions)
mape = mean_absolute_percentage_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

print(time.strftime("End time is %Y-%m-%d %H:%M:%S", time.localtime()))
print("MAE: %.3f" % mae)
print("R2_score: %.3f" % r2Score)
print("MAPE: %.3f" % mape)
print("MSE: %.3f" % mse)
class AutoSklearnSolver:
    """ Model implementing through auto-sklearn.
    https://github.com/automl/auto-sklearn

    Wraps auto-sklearn classification/regression behind a simple
    fit / predict / save / load interface driven by CSV files.

    Args:
        model_dir: Path to the model directory
        time_limit: Time limit for model training (seconds)
        memory_limit: Limit on the amount of memory used (MB)

    Attributes:
        model_dir (str): Path to the model directory
        config (Config): Model parameters
        model ([AutoSklearnClassifier, AutoSklearnRegressor]): auto-sklearn model object
        per_run_time_limit (int): Time limit for fitting a single model
        metrics_object (autosklearn.metrics): Quality-metric object
        procesed_data_path (str): Path where processed data is saved
    """

    def __init__(self, model_dir: str, time_limit: int = 0,
                 memory_limit: int = 0) -> None:
        os.makedirs(model_dir, exist_ok=True)
        self.model_dir = model_dir
        self.config = Config(model_dir, time_limit, memory_limit)
        self.model = None
        # Cap a single model's fit time at 360s or half the total budget.
        self.per_run_time_limit = min(360, time_limit // 2)

    @time_logging
    def fit(self, train_csv: str, mode: str, metrics_name: str,
            save_processed_data: bool) -> None:
        """Start model fitting

        Runs the full training pipeline: metric lookup, data loading and
        preprocessing, model construction and fitting.

        Args:
            train_csv: Path to the training dataset
            mode: Operating mode (classification or regression)
            metrics_name: Name of the metric object in the autosklearn.metrics module
            save_processed_data: Flag for saving the processed dataset
        """
        if not os.path.exists(train_csv):
            log('Data file {} is not exist!'.format(train_csv))
            return
        # Resolve the metric object by name; abort if it does not exist.
        try:
            self.metrics_object = getattr(metrics, metrics_name)
        except AttributeError as error:
            self.metrics_object = None
            log('Can\'t get the metrics object!')
            log('{}: {}'.format(type(error).__name__, error))
            return
        # Prepare the directory for saving processed data.
        if save_processed_data:
            self.procesed_data_path = os.path.join(self.model_dir,
                                                   'processed_data')
            os.makedirs(self.procesed_data_path, exist_ok=True)
        self.config['task'] = 'fit'
        self.config['mode'] = mode
        self.config['tmp_dir'] = self.config['model_dir'] + '/tmp'
        # Remove the temporary directory
        # (auto-sklearn complains before starting otherwise).
        shutil.rmtree(self.config['tmp_dir'], ignore_errors=True)
        # Initial analysis: read the data, split into X matrix and y vector.
        df = read_df(train_csv, self.config)
        y = df['target']
        X = df.drop('target', axis=1)
        # Data preprocessing (mutates X in place via the project helper).
        process_dataframe(X, self.config)
        if save_processed_data:
            log('Saving processed data')
            X.to_csv(os.path.join(self.procesed_data_path, 'X.csv'))
            y.to_csv(os.path.join(self.procesed_data_path, 'y.csv'))
        # Parameters of the auto-sklearn model to create
        # (preprocessing is disabled because it has already been done).
        model_params = {
            'time_left_for_this_task': self.config.time_left(),
            'per_run_time_limit': self.per_run_time_limit,
            'ml_memory_limit': self.config['memory_limit'],
            'tmp_folder': self.config['tmp_dir'],
            'include_preprocessors': ['no_preprocessing'],
            'delete_tmp_folder_after_terminate': True
        }
        # Initialise the model object for the requested mode.
        self.model_init(model_params)
        # Fit the model.
        self.model_fit(X, y, self.metrics_object)
        log('model_fitted: {}'.format(type(self.model)))
        log('autosklearn model contains:')
        log(self.model.show_models())

    @time_logging
    def model_init(self, model_params: Dict[str, Any]) -> None:
        """Model initialization

        Initialises the model object depending on the task type.

        Args:
            model_params: Dictionary of model parameters
        """
        if self.config['mode'] == 'classification':
            self.model = AutoSklearnClassifier(**model_params)
        elif self.config['mode'] == 'regression':
            self.model = AutoSklearnRegressor(**model_params)

    @time_logging
    def model_fit(self, X: pd.DataFrame, y: pd.Series,
                  metrics: Callable) -> None:
        """Model fitting wrapper

        Wrapper around fit (so elapsed time is recorded in the log).

        Args:
            X: Feature matrix
            y: Target vector
            metrics: Quality-metric object
        """
        # Suppress warning output in the log during the fit.
        warnings.filterwarnings('ignore', category=FutureWarning)
        warnings.filterwarnings('ignore', category=RuntimeWarning)
        self.model.fit(X, y, metric=metrics)
        warnings.resetwarnings()

    @time_logging
    def predict(self, test_csv: str, prediction_csv: str,
                validation_csv: str, need_proba: bool) -> pd.DataFrame:
        """Start model prediction

        Runs prediction of the target feature on new data.

        Args:
            test_csv: Path to the test dataset
            prediction_csv: Path where model answers are written
            validation_csv: Path to the dataset of correct answers on the
                test set (for computing the metric)
            need_proba: Flag for producing probabilistic predictions

        Returns:
            Dataframe with the model's answers
        """
        if not os.path.exists(test_csv):
            log('Data file {} is not exist!'.format(test_csv))
            return
        self.config['task'] = 'predict'
        df = read_df(test_csv, self.config)
        process_dataframe(df, self.config)
        predictions_df = self.model_predict(df, prediction_csv, need_proba)
        # The CLI passes the literal string 'None' when no validation file.
        if validation_csv != 'None':
            self.model_validate(predictions_df, validation_csv)

    @time_logging
    def model_predict(self, X: pd.DataFrame, prediction_csv: str,
                      need_proba: bool) -> pd.DataFrame:
        """Model predict wrapper

        Wrapper around predict.

        Args:
            X: Feature matrix
            prediction_csv: Path where model answers are written
            need_proba: Flag for producing probabilistic predictions
        """
        if (self.config['mode'] == 'classification') and need_proba:
            predictions = self.model.predict_proba(X, n_jobs=-1)
            df_columns = ['target_0', 'target_1']
        else:
            predictions = self.model.predict(X, n_jobs=-1)
            df_columns = ['target']
        # Prepare the output directory for the answers file.
        output_dir = '/'.join(prediction_csv.split('/')[:-1])
        os.makedirs(output_dir, exist_ok=True)
        # Write the dataframe with the answers.
        predictions_df = pd.DataFrame(predictions,
                                      index=X.index,
                                      columns=df_columns)
        predictions_df.to_csv(prediction_csv)
        return predictions_df

    @time_logging
    def model_validate(self, predictions_df: pd.DataFrame,
                       validation_csv: str) -> None:
        """Model validate

        Validates the model against known correct answers.

        Args:
            predictions_df: Dataframe with the model's answers
            validation_csv: Path to the dataset of correct answers
                on the test set
        """
        if self.metrics_object is None:
            log('Can\'t get the metrics object!')
            return
        if not os.path.exists(validation_csv):
            log('Validation file {} is not exist!'.format(validation_csv))
            return
        # Read the dataset with the correct answers.
        validation_df = pd.read_csv(validation_csv, encoding='utf-8', sep=',')
        # Join correct and predicted answers so indexes correspond.
        compare_df = pd.merge(validation_df, predictions_df, on="line_id")
        # Compute the score.
        # The joined dataframe has the following column indexes:
        # 0: index, 1: true values, 2-...: predicted values
        score = self.metrics_object(compare_df.iloc[:, 1].values,
                                    compare_df.iloc[:, 2:].values)
        log('Metrics: {}'.format(self.metrics_object))
        log('Score: {}'.format(score))
        return score

    @time_logging
    def save(self) -> None:
        """Save model, parameters and metrics object

        Persists the model, its parameters and the metric object to disk.
        """
        self.config.save()
        with open(os.path.join(self.config['model_dir'], 'model.pkl'),
                  'wb') as f:
            pickle.dump(self.model, f, protocol=pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(self.config['model_dir'],
                               'metrics_object.pkl'), 'wb') as f:
            pickle.dump(self.metrics_object, f,
                        protocol=pickle.HIGHEST_PROTOCOL)

    @time_logging
    def load(self) -> None:
        """Load model, parameters and metrics object

        Restores the model, its parameters and the metric object from disk.
        NOTE(review): pickle.load on untrusted files is unsafe — only load
        artifacts this solver itself produced.
        """
        self.config.load()
        with open(os.path.join(self.config['model_dir'], 'model.pkl'),
                  'rb') as f:
            self.model = pickle.load(f)
        with open(os.path.join(self.config['model_dir'],
                               'metrics_object.pkl'), 'rb') as f:
            self.metrics_object = pickle.load(f)

    def __repr__(self) -> str:
        repr_string = 'AutoSklearnSolver\n'
        repr_string += '-----------------\n'
        repr_string += str(self.config)
        return repr_string
'_lead' + str(l)) # summarize file = open( 'log_files/' + ml_type + '_t' + str(time_left_for_this_task) + '_lead' + str(l) + '.txt', 'w') file.write(model.sprint_statistics()) file.write('\n') file.write(model.show_models()) file.close() print(model.sprint_statistics()) print(model.show_models()) # evaluate best model y_hat = model.predict(X_val) metric = calc_metrics(y_val, y_hat, ml_type) if ml_type == 'regression': metrics[l] = metric elif ml_type == 'classification': metrics[l, :] = metric print("************************************") print("lead:" + str(l) + ", metric: " + str(metric)) print("************************************") print("**************************") print(metrics) np.save( outpath + "automl_accuracy_detrend_t" + str(time_left_for_this_task) + "_" + ml_type + ".npy", metrics)
auto_sklearn = AutoSklearnRegressor(time_left_for_this_task=60 * 5, per_run_time_limit=360, include_estimators=estimators_to_use, exclude_estimators=None, include_preprocessors=preprocessing_to_use, exclude_preprocessors=None, ml_memory_limit=6156, resampling_strategy="cv", resampling_strategy_arguments={"folds": 5}) # Train models auto_sklearn.fit(X=X_train.copy(), y=y_train.copy(), metric=mean_squared_error) it_fits = auto_sklearn.refit(X=X_train.copy(), y=y_train.copy()) # Predict y_hat = auto_sklearn.predict(X_test) # Show results auto_sklearn.cv_results_ auto_sklearn.sprint_statistics() auto_sklearn.show_models() auto_sklearn.get_models_with_weights() # TPOT from tpot import TPOTRegressor tpot_config = { "sklearn.linear_model.Ridge": {}, "sklearn.ensemble.RandomForestClassifier": {}, "sklearn.ensemble.ExtraTreesClassifier": {},