def test_multivariate_ts():
    forecast_length = 1

    file_path_train = 'cases/data/metocean/metocean_data_train.csv'
    full_path_train = os.path.join(str(fedot_project_root()), file_path_train)

    # a dataset for a final validation of the composed model
    file_path_test = 'cases/data/metocean/metocean_data_test.csv'
    full_path_test = os.path.join(str(fedot_project_root()), file_path_test)

    target_history, add_history, obs = prepare_input_data(full_path_train,
                                                          full_path_test)

    historical_data = {
        'ws': add_history,  # additional variable
        'ssh': target_history,  # target variable
    }

    fedot = Fedot(problem='ts_forecasting',
                  composer_params=composer_params,
                  task_params=TsForecastingParams(forecast_length=forecast_length))

    fedot.fit(features=historical_data, target=target_history)
    forecast = fedot.forecast(historical_data, forecast_length=forecast_length)
    assert forecast is not None
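# Several snippets in this section reference a module-level `composer_params`
# dict that is never shown. A minimal sketch of what such a configuration might
# look like; the exact keys and values are assumptions chosen to keep the
# evolutionary search small, not the suite's actual settings:
composer_params = {'max_depth': 2,
                   'max_arity': 2,
                   'pop_size': 10,
                   'num_of_generations': 10,
                   'timeout': 0.1,
                   'preset': 'ultra_light'}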
def test_pandas_input_for_api():
    train_data, test_data, threshold = get_dataset('classification')

    train_features = pd.DataFrame(train_data.features)
    train_target = pd.Series(train_data.target)

    test_features = pd.DataFrame(test_data.features)
    test_target = pd.Series(test_data.target)

    # task selection, initialisation of the framework
    baseline_model = Fedot(problem='classification')

    # fit model without optimisation - single XGBoost node is used
    baseline_model.fit(features=train_features, target=train_target,
                       predefined_model='xgboost')

    # evaluate the prediction with test data
    prediction = baseline_model.predict(features=test_features)
    assert len(prediction) == len(test_target)

    # evaluate quality metric for the test sample
    baseline_metrics = baseline_model.get_metrics(metric_names='f1',
                                                  target=test_target)
    assert baseline_metrics['f1'] > 0
def run_multi_output_case(path, vis=False):
    """ Launches the case of river level prediction on the Lena river
    as a multi-output regression task

    :param path: path to the file with the table
    :param vis: whether to visualise the pipeline and the predictions
    """
    target_columns = ['1_day', '2_day', '3_day', '4_day', '5_day', '6_day', '7_day']

    data = InputData.from_csv(path, target_columns=target_columns,
                              columns_to_drop=['date'])
    train, test = train_test_data_setup(data)

    problem = 'regression'

    automl_model = Fedot(problem=problem)
    automl_model.fit(features=train)
    predicted_array = automl_model.predict(features=test)

    # convert output into a one-dimensional array
    forecast = np.ravel(predicted_array)

    mae_value = mean_absolute_error(np.ravel(test.target), forecast)
    print(f'MAE - {mae_value:.2f}')

    if vis:
        plot_predictions(predicted_array, test)
def run_one_model_with_specific_evaluation_mod(train_data, test_data, mode: str = None):
    """ Runs the example with a single SVC model.

    :param train_data: train data for pipeline training
    :param test_data: test data for pipeline evaluation
    :param mode: pass 'gpu' flag to make gpu evaluation
    """
    problem = 'classification'

    if mode == 'gpu':
        baseline_model = Fedot(problem=problem, preset='gpu')
    else:
        baseline_model = Fedot(problem=problem)
    svc_node_with_custom_params = PrimaryNode('svc')
    # the custom params are needed to make probability evaluation available,
    # otherwise an error occurs
    svc_node_with_custom_params.custom_params = dict(kernel='rbf', C=10, gamma=1,
                                                     cache_size=2000,
                                                     probability=True)
    preset_pipeline = Pipeline(svc_node_with_custom_params)

    start = datetime.now()
    baseline_model.fit(features=train_data, target='target',
                       predefined_model=preset_pipeline)
    print(f'Completed with custom params in: {datetime.now() - start}')

    baseline_model.predict(features=test_data)
    print(baseline_model.get_metrics())
def test_classification_quality_improvement():
    # input data initialization
    train_data_path = f'{project_root()}/cases/data/scoring/scoring_train.csv'
    test_data_path = f'{project_root()}/cases/data/scoring/scoring_test.csv'

    problem = 'classification'

    baseline_model = Fedot(problem=problem)
    baseline_model.fit(features=train_data_path, target='target',
                       predefined_model='xgboost')
    expected_baseline_quality = 0.823
    baseline_model.predict_proba(features=test_data_path)
    baseline_metrics = baseline_model.get_metrics()

    # define parameters for composing
    composer_params = {'max_depth': 3,
                       'max_arity': 3,
                       'pop_size': 20,
                       'num_of_generations': 20,
                       'learning_time': 10,
                       'with_tuning': True}

    auto_model = Fedot(problem=problem, composer_params=composer_params,
                       seed=42, verbose_level=4)
    auto_model.fit(features=train_data_path, target='target')
    auto_model.predict_proba(features=test_data_path)
    auto_metrics = auto_model.get_metrics()

    print(auto_metrics['roc_auc'])
    assert auto_metrics['roc_auc'] > baseline_metrics['roc_auc'] >= expected_baseline_quality
def test_multiobj_for_api():
    train_data, test_data, _ = get_dataset('classification')

    composer_params['composer_metric'] = ['f1', 'node_num']
    model = Fedot(problem='classification', composer_params=composer_params)

    model.fit(features=train_data)
    prediction = model.predict(features=test_data)
    metric = model.get_metrics()

    assert len(prediction) == len(test_data.target)
    assert metric['f1'] > 0
    assert model.best_models is not None
def test_multi_target_regression_composing_correct(multi_target_data_setup):
    # load simple dataset for multi-target regression
    train, test = multi_target_data_setup

    problem = 'regression'
    simple_composer_params = get_simple_composer_params()

    automl_model = Fedot(problem=problem, composer_params=simple_composer_params)
    automl_model.fit(features=train)
    predicted_array = automl_model.predict(features=test)
    assert predicted_array is not None
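# `get_simple_composer_params` is a helper that is not shown in this section.
# A minimal sketch of what it might return, assuming a deliberately small
# search space so the test finishes quickly (the exact values are assumptions):
def get_simple_composer_params() -> dict:
    return {'max_depth': 2,
            'max_arity': 2,
            'pop_size': 10,
            'num_of_generations': 10,
            'timeout': 0.1,
            'preset': 'ultra_light'}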
def test_api_forecast_correct(task_type: str = 'ts_forecasting'):
    # the forecast length must be equal to 12
    forecast_length = 12
    train_data, test_data, _ = get_dataset(task_type)

    model = Fedot(problem='ts_forecasting', composer_params=composer_params,
                  task_params=TsForecastingParams(forecast_length=forecast_length))

    model.fit(features=train_data)
    ts_forecast = model.predict(features=train_data)
    metric = model.get_metrics(target=test_data.target, metric_names='rmse')

    assert len(ts_forecast) == forecast_length
    assert metric['rmse'] >= 0
def run_ts_forecasting_example(with_plot=True, with_pipeline_vis=True, timeout=None):
    train_data_path = f'{fedot_project_root()}/examples/data/salaries.csv'

    target = pd.read_csv(train_data_path)['target']

    # define the forecast length and the task parameters
    forecast_length = 30
    task_parameters = TsForecastingParams(forecast_length=forecast_length)

    # init model for the time series forecasting
    model = Fedot(problem='ts_forecasting', task_params=task_parameters,
                  timeout=timeout)

    # run AutoML model design
    pipeline = model.fit(features=train_data_path, target='target')
    if with_pipeline_vis:
        pipeline.show()

    # use model to obtain the forecast
    forecast = model.predict(features=train_data_path)

    print(model.get_metrics(metric_names=['rmse', 'mae', 'mape'], target=target))

    # plot forecasting result
    if with_plot:
        model.plot_prediction()

    return forecast
def run_metocean_forecasting_problem(train_file_path, test_file_path,
                                     forecast_length=1, is_visualise=False,
                                     timeout=5):
    # prepare data for train and test
    ssh_history, ws_history, ssh_obs = prepare_input_data(train_file_path,
                                                          test_file_path)

    historical_data = {
        'ws': ws_history,  # additional variable
        'ssh': ssh_history,  # target variable
    }

    fedot = Fedot(problem='ts_forecasting',
                  task_params=TsForecastingParams(forecast_length=forecast_length),
                  timeout=timeout, verbose_level=4)

    pipeline = fedot.fit(features=historical_data, target=ssh_history)
    fedot.forecast(historical_data, forecast_length=forecast_length)
    metric = fedot.get_metrics(target=ssh_obs)

    if is_visualise:
        pipeline.show()
        fedot.plot_prediction()

    return metric
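# `prepare_input_data` is a helper from the metocean case that is not shown
# here. Both metocean snippets unpack it as (target history, history of the
# exogenous variable, observed target values). A rough sketch of that contract;
# the column names and the plain-numpy interface are assumptions:
def prepare_input_data(train_file_path, test_file_path):
    df_train = pd.read_csv(train_file_path)
    df_test = pd.read_csv(test_file_path)
    ssh_history = np.ravel(np.array(df_train['sea_height']))  # target variable
    ws_history = np.ravel(np.array(df_train['wind_speed']))  # exogenous variable
    ssh_obs = np.ravel(np.array(df_test['sea_height']))  # values for validation
    return ssh_history, ws_history, ssh_obs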
def test_api_forecast_numpy_input_with_static_model_correct(task_type: str = 'ts_forecasting'):
    forecast_length = 10
    train_data, test_data, _ = get_dataset(task_type)
    model = Fedot(problem='ts_forecasting',
                  task_params=TsForecastingParams(forecast_length=forecast_length))

    # define chain for prediction
    node_lagged = PrimaryNode('lagged')
    chain = Chain(SecondaryNode('linear', nodes_from=[node_lagged]))

    model.fit(features=train_data.features, target=train_data.target,
              predefined_model=chain)

    ts_forecast = model.predict(features=train_data)
    metric = model.get_metrics(target=test_data.target, metric_names='rmse')

    assert len(ts_forecast) == forecast_length
    assert metric['rmse'] >= 0
def test_baseline_with_api():
    train_data, test_data, threshold = get_dataset('classification')

    # task selection, initialisation of the framework
    baseline_model = Fedot(problem='classification')

    # fit model without optimisation - single XGBoost node is used
    baseline_model.fit(features=train_data, target='target',
                       predefined_model='xgboost')

    # evaluate the prediction with test data
    prediction = baseline_model.predict_proba(features=test_data)

    assert len(prediction) == len(test_data.target)

    # evaluate quality metric for the test sample
    baseline_metrics = baseline_model.get_metrics(metric_names='f1')

    assert baseline_metrics['f1'] > 0
def test_api_predict_correct(task_type: str = 'classification'):
    train_data, test_data, _ = get_dataset(task_type)
    model = Fedot(problem=task_type, composer_params=composer_params)
    fedot_model = model.fit(features=train_data)
    prediction = model.predict(features=test_data)
    metric = model.get_metrics()

    assert isinstance(fedot_model, Pipeline)
    assert len(prediction) == len(test_data.target)
    assert metric['f1'] > 0
def run_credit_scoring_problem(train_file_path, test_file_path,
                               timeout: float = 5.0, is_visualise=False,
                               with_tuning=False, cache_path=None):
    preset = 'light_tun' if with_tuning else 'light'
    automl = Fedot(problem='classification', timeout=timeout,
                   verbose_level=4, preset=preset)

    automl.fit(train_file_path)
    predict = automl.predict(test_file_path)
    metrics = automl.get_metrics()

    if is_visualise:
        automl.current_pipeline.show()

    print(f'Composed ROC AUC is {round(metrics["roc_auc"], 3)}')

    return metrics["roc_auc"]
def run_classification_example(timeout=None):
    train_data_path = f'{fedot_project_root()}/cases/data/scoring/scoring_train.csv'
    test_data_path = f'{fedot_project_root()}/cases/data/scoring/scoring_test.csv'

    problem = 'classification'

    baseline_model = Fedot(problem=problem, timeout=timeout)
    baseline_model.fit(features=train_data_path, target='target',
                       predefined_model='xgboost')

    baseline_model.predict(features=test_data_path)
    print(baseline_model.get_metrics())

    auto_model = Fedot(problem=problem, seed=42, timeout=timeout)
    auto_model.fit(features=train_data_path, target='target')
    prediction = auto_model.predict_proba(features=test_data_path)
    print(auto_model.get_metrics())

    return prediction
def run_regression_example():
    data_path = f'{fedot_project_root()}/cases/data/cholesterol/cholesterol.csv'

    data = InputData.from_csv(data_path)
    train, test = train_test_data_setup(data)

    problem = 'regression'

    baseline_model = Fedot(problem=problem)
    baseline_model.fit(features=train, predefined_model='xgbreg')

    baseline_model.predict(features=test)
    print(baseline_model.get_metrics())

    auto_model = Fedot(problem=problem, seed=42)
    auto_model.fit(features=train, target='target')
    prediction = auto_model.predict(features=test)
    print(auto_model.get_metrics())

    return prediction
def run_pipeline_with_specific_evaluation_mode(train_data: InputData,
                                               test_data: InputData,
                                               mode: str = None):
    """ Runs the example with a 3-node pipeline.

    :param train_data: train data for pipeline training
    :param test_data: test data for pipeline evaluation
    :param mode: pass 'gpu' flag to make gpu evaluation
    """
    problem = 'classification'

    if mode == 'gpu':
        baseline_model = Fedot(problem=problem, preset='gpu')
    else:
        baseline_model = Fedot(problem=problem)

    svc_node_with_custom_params = PrimaryNode('svc')
    # the custom params make probability evaluation available for SVC
    svc_node_with_custom_params.custom_params = dict(kernel='rbf', C=10, gamma=1,
                                                     cache_size=2000,
                                                     probability=True)

    logit_node = PrimaryNode('logit')
    rf_node = SecondaryNode('rf', nodes_from=[svc_node_with_custom_params,
                                              logit_node])
    preset_pipeline = Pipeline(rf_node)

    start = datetime.now()
    baseline_model.fit(features=train_data, target='target',
                       predefined_model=preset_pipeline)
    print(f'Completed with custom params in: {datetime.now() - start}')

    baseline_model.predict(features=test_data)
    print(baseline_model.get_metrics())
def run_classification_multiobj_example(with_plot=True):
    train_data = pd.read_csv(f'{project_root()}/examples/data/Hill_Valley_with_noise_Training.data')
    test_data = pd.read_csv(f'{project_root()}/examples/data/Hill_Valley_with_noise_Testing.data')
    target = test_data['class']
    del test_data['class']
    problem = 'classification'

    auto_model = Fedot(problem=problem, learning_time=2, preset='light',
                       composer_params={'metric': ['f1', 'node_num']}, seed=42)
    auto_model.fit(features=train_data, target='class')
    prediction = auto_model.predict_proba(features=test_data)
    print(auto_model.get_metrics(target))

    if with_plot:
        auto_model.best_models.show()

    return prediction
def make_forecast(df, len_forecast: int, time_series_label: str):
    """ Function for time series forecasting with the FEDOT framework

    :param df: dataframe to process
    :param len_forecast: forecast length
    :param time_series_label: name of the time series to process

    :return predicted_values: forecast
    :return model_name: name of the model (always 'FEDOT')
    """
    # define parameters
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    # init model for the time series forecasting
    model = Fedot(problem='ts_forecasting', task_params=task.task_params,
                  composer_params={'timeout': 1, 'preset': 'ultra_light_tun'},
                  preset='ultra_light_tun')

    input_data = InputData(idx=np.arange(0, len(df)),
                           features=np.array(df[time_series_label]),
                           target=np.array(df[time_series_label]),
                           task=task,
                           data_type=DataTypesEnum.ts)

    start_forecast = len(df)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=np.array(df[time_series_label]),
                              target=np.array(df[time_series_label]),
                              task=task,
                              data_type=DataTypesEnum.ts)

    # run AutoML model design
    pipeline = model.fit(features=input_data)
    predicted_values = model.predict(predict_input)

    model_name = 'FEDOT'
    return predicted_values, model_name
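# A hypothetical usage of `make_forecast` on a synthetic series; the dataframe,
# column name and horizon below are illustrative assumptions, not part of the case:
def demo_make_forecast():
    ts_df = pd.DataFrame({'value': np.sin(np.linspace(0, 20, 200))})
    predicted_values, model_name = make_forecast(ts_df, len_forecast=10,
                                                 time_series_label='value')
    print(f'{model_name} forecast: {predicted_values}')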
def test_cv_api_correct():
    composer_params = {'max_depth': 1,
                       'max_arity': 2,
                       'timeout': 0.0001,
                       'preset': 'ultra_light',
                       'cv_folds': 10}
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose, dataset_to_validate = get_data(task)
    model = Fedot(problem='classification', composer_params=composer_params,
                  verbose_level=2)
    fedot_model = model.fit(features=dataset_to_compose)
    prediction = model.predict(features=dataset_to_validate)
    metric = model.get_metrics()

    assert isinstance(fedot_model, Pipeline)
    assert len(prediction) == len(dataset_to_validate.target)
    assert metric['f1'] > 0
def test_api_cv_correct():
    """ Checks if the composer works correctly when using cross validation
    for time series through the api """
    folds = 2
    _, forecast_len, validation_blocks, time_series = configure_experiment()
    composer_params = {'max_depth': 1,
                       'max_arity': 2,
                       'timeout': 0.05,
                       'preset': 'ultra_light',
                       'cv_folds': folds,
                       'validation_blocks': validation_blocks}
    task_parameters = TsForecastingParams(forecast_length=forecast_len)

    model = Fedot(problem='ts_forecasting', composer_params=composer_params,
                  task_params=task_parameters, verbose_level=2)
    fedot_model = model.fit(features=time_series)

    # the test succeeds if composing completed without an exception
    assert fedot_model is not None
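# `configure_experiment` is a helper that is not shown in this section. A rough
# sketch under the assumption that it builds a short synthetic series wrapped
# in InputData; the return signature mirrors how the test above unpacks it,
# and the concrete values are illustrative:
def configure_experiment():
    forecast_len = 5
    validation_blocks = 2
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_len))
    ts = np.sin(np.linspace(0, 10, 200))
    time_series = InputData(idx=np.arange(len(ts)), features=ts, target=ts,
                            task=task, data_type=DataTypesEnum.ts)
    return task, forecast_len, validation_blocks, time_series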
def run_oil_forecasting(path_to_file, path_to_file_crm, len_forecast,
                        len_forecast_full, ax, well_id, timeout):
    if timeout is None:
        timeout = 1
    # a regex separator requires the python parsing engine
    df = pd.read_csv(path_to_file, sep=' *, *', engine='python')
    df_crm = pd.read_csv(path_to_file_crm, sep=' *, *', engine='python')

    len_forecast_for_split = len_forecast_full
    dates, target_train, data_fit, data_predict, input_data_fit, input_data_predict, test_data, \
        train_data, time_series = prepare_dataset(df, len_forecast, len_forecast_for_split, well_id)

    dates, target_train_crm, data_fit_crm, data_predict_crm, input_data_fit_crm, input_data_predict_crm, \
        test_data_crm, train_data, time_series = prepare_dataset(df_crm, len_forecast,
                                                                 len_forecast_for_split, well_id)

    task_parameters = TsForecastingParams(forecast_length=len_forecast)

    if not os.path.exists(f'pipeline_{well_id}/pipeline_{well_id}.json'):
        model = Fedot(problem='ts_forecasting', task_params=task_parameters,
                      composer_params={'timeout': timeout},
                      preset='ultra_light', verbose_level=4)

        # run AutoML model design
        pipeline = model.fit(features=data_fit, target=target_train)
        pipeline.save(f'pipeline_{well_id}')  # , datetime_in_path=False
    else:
        pipeline = Pipeline()
        pipeline.load(f'pipeline_{well_id}/pipeline_{well_id}.json')

    if not os.path.exists(f'pipeline_crm_{well_id}/pipeline_crm_{well_id}.json'):
        model = Fedot(problem='ts_forecasting', task_params=task_parameters,
                      composer_params={'timeout': timeout},
                      preset='ultra_light', verbose_level=4)

        # run AutoML model design
        pipeline_crm = model.fit(features=data_fit_crm, target=target_train_crm)
        pipeline_crm.save(f'pipeline_crm_{well_id}')  # , datetime_in_path=False
    else:
        pipeline_crm = Pipeline()
        pipeline_crm.load(f'pipeline_crm_{well_id}/pipeline_crm_{well_id}.json')

    sources = dict((f'data_source_ts/{data_part_key}', data_part)
                   for (data_part_key, data_part) in input_data_predict.items())
    input_data_predict_mm = MultiModalData(sources)

    sources_crm = dict((f'data_source_ts/{data_part_key}', data_part)
                       for (data_part_key, data_part) in input_data_predict_crm.items())
    input_data_predict_mm_crm = MultiModalData(sources_crm)

    forecast = in_sample_ts_forecast(pipeline, input_data_predict_mm,
                                     horizon=len_forecast_full)
    forecast_crm = in_sample_ts_forecast(pipeline_crm, input_data_predict_mm_crm,
                                         horizon=len_forecast_full)

    predicted = np.ravel(np.array(forecast))
    predicted_crm = np.ravel(np.array(forecast_crm))
    predicted_only_crm = np.asarray(df_crm[f'crm_{well_id}'][-len_forecast_full:])
    test_data = np.ravel(test_data)

    print('CRM')
    predicted_only_crm[np.isnan(predicted_only_crm)] = 0
    mse_before = mean_squared_error(test_data, predicted_only_crm, squared=False)
    mae_before = mean_absolute_error(test_data, predicted_only_crm)
    print(f'RMSE - {mse_before:.4f}')
    print(f'MAE - {mae_before:.4f}\n')

    print('ML')
    mse_before = mean_squared_error(test_data, predicted, squared=False)
    mae_before = mean_absolute_error(test_data, predicted)
    print(f'RMSE - {mse_before:.4f}')
    print(f'MAE - {mae_before:.4f}\n')

    print('AutoML+CRM')
    mse_before = mean_squared_error(test_data, predicted_crm, squared=False)
    mae_before = mean_absolute_error(test_data, predicted_crm)
    print(f'RMSE - {mse_before:.4f}')
    print(f'MAE - {mae_before:.4f}\n')

    if ax:
        x_for = range(len(train_data), len(time_series))
        ax.plot(x_for, time_series[-len_forecast_full:],
                label='Actual time series', linewidth=0.5)
        ax.plot(x_for, predicted_crm, label='AutoML+CRM', linewidth=0.5)
        ax.plot(x_for, predicted_only_crm, label='CRM', linewidth=0.5)

        ci_crm = t_conf_interval(np.std(predicted_crm), 0.975,
                                 len(predicted_crm)) * 1.96
        ax.fill_between(x_for, (predicted_crm - ci_crm), (predicted_crm + ci_crm),
                        color='orange', alpha=.5)

        ci_crmonly = t_conf_interval(np.std(predicted_only_crm), 0.975,
                                     len(predicted_only_crm)) * 1.96
        ax.fill_between(x_for, (predicted_only_crm - ci_crmonly),
                        (predicted_only_crm + ci_crmonly),
                        color='green', alpha=.5)

        ax.set(xlabel='Days from 2013.06.01', ylabel='Oil volume, m3')
        if well_id == '5351':
            ax.legend()
        ax.set_title(well_id)
        ax.plot()
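# `t_conf_interval` is a helper that is not shown in this section. A plausible
# sketch using scipy, assuming it returns the half-width of a Student's t
# confidence interval for the given standard deviation, quantile and sample size:
from scipy import stats


def t_conf_interval(std, percentile, n):
    quantile = stats.t.ppf(percentile, n - 1)
    return quantile * std / np.sqrt(n)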
def run_additional_learning_example():
    train_data_path = f'{fedot_project_root()}/cases/data/scoring/scoring_train.csv'
    test_data_path = f'{fedot_project_root()}/cases/data/scoring/scoring_test.csv'

    train_data = pd.read_csv(train_data_path)
    test_data = pd.read_csv(test_data_path)
    test_data_target = test_data['target']
    del test_data['target']

    problem = 'classification'

    initial_pipeline = Pipeline(SecondaryNode('logit',
                                              nodes_from=[PrimaryNode('scaling')]))
    auto_model = Fedot(problem=problem, seed=42, preset='light', timeout=5,
                       composer_params={'initial_pipeline': initial_pipeline})
    auto_model.fit(features=deepcopy(train_data.head(1000)), target='target')
    auto_model.predict_proba(features=deepcopy(test_data))
    print('auto_model', auto_model.get_metrics(target=deepcopy(test_data_target)))

    prev_model = auto_model.current_pipeline
    prev_model.show()

    prev_model.unfit()
    atomized_model = Pipeline(SecondaryNode(operation_type=AtomizedModel(prev_model),
                                            nodes_from=[PrimaryNode('scaling')]))
    non_atomized_model = deepcopy(prev_model)

    train_data = train_data.head(5000)
    timeout = 1

    auto_model_from_atomized = Fedot(problem=problem, seed=42, preset='light',
                                     timeout=timeout,
                                     composer_params={'initial_pipeline': atomized_model},
                                     verbose_level=2)
    auto_model_from_atomized.fit(features=deepcopy(train_data), target='target')
    auto_model_from_atomized.predict_proba(features=deepcopy(test_data))
    auto_model_from_atomized.current_pipeline.show()
    print('auto_model_from_atomized',
          auto_model_from_atomized.get_metrics(deepcopy(test_data_target)))

    auto_model_from_pipeline = Fedot(problem=problem, seed=42, preset='light',
                                     timeout=timeout,
                                     composer_params={'initial_pipeline': non_atomized_model},
                                     verbose_level=2)
    auto_model_from_pipeline.fit(features=deepcopy(train_data), target='target')
    auto_model_from_pipeline.predict_proba(features=deepcopy(test_data))
    auto_model_from_pipeline.current_pipeline.show()
    print('auto_model_from_pipeline',
          auto_model_from_pipeline.get_metrics(deepcopy(test_data_target)))