def test_save_load_fitted_atomized_pipeline_correctly():
    pipeline = create_pipeline_with_several_nested_atomized_model()
    train_data, test_data = create_data_for_train()
    pipeline.fit(train_data)

    json_actual = pipeline.save('test_save_load_fitted_atomized_pipeline_correctly')

    json_path_load = create_correct_path('test_save_load_fitted_atomized_pipeline_correctly')
    pipeline_loaded = Pipeline()
    pipeline_loaded.load(json_path_load)
    json_expected = pipeline_loaded.save('test_save_load_fitted_atomized_pipeline_correctly_loaded')

    assert pipeline.length == pipeline_loaded.length
    assert json_actual == json_expected

    before_save_predicted = pipeline.predict(test_data)
    pipeline_loaded.fit(train_data)
    after_save_predicted = pipeline_loaded.predict(test_data)

    bfr_tun_mse = mean_squared_error(y_true=test_data.target,
                                     y_pred=before_save_predicted.predict)
    aft_tun_mse = mean_squared_error(y_true=test_data.target,
                                     y_pred=after_save_predicted.predict)

    assert aft_tun_mse <= bfr_tun_mse
def run_import_export_example(pipeline_path):
    # Prepare data to train the model
    train_data, test_data = get_scoring_data()

    # Get pipeline and fit it
    pipeline = get_three_depth_manual_class_pipeline()
    pipeline.fit_from_scratch(train_data)

    predicted_output = pipeline.predict(test_data)
    prediction_before_export = np.array(predicted_output.predict)
    print(f'Before export {prediction_before_export[:4]}')

    NodesAnalysis(pipeline, train_data, test_data,
                  approaches=[NodeDeletionAnalyze,
                              NodeReplaceOperationAnalyze]).analyze()

    # Export it
    pipeline.save(path=pipeline_path)

    # Import pipeline
    json_path_load = create_correct_path(pipeline_path)
    new_pipeline = Pipeline()
    new_pipeline.load(json_path_load)

    predicted_output_after_export = new_pipeline.predict(test_data)
    prediction_after_export = np.array(predicted_output_after_export.predict)
    print(f'After import {prediction_after_export[:4]}')
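# A hypothetical entry point (not in the original source) for the example above;
# the directory name passed as pipeline_path is an assumption for illustration.
if __name__ == '__main__':
    run_import_export_example(pipeline_path='import_export_sa_example')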
def test_import_json_to_fitted_pipeline_correctly():
    json_path_load = create_correct_path('test_fitted_pipeline_convert_to_json')

    pipeline = Pipeline()
    pipeline.load(json_path_load)
    json_actual = pipeline.save('test_import_json_to_fitted_pipeline_correctly')

    with open(json_path_load, 'r') as json_file:
        json_expected = json.load(json_file)

    assert json_actual == json.dumps(json_expected, indent=4)
def test_import_json_to_pipeline_correctly():
    json_path_load = create_correct_path('test_pipeline_convert_to_json')

    pipeline = Pipeline()
    pipeline.load(json_path_load)
    json_actual = pipeline.save('test_import_json_to_pipeline_correctly_1')

    pipeline_expected = create_pipeline()
    json_expected = pipeline_expected.save('test_import_json_to_pipeline_correctly_2')

    assert json.dumps(json_actual) == json.dumps(json_expected)
def test_import_custom_json_object_to_pipeline_and_fit_correctly_no_exception():
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/test_custom_json_template.json'
    json_path_load = os.path.join(test_file_path, file)

    train_file_path, test_file_path = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_file_path)

    pipeline = Pipeline()
    pipeline.load(json_path_load)
    pipeline.fit(train_data)
    pipeline.save('test_import_custom_json_object_to_pipeline_and_fit_correctly_no_exception')
def test_save_load_atomized_pipeline_correctly():
    pipeline = create_pipeline_with_several_nested_atomized_model()

    json_actual = pipeline.save('test_save_load_atomized_pipeline_correctly')

    json_path_load = create_correct_path('test_save_load_atomized_pipeline_correctly')
    with open(json_path_load, 'r') as json_file:
        json_expected = json.load(json_file)

    pipeline_loaded = Pipeline()
    pipeline_loaded.load(json_path_load)

    assert pipeline.length == pipeline_loaded.length
    assert json_actual == json.dumps(json_expected, indent=4)
def test_fitted_pipeline_cache_correctness_after_export_and_import():
    train_file_path, test_file_path = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    pipeline = create_classification_pipeline_with_preprocessing()
    pipeline.fit(train_data)
    pipeline.save('test_fitted_pipeline_cache_correctness_after_export_and_import')
    prediction = pipeline.predict(test_data)

    new_pipeline = Pipeline()
    new_pipeline.load(create_correct_path('test_fitted_pipeline_cache_correctness_after_export_and_import'))
    new_prediction = new_pipeline.predict(test_data)

    assert np.array_equal(prediction.predict, new_prediction.predict)
    assert new_pipeline.is_fitted
def run_import_export_example(pipeline_path):
    features_options = {'informative': 1, 'bias': 0.0}
    samples_amount = 100
    features_amount = 2
    x_train, y_train, x_test, y_test = get_regression_dataset(features_options,
                                                              samples_amount,
                                                              features_amount)

    # Define regression task
    task = Task(TaskTypesEnum.regression)

    # Prepare data to train the model
    train_input = InputData(idx=np.arange(0, len(x_train)),
                            features=x_train,
                            target=y_train,
                            task=task,
                            data_type=DataTypesEnum.table)

    predict_input = InputData(idx=np.arange(0, len(x_test)),
                              features=x_test,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.table)

    # Get pipeline and fit it
    pipeline = get_pipeline()
    pipeline.fit_from_scratch(train_input)

    predicted_output = pipeline.predict(predict_input)
    prediction_before_export = np.array(predicted_output.predict)
    print(f'Before export {prediction_before_export[:4]}')

    # Export it
    pipeline.save(path=pipeline_path)

    # Import pipeline
    json_path_load = create_correct_path(pipeline_path)
    new_pipeline = Pipeline()
    new_pipeline.load(json_path_load)

    predicted_output_after_export = new_pipeline.predict(predict_input)
    prediction_after_export = np.array(predicted_output_after_export.predict)
    print(f'After import {prediction_after_export[:4]}')
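# A minimal usage sketch (not from the original source) for the regression
# variant above. It should print the same first four predictions before export
# and after import; the directory name is a hypothetical choice.
if __name__ == '__main__':
    run_import_export_example(pipeline_path='import_export_regression_example')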
def __init__(self, node: Node = None, operation_id: int = None,
             nodes_from: list = None, path: str = None):
    # Imports are placed inside the method to avoid circular imports.
    from fedot.core.pipelines.pipeline import Pipeline
    from fedot.core.pipelines.template import PipelineTemplate
    from fedot.core.operations.atomized_model import AtomizedModel

    super().__init__()
    self.atomized_model_json_path = None
    self.next_pipeline_template = None
    self.pipeline_template = None

    if path:
        pipeline = Pipeline()
        pipeline.load(path)
        self.next_pipeline_template = AtomizedModel(pipeline)
        self.pipeline_template = PipelineTemplate(pipeline)

    if node:
        self._operation_to_template(node, operation_id, nodes_from)
def run_oil_forecasting(path_to_file, path_to_file_crm, len_forecast,
                        len_forecast_full, ax, well_id, timeout):
    if timeout is None:
        timeout = 1

    # The regex separator tolerates spaces around commas
    df = pd.read_csv(path_to_file, sep=' *, *')
    df_crm = pd.read_csv(path_to_file_crm, sep=' *, *')

    len_forecast_for_split = len_forecast_full
    dates, target_train, data_fit, data_predict, input_data_fit, input_data_predict, \
        test_data, train_data, time_series = \
        prepare_dataset(df, len_forecast, len_forecast_for_split, well_id)
    dates, target_train_crm, data_fit_crm, data_predict_crm, input_data_fit_crm, \
        input_data_predict_crm, test_data_crm, train_data, time_series = \
        prepare_dataset(df_crm, len_forecast, len_forecast_for_split, well_id)

    task_parameters = TsForecastingParams(forecast_length=len_forecast)

    if not os.path.exists(f'pipeline_{well_id}/pipeline_{well_id}.json'):
        model = Fedot(problem='ts_forecasting', task_params=task_parameters,
                      composer_params={'timeout': timeout},
                      preset='ultra_light', verbose_level=4)
        # Run AutoML model design
        pipeline = model.fit(features=data_fit, target=target_train)
        pipeline.save(f'pipeline_{well_id}')
    else:
        pipeline = Pipeline()
        pipeline.load(f'pipeline_{well_id}/pipeline_{well_id}.json')

    if not os.path.exists(f'pipeline_crm_{well_id}/pipeline_crm_{well_id}.json'):
        model = Fedot(problem='ts_forecasting', task_params=task_parameters,
                      composer_params={'timeout': timeout},
                      preset='ultra_light', verbose_level=4)
        # Run AutoML model design in the same way
        pipeline_crm = model.fit(features=data_fit_crm, target=target_train_crm)
        pipeline_crm.save(f'pipeline_crm_{well_id}')
    else:
        pipeline_crm = Pipeline()
        pipeline_crm.load(f'pipeline_crm_{well_id}/pipeline_crm_{well_id}.json')

    sources = dict((f'data_source_ts/{data_part_key}', data_part)
                   for (data_part_key, data_part) in input_data_predict.items())
    input_data_predict_mm = MultiModalData(sources)

    sources_crm = dict((f'data_source_ts/{data_part_key}', data_part)
                       for (data_part_key, data_part) in input_data_predict_crm.items())
    input_data_predict_mm_crm = MultiModalData(sources_crm)

    forecast = in_sample_ts_forecast(pipeline, input_data_predict_mm,
                                     horizon=len_forecast_full)
    forecast_crm = in_sample_ts_forecast(pipeline_crm, input_data_predict_mm_crm,
                                         horizon=len_forecast_full)

    predicted = np.ravel(np.array(forecast))
    predicted_crm = np.ravel(np.array(forecast_crm))
    predicted_only_crm = np.asarray(df_crm[f'crm_{well_id}'][-len_forecast_full:])
    test_data = np.ravel(test_data)

    # squared=False makes mean_squared_error return RMSE
    print('CRM')
    predicted_only_crm[np.isnan(predicted_only_crm)] = 0
    rmse_crm = mean_squared_error(test_data, predicted_only_crm, squared=False)
    mae_crm = mean_absolute_error(test_data, predicted_only_crm)
    print(f'RMSE - {rmse_crm:.4f}')
    print(f'MAE - {mae_crm:.4f}\n')

    print('ML')
    rmse_ml = mean_squared_error(test_data, predicted, squared=False)
    mae_ml = mean_absolute_error(test_data, predicted)
    print(f'RMSE - {rmse_ml:.4f}')
    print(f'MAE - {mae_ml:.4f}\n')

    print('AutoML+CRM')
    rmse_ml_crm = mean_squared_error(test_data, predicted_crm, squared=False)
    mae_ml_crm = mean_absolute_error(test_data, predicted_crm)
    print(f'RMSE - {rmse_ml_crm:.4f}')
    print(f'MAE - {mae_ml_crm:.4f}\n')

    if ax:
        x_for = range(len(train_data), len(time_series))
        ax.plot(x_for, time_series[-len_forecast_full:],
                label='Actual time series', linewidth=0.5)
        ax.plot(x_for, predicted_crm, label='AutoML+CRM', linewidth=0.5)
        ax.plot(x_for, predicted_only_crm, label='CRM', linewidth=0.5)

        ci_crm = t_conf_interval(np.std(predicted_crm), 0.975,
                                 len(predicted_crm)) * 1.96
        ax.fill_between(x_for, (predicted_crm - ci_crm), (predicted_crm + ci_crm),
                        color='orange', alpha=.5)

        ci_crm_only = t_conf_interval(np.std(predicted_only_crm), 0.975,
                                      len(predicted_only_crm)) * 1.96
        ax.fill_between(x_for, (predicted_only_crm - ci_crm_only),
                        (predicted_only_crm + ci_crm_only),
                        color='green', alpha=.5)

        ax.set(xlabel='Days from 2013.06.01', ylabel='Oil volume, m3')
        if well_id == '5351':
            ax.legend()
        ax.set_title(well_id)
        ax.plot()
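# A hypothetical invocation sketch (not in the original source): the csv file
# names, well id, and forecast horizons below are assumptions for illustration;
# the input files must contain the columns expected by prepare_dataset.
if __name__ == '__main__':
    import matplotlib.pyplot as plt

    fig, axis = plt.subplots(1, 1)
    run_oil_forecasting(path_to_file='oil_production.csv',
                        path_to_file_crm='oil_production_crm.csv',
                        len_forecast=100,
                        len_forecast_full=200,
                        ax=axis,
                        well_id='5351',
                        timeout=1)
    plt.show()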
class Fedot:
    """
    Main class for FEDOT API

    :param problem: the name of the modelling problem to solve:
        - classification
        - regression
        - ts_forecasting
        - clustering
    :param preset: name of the preset for model building (e.g. 'light', 'ultra-light')
    :param timeout: time for model design (in minutes)
    :param composer_params: parameters of pipeline optimisation
        The possible parameters are:
            'max_depth' - max depth of the pipeline
            'max_arity' - max arity of the pipeline nodes
            'pop_size' - population size for composer
            'num_of_generations' - number of generations for composer
            'timeout' - composing time (minutes)
            'available_operations' - list of model names to use
            'with_tuning' - allow hyperparameters tuning for the model
            'cv_folds' - number of folds for cross-validation
            'validation_blocks' - number of validation blocks for time series forecasting
    :param task_params: additional parameters of the task
    :param seed: value for a fixed random seed
    :param verbose_level: level of output detail
        (-1 - nothing, 0 - errors, 1 - messages, 2 - warnings and info, 3-4 - basic and detailed debug)
    """

    def __init__(self,
                 problem: str,
                 preset: str = None,
                 timeout: Optional[float] = None,
                 composer_params: dict = None,
                 task_params: TaskParams = None,
                 seed=None,
                 verbose_level: int = 1):
        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)

        # Metainfo
        self.problem = problem
        self.composer_params = composer_params
        self.task_params = task_params

        # Model to use
        self.current_pipeline = None
        # Best models for the multi-objective case
        self.best_models = None
        # Composer history
        self.history = None

        # Datasets
        self.train_data = None
        self.test_data = None
        self.prediction = None
        self.prediction_labels = None  # classification-only
        self.target_name = None

        self.log = default_log('FEDOT logger', verbose_level=verbose_level)

        if self.composer_params is None:
            self.composer_params = default_evo_params(self.problem)
        else:
            self.composer_params = {**default_evo_params(self.problem),
                                    **self.composer_params}

        self.metric_to_compose = None
        if 'metric' in self.composer_params:
            self.composer_params['composer_metric'] = self.composer_params['metric']
            del self.composer_params['metric']
            self.metric_to_compose = self.composer_params['composer_metric']

        if timeout is not None:
            self.composer_params['timeout'] = timeout
            # The number of generations is limited by time only
            self.composer_params['num_of_generations'] = 10000

        if self.problem == 'ts_forecasting' and task_params is None:
            self.task_params = TsForecastingParams(forecast_length=30)

        task_dict = {'regression': Task(TaskTypesEnum.regression, task_params=self.task_params),
                     'classification': Task(TaskTypesEnum.classification, task_params=self.task_params),
                     'clustering': Task(TaskTypesEnum.clustering, task_params=self.task_params),
                     'ts_forecasting': Task(TaskTypesEnum.ts_forecasting, task_params=self.task_params)}

        if self.problem == 'clustering':
            raise ValueError('This type of task is not supported in API now')

        self.metric_name = default_test_metric_dict[self.problem]
        self.problem = task_dict[self.problem]

        if preset is None and 'preset' in self.composer_params:
            preset = self.composer_params['preset']
        if 'preset' in self.composer_params:
            del self.composer_params['preset']

        if preset is not None:
            available_operations = filter_operations_by_preset(self.problem, preset)
            self.composer_params['available_operations'] = available_operations
            self.composer_params['with_tuning'] = '_tun' in preset or preset is None
    def _get_params(self):
        param_dict = {'train_data': self.train_data,
                      'task': self.problem,
                      'logger': self.log}
        return {**param_dict, **self.composer_params}

    def _obtain_model(self, is_composing_required: bool = True):
        execution_params = self._get_params()
        if is_composing_required:
            self.current_pipeline, self.best_models, self.history = \
                compose_fedot_model(**execution_params)

        if isinstance(self.best_models, tools.ParetoFront):
            self.best_models.__class__ = ParetoFront
            self.best_models.objective_names = self.metric_to_compose

        self.current_pipeline.fit_from_scratch(self.train_data)
        return self.current_pipeline

    def clean(self):
        """ Clean the fitted model and the obtained predictions """
        self.prediction = None
        self.prediction_labels = None
        self.current_pipeline = None

    def fit(self,
            features: Union[str, np.ndarray, pd.DataFrame, InputData, dict],
            target: Union[str, np.ndarray, pd.Series] = 'target',
            predefined_model: Union[str, Pipeline] = None):
        """
        Fit the graph with a predefined structure, or compose and fit a new graph

        :param features: the array with features of train data
        :param target: the array with target values of train data
        :param predefined_model: the name of the atomic model or a Pipeline instance
        :return: Pipeline object
        """
        self.target_name = target
        self.train_data = _define_data(ml_task=self.problem,
                                       features=features,
                                       target=target,
                                       is_predict=False)

        is_composing_required = True
        if self.current_pipeline is not None:
            is_composing_required = False

        if predefined_model is not None:
            is_composing_required = False
            if isinstance(predefined_model, Pipeline):
                self.current_pipeline = predefined_model
            elif isinstance(predefined_model, str):
                self.current_pipeline = Pipeline(PrimaryNode(predefined_model))
            else:
                raise ValueError(f'{type(predefined_model)} is not supported as Fedot model')

        return self._obtain_model(is_composing_required)

    def predict(self,
                features: Union[str, np.ndarray, pd.DataFrame, InputData, dict],
                save_predictions: bool = False):
        """
        Predict new target using the already fitted model

        :param features: the array with features of test data
        :param save_predictions: if True, save predictions as a csv file in the working directory
        :return: the array with prediction values
        """
        if self.current_pipeline is None:
            raise ValueError(NOT_FITTED_ERR_MSG)

        self.test_data = _define_data(ml_task=self.problem, target=self.target_name,
                                      features=features, is_predict=True)

        if self.problem.task_type == TaskTypesEnum.classification:
            self.prediction_labels = self.current_pipeline.predict(self.test_data,
                                                                   output_mode='labels')
            self.prediction = self.current_pipeline.predict(self.test_data,
                                                            output_mode='probs')
            output_prediction = self.prediction
        elif self.problem.task_type == TaskTypesEnum.ts_forecasting:
            # Convert the forecast into a one-dimensional array
            self.prediction = self.current_pipeline.predict(self.test_data)
            forecast = np.ravel(np.array(self.prediction.predict))
            self.prediction.predict = forecast
            output_prediction = self.prediction
        else:
            self.prediction = self.current_pipeline.predict(self.test_data)
            output_prediction = self.prediction

        if save_predictions:
            save_predict(self.prediction)
        return output_prediction.predict

    def predict_proba(self,
                      features: Union[str, np.ndarray, pd.DataFrame, InputData, dict],
                      save_predictions: bool = False,
                      probs_for_all_classes: bool = False):
        """
        Predict the probability of new target using the already fitted classification model

        :param features: the array with features of test data
        :param save_predictions: if True, save predictions as a csv file in the working directory
        :param probs_for_all_classes: if True, return probabilities for each class, even in the binary case
        :return: the array with prediction values
        """
        if self.current_pipeline is None:
            raise ValueError(NOT_FITTED_ERR_MSG)

        if self.problem.task_type == TaskTypesEnum.classification:
            self.test_data = _define_data(ml_task=self.problem, target=self.target_name,
                                          features=features, is_predict=True)

            mode = 'full_probs' if probs_for_all_classes else 'probs'

            self.prediction = self.current_pipeline.predict(self.test_data,
                                                            output_mode=mode)
            self.prediction_labels = self.current_pipeline.predict(self.test_data,
                                                                   output_mode='labels')

            if save_predictions:
                save_predict(self.prediction)
        else:
            raise ValueError('Probabilities of predictions are available only for classification')

        return self.prediction.predict

    def forecast(self,
                 pre_history: Union[str, Tuple[np.ndarray, np.ndarray], InputData, dict],
                 forecast_length: int = 1,
                 save_predictions: bool = False):
        """
        Forecast the new values of the time series

        :param pre_history: the array with features for the pre-history of the forecast
        :param forecast_length: number of steps to forecast
        :param save_predictions: if True, save predictions as a csv file in the working directory
        :return: the array with prediction values
        """
        # TODO use forecast length
        if self.current_pipeline is None:
            raise ValueError(NOT_FITTED_ERR_MSG)

        if self.problem.task_type != TaskTypesEnum.ts_forecasting:
            raise ValueError('Forecasting can be used only for the time series')

        self.problem = self.train_data.task

        self.test_data = _define_data(ml_task=self.problem,
                                      target=self.target_name,
                                      features=pre_history,
                                      is_predict=True)

        self.current_pipeline = Pipeline(self.current_pipeline.root_node)
        # TODO add incremental forecast
        self.prediction = self.current_pipeline.predict(self.test_data)
        if len(self.prediction.predict.shape) > 1:
            self.prediction.predict = np.squeeze(self.prediction.predict)

        if save_predictions:
            save_predict(self.prediction)
        return self.prediction.predict

    def load(self, path):
        """
        Load a saved graph from disk

        :param path: path to the json file with the model
        """
        # Create a fresh Pipeline first, since current_pipeline may still be None
        self.current_pipeline = Pipeline()
        self.current_pipeline.load(path)

    def plot_prediction(self):
        """ Plot the prediction obtained from the graph """
        if self.prediction is not None:
            if self.problem.task_type == TaskTypesEnum.ts_forecasting:
                plot_forecast(pre_history=self.train_data, forecast=self.prediction)
            else:
                # TODO implement other visualizations
                self.log.error('Not supported yet')
        else:
            self.log.error('No prediction to visualize')
    def get_metrics(self,
                    target: Union[np.ndarray, pd.Series] = None,
                    metric_names: Union[str, List[str]] = None) -> dict:
        """
        Get quality metrics for the fitted graph

        :param target: the array with target values of test data
        :param metric_names: the names of the required metrics
        :return: the values of quality metrics
        """
        if metric_names is None:
            metric_names = self.metric_name

        if target is not None:
            if self.test_data is None:
                self.test_data = InputData(idx=range(len(self.prediction.predict)),
                                           features=None,
                                           target=target[:len(self.prediction.predict)],
                                           task=self.train_data.task,
                                           data_type=self.train_data.data_type)
            else:
                self.test_data.target = target[:len(self.prediction.predict)]

        real = self.test_data

        # TODO change to sklearn metrics
        if not isinstance(metric_names, List):
            metric_names = [metric_names]

        calculated_metrics = dict()
        for metric_name in metric_names:
            if composer_metrics_mapping[metric_name] is NotImplemented:
                self.log.warn(f'{metric_name} is not available as a metric')
            else:
                prediction = self.prediction
                metric_cls = MetricsRepository().metric_class_by_id(
                    composer_metrics_mapping[metric_name])
                if metric_cls.output_mode == 'labels':
                    prediction = self.prediction_labels
                if self.problem.task_type == TaskTypesEnum.ts_forecasting:
                    # Drop nan values from the forecast before metric calculation
                    real.target = real.target[~np.isnan(prediction.predict)]
                    prediction.predict = prediction.predict[~np.isnan(prediction.predict)]

                metric_value = abs(metric_cls.metric(reference=real,
                                                     predicted=prediction))
                calculated_metrics[metric_name] = metric_value

        return calculated_metrics
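# A minimal usage sketch for the API class above (not from the original source).
# Only methods defined in this class are used; the csv paths and the 'target'
# column name are hypothetical, and the short timeout keeps the run cheap.
if __name__ == '__main__':
    model = Fedot(problem='classification', timeout=1, seed=42)
    model.fit(features='train.csv', target='target')
    prediction = model.predict(features='test.csv')
    print(model.get_metrics())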