def test_forecast_with_exog():
    """
    Check forecasting with a pipeline that joins a 'lagged' branch and an
    'exog_ts_data_source' branch in a final 'linear' node.

    The flattened forecast must be element-wise equal to the expected values
    returned by ``synthetic_with_exogenous_ts``.
    """
    (train_source_ts, predict_source_ts,
     train_exog_ts, predict_exog_ts, ts_test) = synthetic_with_exogenous_ts()

    # Branch with the source series; the window of the lagged transformation
    # is taken from ``window_size``, which is defined outside this function
    lagged_branch = PrimaryNode('lagged')
    lagged_branch.custom_params = {'window_size': window_size}

    # Branch with the exogenous variable
    exog_branch = PrimaryNode('exog_ts_data_source')

    root = SecondaryNode('linear', nodes_from=[lagged_branch, exog_branch])
    pipeline = Pipeline(root)

    fit_input = MultiModalData({
        'exog_ts_data_source': train_exog_ts,
        'lagged': train_source_ts
    })
    pipeline.fit(input_data=fit_input)

    predict_input = MultiModalData({
        'exog_ts_data_source': predict_exog_ts,
        'lagged': predict_source_ts
    })
    forecast = pipeline.predict(input_data=predict_input)

    prediction = np.ravel(np.array(forecast.predict))
    assert tuple(prediction) == tuple(ts_test)
def fit(self,
        features: Union[str, np.ndarray, pd.DataFrame, InputData, dict],
        target: Union[str, np.ndarray, pd.Series] = 'target',
        predefined_model: Union[str, Pipeline] = None):
    """
    Fit a pipeline with a predefined structure, or compose and fit a new one

    :param features: the array with features of train data
    :param target: the array with target values of train data
    :param predefined_model: the name of the atomic model or Pipeline instance;
        when given, it becomes the current pipeline and composing is not requested
    :return: Pipeline object (the value returned by ``self._obtain_model``)
    :raises ValueError: if predefined_model is neither a str nor a Pipeline
    """
    self.target_name = target
    self.train_data = _define_data(ml_task=self.problem,
                                   features=features,
                                   target=target,
                                   is_predict=False)

    # Composing is requested only when there is neither an existing pipeline
    # nor a model supplied by the caller
    composing_required = self.current_pipeline is None and predefined_model is None

    if predefined_model is not None:
        if isinstance(predefined_model, Pipeline):
            self.current_pipeline = predefined_model
        elif isinstance(predefined_model, str):
            # A model name is wrapped into a single-node pipeline
            self.current_pipeline = Pipeline(PrimaryNode(predefined_model))
        else:
            raise ValueError(
                f'{type(predefined_model)} is not supported as Fedot model'
            )

    return self._obtain_model(composing_required)
def create_pipeline_with_several_nested_atomized_model() -> Pipeline:
    """
    Build a pipeline that mixes atomized-model nodes with 'knn' nodes.

    Layout (parents listed before the node they feed):
      * primary atomized node
      * secondary atomized node <- primary atomized node
      * knn (n_neighbors=9) <- primary atomized node
      * knn (n_neighbors=5) <- primary atomized, secondary atomized, knn (9)
      * root atomized node <- knn (5)

    :return: Pipeline to which the root atomized node was added
    """
    pipeline = Pipeline()

    nested_operation = create_atomized_model_with_several_atomized_models()
    primary_atomized = PrimaryNode(operation_type=nested_operation)

    secondary_atomized = SecondaryNode(operation_type=create_atomized_model(),
                                       nodes_from=[primary_atomized])

    knn_nine = SecondaryNode('knn', nodes_from=[primary_atomized])
    knn_nine.custom_params = {'n_neighbors': 9}

    knn_five = SecondaryNode('knn', nodes_from=[primary_atomized,
                                                secondary_atomized,
                                                knn_nine])
    knn_five.custom_params = {'n_neighbors': 5}

    root_atomized = SecondaryNode(
        operation_type=create_atomized_model_with_several_atomized_models(),
        nodes_from=[knn_five])

    pipeline.add_node(root_atomized)
    return pipeline
def test_pipeline_sequential_fit_correct(data_setup):
    """
    Fit a linear chain of four 'logit' nodes and check the resulting pipeline:
    descriptive id of the root, length, depth, size of the train prediction and
    that the last node holds a fitted operation.
    """
    train, _ = train_test_data_setup(data_setup)

    # Each following node takes the previous one as its only parent
    first = PrimaryNode(operation_type='logit')
    second = SecondaryNode(operation_type='logit', nodes_from=[first])
    third = SecondaryNode(operation_type='logit', nodes_from=[second])
    final = SecondaryNode(operation_type='logit', nodes_from=[third])

    pipeline = Pipeline()
    for chain_node in (first, second, third, final):
        pipeline.add_node(chain_node)

    train_predicted = pipeline.fit(input_data=train, use_fitted=False)

    expected_id = ('(((/n_logit_default_params;)/'
                   'n_logit_default_params;)/'
                   'n_logit_default_params;)/'
                   'n_logit_default_params')
    assert pipeline.root_node.descriptive_id == expected_id

    assert pipeline.length == 4
    assert pipeline.depth == 4
    assert train_predicted.predict.shape[0] == train.target.shape[0]
    assert final.fitted_operation is not None
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """
    Compare ROC AUC of a stacked BernoulliNB + RandomForest pipeline (reported
    as "TPOT") with a FEDOT pipeline scaling -> bernb -> rf on the same data.

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :return: ROC AUC of the FEDOT pipeline on the test data
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    training_features, training_target = train_data.features, train_data.target
    testing_features, testing_target = test_data.features, test_data.target

    # Average CV score on the training set was: 0.93755
    exported_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()),
        RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(training_features, training_target)
    # Second column of predict_proba is used as the score
    tpot_scores = exported_pipeline.predict_proba(testing_features)[:, 1]

    tpot_roc_auc = roc_auc(y_true=testing_target, y_score=tpot_scores)
    print(f'ROC AUC for TPOT: {tpot_roc_auc}')

    # 'rf' receives both the 'bernb' output and the scaled features
    scaling_node = PrimaryNode('scaling')
    bernb_node = SecondaryNode('bernb', nodes_from=[scaling_node])
    rf_node = SecondaryNode('rf', nodes_from=[bernb_node, scaling_node])
    pipeline = Pipeline(rf_node)

    pipeline.fit(train_data)
    fedot_output = pipeline.predict(test_data)

    fedot_roc_auc = roc_auc(y_true=testing_target, y_score=fedot_output.predict)
    print(f'ROC AUC for FEDOT: {fedot_roc_auc}')

    return fedot_roc_auc
def get_roc_auc_value(pipeline: Pipeline, train_data: InputData, test_data: InputData) -> (float, float):
    """
    Calculate ROC AUC of the pipeline predictions on train and test data

    :param pipeline: pipeline used for prediction (it is not fitted here)
    :param train_data: data with train targets
    :param test_data: data with test targets
    :return: tuple (ROC AUC on train, ROC AUC on test)
    """
    train_output = pipeline.predict(input_data=train_data)
    test_output = pipeline.predict(input_data=test_data)

    test_score = roc_auc(y_true=test_data.target, y_score=test_output.predict)
    train_score = roc_auc(y_true=train_data.target, y_score=train_output.predict)

    return train_score, test_score
def run_pipeline_from_automl(train_file_path: str, test_file_path: str,
                             max_run_time: timedelta = timedelta(minutes=10)):
    """ Function run pipeline with Auto ML models in nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for pipeline
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    # total_seconds() covers the whole duration; the ``seconds`` attribute
    # holds only the remainder within a day, so a limit of one day or more
    # would be truncated (exactly one day gives 0).
    # int() keeps the integer type that was passed before.
    node_tpot.operation.params = {'max_run_time_sec': int(max_run_time.total_seconds())}

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])

    OperationTypesRepository.assign_repo('model', 'automl_repository.json')
    pipeline = Pipeline(node_rf)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def run_import_export_example(pipeline_path):
    """
    Fit a pipeline, run node analysis on it, save it and load it back,
    printing the first four predictions before export and after import.

    :param pipeline_path: path passed to ``Pipeline.save`` and used to build the path for loading
    """
    # Prepare data to train the model
    train_data, test_data = get_scoring_data()

    # Get pipeline and fit it
    pipeline = get_three_depth_manual_class_pipeline()
    pipeline.fit_from_scratch(train_data)

    output_before = pipeline.predict(test_data)
    prediction_before_export = np.array(output_before.predict)
    print(f'Before export {prediction_before_export[:4]}')

    analysis = NodesAnalysis(pipeline, train_data, test_data,
                             approaches=[NodeDeletionAnalyze, NodeReplaceOperationAnalyze])
    analysis.analyze()

    # Export it
    pipeline.save(path=pipeline_path)

    # Import pipeline
    json_path_load = create_correct_path(pipeline_path)
    restored_pipeline = Pipeline()
    restored_pipeline.load(json_path_load)

    output_after = restored_pipeline.predict(test_data)
    prediction_after_export = np.array(output_after.predict)
    print(f'After import {prediction_after_export[:4]}')
def _get_metric_value(self, pipeline: Pipeline, metric: MetricByTask) -> float:
    """
    Fit the pipeline on the stored train data and evaluate it on the stored test data

    :param pipeline: pipeline to fit and evaluate
    :param metric: metric object providing ``get_value(true=..., predicted=...)``
    :return: value returned by the metric for the test data
    """
    # use_fitted=False is passed so the fit does not rely on a previous one
    # (presumably forces refitting - confirm against Pipeline.fit)
    pipeline.fit(self._train_data, use_fitted=False)
    test_prediction = pipeline.predict(self._test_data)
    return metric.get_value(true=self._test_data, predicted=test_prediction)
def restore(self, opt_graph: OptGraph):
    """
    Build a Pipeline from the nodes of an OptGraph

    :param opt_graph: graph whose nodes are transformed and placed into the pipeline
    :return: Pipeline with the same uid as ``opt_graph``
    """
    # TODO improve transformation
    restored_nodes = []
    for graph_node in opt_graph.nodes:
        # The return value is ignored and the same object is collected,
        # so the transformation is expected to happen in place
        self._transform_to_pipeline_node(graph_node)
        restored_nodes.append(graph_node)

    restored = Pipeline(restored_nodes)
    restored.uid = opt_graph.uid
    return restored
def pipeline_with_multiple_roots():
    """
    Build a pipeline in which two 'logit' secondary nodes share one 'logit'
    primary parent and neither of them feeds another node.

    :return: Pipeline with the three nodes added
    """
    shared_parent = PrimaryNode(operation_type='logit')
    root_first = SecondaryNode(operation_type='logit', nodes_from=[shared_parent])
    root_second = SecondaryNode(operation_type='logit', nodes_from=[shared_parent])

    pipeline = Pipeline()
    for graph_node in (shared_parent, root_first, root_second):
        pipeline.add_node(graph_node)

    return pipeline
def pipeline_with_cycle():
    """
    Build a pipeline of three 'logit' nodes where the second and the third
    node are parents of each other.

    :return: Pipeline with the three nodes added
    """
    first = PrimaryNode(operation_type='logit')
    second = SecondaryNode(operation_type='logit', nodes_from=[first])
    third = SecondaryNode(operation_type='logit', nodes_from=[second, first])
    # Closing the loop: third already depends on second
    second.nodes_from.append(third)

    pipeline = Pipeline()
    for graph_node in (first, second, third):
        pipeline.add_node(graph_node)

    return pipeline
def pipeline_with_isolated_components():
    """
    Build a pipeline made of two 'logit' chains that are not connected to each
    other: first -> second, and third -> fourth, where third is a secondary
    node with an empty parent list.

    :return: Pipeline with the four nodes added
    """
    first = PrimaryNode(operation_type='logit')
    second = SecondaryNode(operation_type='logit', nodes_from=[first])
    # Secondary node without parents starts the separate component
    third = SecondaryNode(operation_type='logit', nodes_from=[])
    fourth = SecondaryNode(operation_type='logit', nodes_from=[third])

    pipeline = Pipeline()
    for graph_node in (first, second, third, fourth):
        pipeline.add_node(graph_node)

    return pipeline
def pipeline_third():
    """
    Build a two-level pipeline: a 'qda' root with two 'rf' primary parents

    :return: Pipeline to which the root and both of its parents were added
    """
    #    QDA
    #  |     \
    # RF     RF
    pipeline = Pipeline()
    new_node = SecondaryNode('qda')
    for model_type in ('rf', 'rf'):
        new_node.nodes_from.append(PrimaryNode(model_type))
    pipeline.add_node(new_node)
    # add_node is called only for its side effect, so a plain loop is used
    # instead of building a throwaway list of its return values
    for node_from in new_node.nodes_from:
        pipeline.add_node(node_from)
    return pipeline
def test_data_preparation_for_multi_target_correct(multi_target_data_setup):
    """
    Check that ``QualityMetric._simple_prediction`` returns test data whose
    target length equals rows * columns of the source multi-target array.
    """
    train, test = multi_target_data_setup

    linear_pipeline = Pipeline(PrimaryNode('linear'))
    linear_pipeline.fit(input_data=train)

    # Shape is remembered before the conversion call
    rows, columns = test.target.shape[0], test.target.shape[1]

    # Get converted data
    _, converted_test = QualityMetric()._simple_prediction(linear_pipeline, test)

    assert rows * columns == len(converted_test.target)
def test_import_json_to_pipeline_correctly():
    """
    Load a pipeline from 'test_pipeline_convert_to_json', save it again and
    check the result matches the save output of a freshly created pipeline.
    """
    json_path_load = create_correct_path('test_pipeline_convert_to_json')

    loaded_pipeline = Pipeline()
    loaded_pipeline.load(json_path_load)
    json_actual = loaded_pipeline.save('test_import_json_to_pipeline_correctly_1')

    reference_pipeline = create_pipeline()
    json_expected = reference_pipeline.save('test_import_json_to_pipeline_correctly_2')

    actual_dump = json.dumps(json_actual)
    expected_dump = json.dumps(json_expected)
    assert actual_dump == expected_dump
def test_import_json_to_fitted_pipeline_correctly():
    """
    Load a fitted pipeline from JSON, save it again and check that the saved
    text equals the source file content re-serialized with indent=4.
    """
    json_path_load = create_correct_path('test_fitted_pipeline_convert_to_json')

    loaded_pipeline = Pipeline()
    loaded_pipeline.load(json_path_load)
    json_actual = loaded_pipeline.save('test_import_json_to_fitted_pipeline_correctly')

    with open(json_path_load, 'r') as source_file:
        source_content = json.load(source_file)

    json_expected = json.dumps(source_content, indent=4)
    assert json_actual == json_expected
def test_pipeline_with_wrong_data():
    """
    Check that fitting a 'linear' pipeline raises ValueError for a time series
    of 10 elements combined with a forecast length of 10.
    """
    forecasting_task = Task(TaskTypesEnum.ts_forecasting,
                            TsForecastingParams(forecast_length=10))

    # The same sequence serves as index, features and target
    series = np.arange(0, 10)
    wrong_data = InputData(idx=series,
                           features=series,
                           target=series,
                           data_type=DataTypesEnum.ts,
                           task=forecasting_task)

    pipeline = Pipeline(PrimaryNode('linear'))
    with pytest.raises(ValueError):
        pipeline.fit(wrong_data)
def valid_pipeline():
    """
    Build a linear chain of four 'logit' nodes, each fed by the previous one

    :return: Pipeline with the four nodes added
    """
    first = PrimaryNode(operation_type='logit')
    second = SecondaryNode(operation_type='logit', nodes_from=[first])
    third = SecondaryNode(operation_type='logit', nodes_from=[second])
    last = SecondaryNode(operation_type='logit', nodes_from=[third])

    pipeline = Pipeline()
    for chain_node in (first, second, third, last):
        pipeline.add_node(chain_node)

    return pipeline
def get_rmse_value(pipeline: Pipeline, train_data: InputData, test_data: InputData) -> (float, float):
    """
    Calculate the error of the pipeline predictions on train and test data
    using ``mse`` with ``squared=False``

    :param pipeline: pipeline used for prediction (it is not fitted here)
    :param train_data: data with train targets
    :param test_data: data with test targets
    :return: tuple (value on train, value on test)
    """
    train_output = pipeline.predict(input_data=train_data)
    test_output = pipeline.predict(input_data=test_data)

    test_error = mse(y_true=test_data.target,
                     y_pred=test_output.predict,
                     squared=False)
    train_error = mse(y_true=train_data.target,
                      y_pred=train_output.predict,
                      squared=False)

    return train_error, test_error
def create_json_models_files():
    """
    Save three pipelines to JSON for later use in tests: a created one,
    a fitted one and an empty one.
    """
    create_pipeline().save('test_pipeline_convert_to_json')
    create_fitted_pipeline().save('test_fitted_pipeline_convert_to_json')
    Pipeline().save('test_empty_pipeline_convert_to_json')
def test_pipeline_repr():
    """
    Check ``repr`` of a pipeline with an 'xgboost' root and three primary
    parents ('logit', 'lda', 'knn').
    """
    parents = [PrimaryNode(operation_type='logit'),
               PrimaryNode(operation_type='lda'),
               PrimaryNode(operation_type='knn')]
    root = SecondaryNode(operation_type='xgboost', nodes_from=parents)

    pipeline = Pipeline()
    pipeline.add_node(root)

    assert repr(pipeline) == "{'depth': 2, 'length': 4, 'nodes': [xgboost, logit, lda, knn]}"
def test_ts_forecasting_lagged_data_operation():
    """
    Fit a 'lagged' -> 'ridge' pipeline and check that the flattened forecast
    has as many elements as the flattened expected values.
    """
    train_input, predict_input, y_test = get_time_series()

    lagged = PrimaryNode('lagged')
    ridge = SecondaryNode('ridge', nodes_from=[lagged])
    pipeline = Pipeline(ridge)

    pipeline.fit_from_scratch(train_input)
    forecast = pipeline.predict(predict_input)

    flat_forecast = np.ravel(forecast.predict)
    flat_expected = np.ravel(y_test)
    assert len(flat_forecast) == len(flat_expected)
def test_log_clustering_fit_correct(data_fixture, request):
    """
    Fit 'kmeans' on normalized train data and check that the set of predicted
    labels is exactly {0, 1}.

    :param data_fixture: name of the fixture that provides the data
    :param request: pytest request object used to resolve the fixture by name
    """
    data = request.getfixturevalue(data_fixture)
    train_data, _ = train_test_data_setup(data=data)

    # Scaling pipeline. Fit predict it
    scaling_pipeline = Pipeline(PrimaryNode('normalization'))
    scaling_pipeline.fit(train_data)
    scaled_data = scaling_pipeline.predict(train_data)

    kmeans = Model(operation_type='kmeans')
    _, train_predicted = kmeans.fit(data=scaled_data)

    # array_equal also compares shapes, so a different number of labels
    # fails the assertion instead of breaking on the element-wise comparison
    assert np.array_equal(np.unique(train_predicted.predict), [0, 1])
def execute_pipeline_for_text_problem(train_data, test_data):
    """
    Fit a 'text_clean' -> 'tfidf' -> 'multinb' pipeline and score it

    :param train_data: data used to fit the pipeline
    :param test_data: data used for prediction; its target is the ground truth
    :return: ROC AUC of the prediction on the test data
    """
    text_clean = PrimaryNode('text_clean')
    tfidf = SecondaryNode('tfidf', nodes_from=[text_clean])
    multinb = SecondaryNode('multinb', nodes_from=[tfidf])

    pipeline = Pipeline(multinb)
    pipeline.fit(train_data)

    test_output = pipeline.predict(test_data)
    return roc_auc(y_true=test_data.target, y_score=test_output.predict)
def test_save_load_fitted_atomized_pipeline_correctly():
    """
    Save a fitted pipeline with nested atomized models, load it back and check:
    the length is kept, saving the loaded pipeline gives the same result, and
    after refitting the loaded pipeline its test MSE is not worse than the
    MSE of the source pipeline.
    """
    save_name = 'test_save_load_fitted_atomized_pipeline_correctly'

    pipeline = create_pipeline_with_several_nested_atomized_model()
    train_data, test_data = create_data_for_train()
    pipeline.fit(train_data)

    json_actual = pipeline.save(save_name)

    json_path_load = create_correct_path(save_name)
    pipeline_loaded = Pipeline()
    pipeline_loaded.load(json_path_load)
    json_expected = pipeline_loaded.save(
        'test_save_load_fitted_atomized_pipeline_correctly_loaded')

    assert pipeline.length == pipeline_loaded.length
    assert json_actual == json_expected

    source_output = pipeline.predict(test_data)

    # The loaded pipeline is fitted again before prediction
    pipeline_loaded.fit(train_data)
    loaded_output = pipeline_loaded.predict(test_data)

    source_mse = mean_squared_error(y_true=test_data.target,
                                    y_pred=source_output.predict)
    loaded_mse = mean_squared_error(y_true=test_data.target,
                                    y_pred=loaded_output.predict)

    assert loaded_mse <= source_mse
def create_pipeline() -> Pipeline:
    """
    Build a pipeline with an 'xgboost' root fed by 'logit' and 'lda' nodes;
    'lda' and 'xgboost' both get custom params {'n_components': 1}.

    :return: Pipeline to which the root node was added
    """
    pipeline = Pipeline()

    logit = PrimaryNode('logit')

    lda = PrimaryNode('lda')
    lda.custom_params = {'n_components': 1}

    xgboost = SecondaryNode('xgboost', nodes_from=[logit, lda])
    xgboost.custom_params = {'n_components': 1}

    pipeline.add_node(xgboost)
    return pipeline
def test_multi_modal_pipeline():
    """
    Fit a pipeline with image, table and text branches joined by a 'logit'
    node on multimodal data and check that predicting on the same data
    returns a result.
    """
    task = Task(TaskTypesEnum.classification)
    images_size = (128, 128)

    files_path = os.path.join('test', 'data', 'multi_modal')
    path = os.path.join(str(fedot_project_root()), files_path)

    train_num, _, train_img, _, train_text, _ = \
        prepare_multi_modal_data(path, task, images_size, with_split=False)

    # image
    # (a primary 'cnn' node that used to be created here was overwritten by
    # the node below before any use, so it has been removed)
    ds_image = PrimaryNode('data_source_img')
    image_node = SecondaryNode('cnn', nodes_from=[ds_image])
    image_node.custom_params = {'image_shape': (images_size[0], images_size[1], 1),
                                'architecture': 'simplified',
                                'num_classes': 2,
                                'epochs': 15,
                                'batch_size': 128}

    # table
    ds_table = PrimaryNode('data_source_table')
    scaling_node = SecondaryNode('scaling', nodes_from=[ds_table])
    numeric_node = SecondaryNode('rf', nodes_from=[scaling_node])

    # text
    ds_text = PrimaryNode('data_source_text')
    node_text_clean = SecondaryNode('text_clean', nodes_from=[ds_text])
    text_node = SecondaryNode('tfidf', nodes_from=[node_text_clean])

    pipeline = Pipeline(SecondaryNode('logit', nodes_from=[numeric_node, image_node, text_node]))

    # Keys match the operation names of the data source nodes
    fit_data = MultiModalData({
        'data_source_img': train_img,
        'data_source_table': train_num,
        'data_source_text': train_text
    })

    pipeline.fit(fit_data)
    prediction = pipeline.predict(fit_data)

    assert prediction is not None
def sample_pipeline():
    """
    Build a pipeline with a 'logit' root fed by 'xgboost' and 'scaling' nodes

    :return: Pipeline created from the root node
    """
    xgboost = PrimaryNode(operation_type='xgboost')
    scaling = PrimaryNode(operation_type='scaling')
    root = SecondaryNode(operation_type='logit', nodes_from=[xgboost, scaling])
    return Pipeline(root)
def return_working_pipeline():
    """
    Build a pipeline with a 'ridge' root fed by a 'lagged/1' node and an
    'exog_ts_data_source' node

    :return: Pipeline created from the root node
    """
    parents = [PrimaryNode('lagged/1'), PrimaryNode('exog_ts_data_source')]
    ridge_root = SecondaryNode('ridge', nodes_from=parents)
    return Pipeline(ridge_root)