def sample_pipeline():
    """
    Build a small two-level pipeline.

    :return: Pipeline whose 'logit' root is fed by 'xgboost' and 'scaling' primary nodes
    """
    xgboost_node = PrimaryNode(operation_type='xgboost')
    scaling_node = PrimaryNode(operation_type='scaling')
    logit_root = SecondaryNode(operation_type='logit',
                               nodes_from=[xgboost_node, scaling_node])
    return Pipeline(logit_root)
def get_simple_pipeline():
    """
    Return a simple pipeline: 'lagged' (window_size=150) -> 'ridge'.

    :return: Pipeline with the 'ridge' node as the root
    """
    lagged = PrimaryNode('lagged')
    lagged.custom_params = {'window_size': 150}

    ridge_root = SecondaryNode('ridge', nodes_from=[lagged])
    return Pipeline(ridge_root)
def ts_pipeline_with_incorrect_data_flow():
    r"""
    Build a time series pipeline whose data flow is intentionally incorrect.

    Connection lagged -> lagged is incorrect
    Connection ridge -> ar is incorrect also

    lagged - lagged - ridge \
                             ar -> final forecast
             lagged - ridge /

    :return: Pipeline with the 'ar' node as the root
    """
    # The docstring is a raw string so the backslash in the diagram is kept
    # literally instead of being interpreted as an escape sequence.

    # First level
    node_lagged = PrimaryNode('lagged')

    # Second level
    node_lagged_1 = SecondaryNode('lagged', nodes_from=[node_lagged])
    node_lagged_2 = PrimaryNode('lagged')

    # Third level
    node_ridge_1 = SecondaryNode('ridge', nodes_from=[node_lagged_1])
    node_ridge_2 = SecondaryNode('ridge', nodes_from=[node_lagged_2])

    # Fourth level - root node
    node_final = SecondaryNode('ar', nodes_from=[node_ridge_1, node_ridge_2])
    pipeline = Pipeline(node_final)

    return pipeline
def run_pipeline_from_automl(train_file_path: str, test_file_path: str,
                             max_run_time: timedelta = timedelta(minutes=10)):
    """
    Fit a pipeline that contains an AutoML ('tpot') node and report its ROC AUC.

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for pipeline
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    # total_seconds() covers the whole duration; the `.seconds` attribute holds
    # only the sub-day remainder, so e.g. timedelta(days=1) would yield 0.
    node_tpot.operation.params = {
        'max_run_time_sec': int(max_run_time.total_seconds())
    }

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])
    OperationTypesRepository.assign_repo('model', 'automl_repository.json')
    pipeline = Pipeline(node_rf)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def run_one_model_with_specific_evaluation_mod(train_data, test_data, mode: str = None):
    """
    Fit and evaluate a single predefined 'svc' pipeline through the Fedot API.

    :param train_data: data passed as features when fitting
    :param test_data: data passed as features when predicting
    :param mode: 'gpu' selects the 'gpu' preset; any other value uses the default setup
    """
    fedot_kwargs = {'problem': 'classification'}
    if mode == 'gpu':
        fedot_kwargs['preset'] = 'gpu'
    api = Fedot(**fedot_kwargs)

    svc_node = PrimaryNode('svc')
    # probability=True is required so that probability evaluation is available;
    # without these custom params an error occurs
    svc_node.custom_params = {
        'kernel': 'rbf',
        'C': 10,
        'gamma': 1,
        'cache_size': 2000,
        'probability': True,
    }
    svc_pipeline = Pipeline(svc_node)

    # Time only the fitting step
    started_at = datetime.now()
    api.fit(features=train_data, target='target', predefined_model=svc_pipeline)
    print(f'Completed with custom params in: {datetime.now() - started_at}')

    api.predict(features=test_data)
    print(api.get_metrics())
def return_working_pipeline():
    """
    Build a pipeline where 'lagged/1' and 'exog_ts_data_source' nodes feed a 'ridge' root.

    :return: Pipeline with the 'ridge' node as the root
    """
    lagged = PrimaryNode('lagged/1')
    exog_source = PrimaryNode('exog_ts_data_source')

    ridge_root = SecondaryNode('ridge', nodes_from=[lagged, exog_source])
    return Pipeline(ridge_root)
def test_forecast_with_exog():
    """Check that a lagged + exogenous 'linear' pipeline reproduces the expected series."""
    (train_source_ts, predict_source_ts,
     train_exog_ts, predict_exog_ts, ts_test) = synthetic_with_exogenous_ts()

    # Node that receives the source time series
    lagged = PrimaryNode('lagged')
    # window_size is not defined inside this function; it is taken from the enclosing scope
    lagged.custom_params = {'window_size': window_size}

    # Node that receives the exogenous variable
    exog_source = PrimaryNode('exog_ts_data_source')

    linear_root = SecondaryNode('linear', nodes_from=[lagged, exog_source])
    pipeline = Pipeline(linear_root)

    fit_input = MultiModalData({
        'exog_ts_data_source': train_exog_ts,
        'lagged': train_source_ts
    })
    pipeline.fit(input_data=fit_input)

    predict_input = MultiModalData({
        'exog_ts_data_source': predict_exog_ts,
        'lagged': predict_source_ts
    })
    forecast = pipeline.predict(input_data=predict_input)

    flat_forecast = np.ravel(np.array(forecast.predict))

    assert tuple(flat_forecast) == tuple(ts_test)
def get_complex_pipeline():
    r"""
    Build a four-level time series pipeline looking like this

    smoothing - lagged - ridge \
                                \
                                 ridge -> final forecast
                                /
                lagged - ridge /

    :return: Pipeline with the final 'ridge' node as the root
    """
    # The docstring is a raw string so the backslashes in the diagram are kept
    # literally instead of being interpreted as escape sequences.

    # First level
    node_smoothing = PrimaryNode('smoothing')

    # Second level
    node_lagged_1 = SecondaryNode('lagged', nodes_from=[node_smoothing])
    node_lagged_2 = PrimaryNode('lagged')

    # Third level
    node_ridge_1 = SecondaryNode('ridge', nodes_from=[node_lagged_1])
    node_ridge_2 = SecondaryNode('ridge', nodes_from=[node_lagged_2])

    # Fourth level - root node
    node_final = SecondaryNode('ridge', nodes_from=[node_ridge_1, node_ridge_2])
    pipeline = Pipeline(node_final)

    return pipeline
def get_nodes():
    """
    Create a small node hierarchy: two 'knn' nodes -> 'lda' -> 'logit'.

    :return: list of nodes in the order [root, 'lda' node, first 'knn', second 'knn']
    """
    knn_a = PrimaryNode('knn')
    knn_b = PrimaryNode('knn')
    lda_node = SecondaryNode('lda', nodes_from=[knn_a, knn_b])
    logit_root = SecondaryNode('logit', nodes_from=[lda_node])

    return [logit_root, lda_node, knn_a, knn_b]
def pipeline_with_incorrect_task_type():
    """
    Build a pipeline of 'linear' and 'kmeans' operations paired with a classification task.

    :return: tuple (pipeline, Task(TaskTypesEnum.classification))
    """
    linear_a = PrimaryNode(operation_type='linear')
    linear_b = PrimaryNode(operation_type='linear')
    kmeans_root = SecondaryNode(operation_type='kmeans',
                                nodes_from=[linear_a, linear_b])

    pipeline = Pipeline(kmeans_root)
    task = Task(TaskTypesEnum.classification)
    return pipeline, task
def pipeline_with_incorrect_root_operation():
    """
    Build a pipeline whose root is a 'scaling' operation fed by two 'logit' nodes.

    :return: Pipeline with the 'scaling' node as the root
    """
    logit_a = PrimaryNode(operation_type='logit')
    logit_b = PrimaryNode(operation_type='logit')
    scaling_root = SecondaryNode(operation_type='scaling',
                                 nodes_from=[logit_a, logit_b])

    return Pipeline(scaling_root)
def get_complex_class_pipeline():
    """
    Build a pipeline where 'xgboost' and 'pca' nodes feed a 'logit' root.

    :return: Pipeline with the 'logit' node as the root
    """
    xgboost_node = PrimaryNode(operation_type='xgboost')
    pca_node = PrimaryNode(operation_type='pca')
    logit_root = SecondaryNode(operation_type='logit',
                               nodes_from=[xgboost_node, pca_node])

    return Pipeline(logit_root)
def get_ts_pipeline(window_size):
    """
    Return a 'lagged' -> 'ridge' pipeline.

    :param window_size: value stored as the 'window_size' custom parameter of the lagged node
    :return: Pipeline with the 'ridge' node as the root
    """
    lagged = PrimaryNode('lagged')
    lagged.custom_params = {'window_size': window_size}

    ridge_root = SecondaryNode('ridge', nodes_from=[lagged])
    return Pipeline(ridge_root)
def get_simple_ts_pipeline(model_root: str = 'ridge', window_size: int = 20):
    """
    Return a two-node pipeline: 'lagged' -> <model_root>.

    :param model_root: operation name used for the root node
    :param window_size: value stored as the 'window_size' custom parameter of the lagged node
    :return: Pipeline with the model_root node as the root
    """
    lagged = PrimaryNode('lagged')
    lagged.custom_params = {'window_size': window_size}

    root = SecondaryNode(model_root, nodes_from=[lagged])
    return Pipeline(root)
def test_distance_to_primary_level():
    """The root of a knn/knn -> lda -> logit hierarchy must report a distance of 2."""
    knn_a = PrimaryNode('knn')
    knn_b = PrimaryNode('knn')
    lda_node = SecondaryNode('lda', nodes_from=[knn_a, knn_b])
    logit_root = SecondaryNode('logit', nodes_from=[lda_node])

    assert logit_root.distance_to_primary_level == 2
def pipeline_with_incorrect_data_flow():
    """
    Build a pipeline in which two 'scaling' nodes feed a 'ridge' root.

    When combining the features in the presented pipeline,
    a table with 5 columns will turn into a table with 10 columns.

    :return: Pipeline with the 'ridge' node as the root
    """
    scaling_a = PrimaryNode(operation_type='scaling')
    scaling_b = PrimaryNode(operation_type='scaling')
    ridge_root = SecondaryNode(operation_type='ridge',
                               nodes_from=[scaling_a, scaling_b])

    return Pipeline(ridge_root)
def get_simple_short_lagged_pipeline():
    """
    Return a simple forecasting pipeline: 'lagged' (window_size=4) -> 'linear'.

    :return: Pipeline with the 'linear' node as the root
    """
    lagged = PrimaryNode('lagged')
    # Only 4 elements of the time series are used as predictors
    lagged.custom_params = {'window_size': 4}

    linear_root = SecondaryNode('linear', nodes_from=[lagged])
    return Pipeline(linear_root)
def roll_pipeline_structure(
        self,
        operation_object: ['OperationTemplate', 'AtomizedModelTemplate'],
        visited_nodes: dict,
        path: str = None):
    """
    Recursively convert an operation template and its parents into pipeline nodes.

    The function traverses the templates referenced through ``nodes_from``
    and connects the created nodes into a pipeline structure.

    :param operation_object: OperationTemplate or AtomizedModelTemplate to convert
    :param visited_nodes: dict mapping operation_id to the node already built for it;
        it is filled in by this call
    :param path: directory against which ``fitted_operation_path`` is resolved;
        when None, fitted operations are not loaded

    :return: node built for ``operation_object`` (root of the rolled sub-structure)
    :raises FileNotFoundError: if the fitted operation file does not exist
    """
    # NOTE(review): the source formatting of this block was lost, so the nesting
    # below is reconstructed -- confirm that params/rating are meant to be set
    # only for non-atomized operations.

    # Reuse the node if this template has already been converted
    if operation_object.operation_id in visited_nodes:
        return visited_nodes[operation_object.operation_id]

    if operation_object.operation_type == atomized_model_type():
        # For an atomized model the nested pipeline template is used as the operation type
        atomized_model = operation_object.next_pipeline_template
        if operation_object.nodes_from:
            node = SecondaryNode(operation_type=atomized_model)
        else:
            node = PrimaryNode(operation_type=atomized_model)
    else:
        # A template with parents becomes a secondary node, otherwise a primary one
        if operation_object.nodes_from:
            node = SecondaryNode(operation_object.operation_type)
        else:
            node = PrimaryNode(operation_object.operation_type)
        node.operation.params = operation_object.params
        node.rating = operation_object.rating

    # Load the fitted operation only when the template stores a path to it
    # and a base directory was supplied
    if hasattr(
            operation_object, 'fitted_operation_path'
    ) and operation_object.fitted_operation_path and path is not None:
        path_to_operation = os.path.join(
            path, operation_object.fitted_operation_path)
        if not os.path.isfile(path_to_operation):
            message = f"Fitted operation on the path: {path_to_operation} does not exist."
            self.log.error(message)
            raise FileNotFoundError(message)

        fitted_operation = joblib.load(path_to_operation)
        # The loaded object is stored both on the template and on the new node
        operation_object.fitted_operation = fitted_operation
        node.fitted_operation = fitted_operation

    # Select parent templates by id; the resulting order follows self.operation_templates
    nodes_from = [
        operation_template for operation_template in self.operation_templates
        if operation_template.operation_id in operation_object.nodes_from
    ]
    node.nodes_from = [
        self.roll_pipeline_structure(node_from, visited_nodes, path)
        for node_from in nodes_from
    ]

    # The node is registered only after all of its parents have been built
    visited_nodes[operation_object.operation_id] = node
    return node
def test_ordered_subnodes_hierarchy():
    """The ordered hierarchy of a knn/knn -> lda -> logit structure must contain 4 nodes."""
    knn_a = PrimaryNode('knn')
    knn_b = PrimaryNode('knn')
    lda_node = SecondaryNode('lda', nodes_from=[knn_a, knn_b])
    logit_root = SecondaryNode('logit', nodes_from=[lda_node])

    hierarchy = logit_root.ordered_subnodes_hierarchy()

    assert len(hierarchy) == 4
def get_stlarima_pipeline():
    """
    Return a single-node pipeline consisting of an 'stl_arima' operation.

    :return: Pipeline whose only node is 'stl_arima' with period=80, p=2, d=1, q=0
    """
    stl_arima = PrimaryNode('stl_arima')
    stl_arima.custom_params = dict(period=80, p=2, d=1, q=0)

    return Pipeline(stl_arima)
def get_pipeline():
    """
    Build a pipeline where 'knn', 'qda' and 'xgboost' nodes feed an 'xgboost' root.

    :return: Pipeline with the secondary 'xgboost' node as the root
    """
    knn = PrimaryNode('knn')
    qda = PrimaryNode('qda')
    xgboost = PrimaryNode('xgboost')

    xgboost_root = SecondaryNode('xgboost', nodes_from=[knn, qda, xgboost])
    return Pipeline(xgboost_root)
def get_simple_pipeline(log):
    """
    Build an 'xgboost' + 'knn' -> 'logit' pipeline with a shared log object.

    :param log: log object passed to every node and to the pipeline
    :return: Pipeline with the 'logit' node as the root
    """
    xgboost_node = PrimaryNode(operation_type='xgboost', log=log)
    knn_node = PrimaryNode(operation_type='knn', log=log)
    logit_root = SecondaryNode(operation_type='logit',
                               nodes_from=[xgboost_node, knn_node],
                               log=log)

    # if you do not pass the log object, Pipeline will create default log.log file placed in core
    return Pipeline(logit_root, log=log)
def get_non_refinement_pipeline(lagged):
    """
    Create a pipeline without a decompose operation:
    'lagged' -> ('lasso', 'dtreg') -> 'ridge'.

    :param lagged: value stored as the 'window_size' custom parameter of the lagged node
    :return: Pipeline with the 'ridge' node as the root
    """
    # A single lagged node is shared by both models of the next level
    lagged_node = PrimaryNode('lagged')
    lagged_node.custom_params = {'window_size': lagged}

    lasso_node = SecondaryNode('lasso', nodes_from=[lagged_node])
    dtreg_node = SecondaryNode('dtreg', nodes_from=[lagged_node])
    dtreg_node.custom_params = {'max_depth': 3}

    ridge_root = SecondaryNode('ridge', nodes_from=[lasso_node, dtreg_node])
    return Pipeline(ridge_root)
def test_pipeline_repr():
    """repr() of a pipeline must describe its depth, length and nodes."""
    parents = [PrimaryNode(operation_type=name)
               for name in ('logit', 'lda', 'knn')]
    xgboost_root = SecondaryNode(operation_type='xgboost', nodes_from=parents)

    pipeline = Pipeline()
    pipeline.add_node(xgboost_root)

    expected = "{'depth': 2, 'length': 4, 'nodes': [xgboost, logit, lda, knn]}"

    assert repr(pipeline) == expected
def get_arima_nemo_pipeline():
    r"""
    Return a pipeline with the following structure

    arima \
           ridge
    nemo  /

    "nemo" in the diagram is the 'exog_ts_data_source' node.

    :return: Pipeline with the 'ridge' node as the root
    """
    # The docstring is a raw string so the backslash in the diagram is kept
    # literally instead of being interpreted as an escape sequence.
    node_arima = PrimaryNode('arima')
    node_nemo = PrimaryNode('exog_ts_data_source')

    node_final = SecondaryNode('ridge', nodes_from=[node_arima, node_nemo])
    pipeline = Pipeline(node_final)

    return pipeline
def create_classification_pipeline_with_preprocessing():
    """
    Build a pipeline with two branches joined by a 'knn' root:
    'scaling' -> 'xgboost' and 'rfe_lin_class' -> 'logit'.

    :return: Pipeline with the 'knn' node as the root
    """
    scaling = PrimaryNode('scaling')
    rfe = PrimaryNode('rfe_lin_class')

    xgboost_branch = SecondaryNode('xgboost', nodes_from=[scaling])
    logit_branch = SecondaryNode('logit', nodes_from=[rfe])

    knn_root = SecondaryNode('knn', nodes_from=[xgboost_branch, logit_branch])
    return Pipeline(knn_root)
def get_pipeline():
    """
    Build a pipeline: ('lda' -> 'qda', 'qda') -> 'knn' -> 'logit'.

    :return: Pipeline with the 'logit' node as the root
    """
    lda_leaf = PrimaryNode('lda')
    qda_over_lda = SecondaryNode('qda', nodes_from=[lda_leaf])
    qda_leaf = PrimaryNode('qda')

    knn_node = SecondaryNode('knn', nodes_from=[qda_over_lda, qda_leaf])
    logit_root = SecondaryNode(operation_type='logit', nodes_from=[knn_node])

    return Pipeline(logit_root)
def get_simple_pipeline():
    r"""
    Return a simple pipeline with the following structure:

    xgboost \
             -> logit
      knn   /

    :return: Pipeline with the 'logit' node as the root
    """
    # The docstring is a raw string so the backslash in the diagram is kept
    # literally instead of being interpreted as an escape sequence.
    first = PrimaryNode(operation_type='xgboost')
    second = PrimaryNode(operation_type='knn')
    final = SecondaryNode(operation_type='logit', nodes_from=[first, second])

    pipeline = Pipeline(final)

    return pipeline
def test_delete_node_with_redirection():
    """Deleting an intermediate node must reconnect its parents to the root."""
    logit_node = PrimaryNode(operation_type='logit')
    lda_node = PrimaryNode(operation_type='lda')
    knn_middle = SecondaryNode(operation_type='knn',
                               nodes_from=[logit_node, lda_node])
    xgboost_root = SecondaryNode(operation_type='xgboost',
                                 nodes_from=[knn_middle])

    pipeline = Pipeline()
    pipeline.add_node(xgboost_root)

    pipeline.delete_node(knn_middle)

    assert len(pipeline.nodes) == 3
    assert logit_node in pipeline.root_node.nodes_from
def get_three_depth_manual_regr_pipeline():
    """
    Build a regression pipeline with two branches joined by a 'knnreg' root:
    'xgbreg' -> 'dtreg' and 'knnreg' -> 'rfr'.

    :return: Pipeline with the secondary 'knnreg' node as the root
    """
    xgbreg_leaf = PrimaryNode('xgbreg')
    knnreg_leaf = PrimaryNode('knnreg')

    dtreg_branch = SecondaryNode('dtreg', nodes_from=[xgbreg_leaf])
    rfr_branch = SecondaryNode('rfr', nodes_from=[knnreg_leaf])

    knnreg_root = SecondaryNode('knnreg',
                                nodes_from=[dtreg_branch, rfr_branch])
    return Pipeline(knnreg_root)