def get_model(train_file_path: str, cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)

    # the search of the models provided by the framework
    # that can be used as nodes in a chain for the selected task
    models_repo = ModelTypesRepository()
    available_model_types, _ = models_repo.suitable_model(task_type=task.task_type, tags=['simple'])

    metric_function = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types, max_lead_time=cur_lead_time)

    # Create the genetic programming-based composer that allows finding
    # the optimal structure of the composite model
    builder = GPComposerBuilder(task).with_requirements(composer_requirements).with_metrics(metric_function)
    composer = builder.build()

    # run the search for the best suitable model
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose, is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose)

    return chain_evo_composed
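# A minimal usage sketch for get_model (not part of the original source): the CSV paths
# below are hypothetical, and the imports are assumed to match the FEDOT version used above.
from datetime import timedelta

train_csv = 'path/to/train_data.csv'  # hypothetical path
chain = get_model(train_csv, cur_lead_time=timedelta(minutes=2))

# the composed chain can then be applied to unseen data loaded the same way
new_data = InputData.from_csv('path/to/test_data.csv',  # hypothetical path
                              task=Task(TaskTypesEnum.classification))
prediction = chain.predict(new_data)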
def test_parameter_free_composer_build_chain_correct(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    dataset_to_validate = data
    available_model_types, _ = ModelTypesRepository().suitable_model(task_type=TaskTypesEnum.classification)
    metric_function = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC)
    req = GPComposerRequirements(primary=available_model_types, secondary=available_model_types,
                                 max_arity=2, max_depth=2, pop_size=2, num_of_generations=1,
                                 crossover_prob=0.4, mutation_prob=0.5)
    opt_params = GPChainOptimiserParameters(genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free)
    builder = GPComposerBuilder(task=Task(TaskTypesEnum.classification)).with_requirements(req). \
        with_metrics(metric_function).with_optimiser_parameters(opt_params)
    gp_composer = builder.build()
    chain_gp_composed = gp_composer.compose_chain(data=dataset_to_compose)

    chain_gp_composed.fit_from_scratch(input_data=dataset_to_compose)
    predicted_gp_composed = chain_gp_composed.predict(dataset_to_validate)
    roc_on_valid_gp_composed = roc_auc(y_true=dataset_to_validate.target,
                                       y_score=predicted_gp_composed.predict)

    assert roc_on_valid_gp_composed > 0.6
def test_composer_cv_correct():
    """ Checks if the composer works correctly when using cross validation for time series """
    folds = 2
    _, forecast_len, validation_blocks, time_series = configure_experiment()
    primary_operations, secondary_operations = get_available_operations()

    # Composer parameters
    composer_requirements = GPComposerRequirements(
        primary=primary_operations, secondary=secondary_operations,
        max_arity=3, max_depth=3, pop_size=2, num_of_generations=2,
        crossover_prob=0.8, mutation_prob=0.8,
        timeout=datetime.timedelta(seconds=5),
        cv_folds=folds, validation_blocks=validation_blocks)

    init_pipeline = get_simple_ts_pipeline()
    metric_function = MetricsRepository().metric_by_id(RegressionMetricsEnum.RMSE)
    builder = GPComposerBuilder(task=time_series.task). \
        with_requirements(composer_requirements). \
        with_metrics(metric_function).with_initial_pipeline(init_pipeline)
    composer = builder.build()

    obtained_pipeline = composer.compose_pipeline(data=time_series, is_visualise=False)
    assert isinstance(obtained_pipeline, Pipeline)
def test_composer_with_cv_optimization_correct():
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose, dataset_to_validate = get_data(task)

    models_repo = OperationTypesRepository()
    available_model_types, _ = models_repo.suitable_operation(task_type=task.task_type, tags=['simple'])

    metric_function = [ClassificationMetricsEnum.ROCAUC_penalty,
                       ClassificationMetricsEnum.accuracy,
                       ClassificationMetricsEnum.logloss]

    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        timeout=timedelta(minutes=1), num_of_generations=2, cv_folds=3)

    builder = GPComposerBuilder(task).with_requirements(composer_requirements).with_metrics(metric_function)
    composer = builder.build()

    pipeline_evo_composed = composer.compose_pipeline(data=dataset_to_compose, is_visualise=False)[0]
    assert isinstance(pipeline_evo_composed, Pipeline)

    pipeline_evo_composed.fit(input_data=dataset_to_compose)
    predicted = pipeline_evo_composed.predict(dataset_to_validate)
    roc_on_valid_evo_composed = roc_auc(y_score=predicted.predict, y_true=dataset_to_validate.target)

    assert roc_on_valid_evo_composed > 0
def get_composed_chain(dataset_to_compose, task, metric_function):
    # the search of the models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types = get_operations_for_task(task=task, mode='models')

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_arity=3, max_depth=3, pop_size=20, num_of_generations=20,
        crossover_prob=0.8, mutation_prob=0.8, allow_single_operations=False)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPChainOptimiserParameters(genetic_scheme_type=scheme_type)

    # Create builder for composer and set composer params
    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function).with_optimiser_parameters(optimiser_parameters)

    # Create GP-based composer
    composer = builder.build()

    # the optimal chain generation by composition - the most time-consuming task
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose, is_visualise=True)

    return chain_evo_composed
def test_gp_composer_build_pipeline_correct(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    dataset_to_validate = data
    task = Task(TaskTypesEnum.classification)
    available_model_types, _ = OperationTypesRepository().suitable_operation(task_type=task.task_type)

    metric_function = ClassificationMetricsEnum.ROCAUC

    req = GPComposerRequirements(primary=available_model_types, secondary=available_model_types,
                                 max_arity=2, max_depth=2, pop_size=2, num_of_generations=1,
                                 crossover_prob=0.4, mutation_prob=0.5)

    builder = GPComposerBuilder(task).with_requirements(req).with_metrics(metric_function)
    gp_composer = builder.build()
    pipeline_gp_composed = gp_composer.compose_pipeline(data=dataset_to_compose)

    pipeline_gp_composed.fit_from_scratch(input_data=dataset_to_compose)
    predicted_gp_composed = pipeline_gp_composed.predict(dataset_to_validate)
    roc_on_valid_gp_composed = roc_auc(y_true=dataset_to_validate.target,
                                       y_score=predicted_gp_composed.predict)

    assert roc_on_valid_gp_composed > 0.6
def test_gp_composer_with_start_depth(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    available_model_types = ['xgboost', 'knn']
    quality_metric = ClassificationMetricsEnum.ROCAUC
    req = GPComposerRequirements(primary=available_model_types, secondary=available_model_types,
                                 max_arity=2, max_depth=5, pop_size=5, num_of_generations=1,
                                 crossover_prob=0.4, mutation_prob=0.5, start_depth=2)
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPGraphOptimiserParameters(genetic_scheme_type=scheme_type)
    builder = GPComposerBuilder(task=Task(TaskTypesEnum.classification)).with_requirements(req). \
        with_metrics(quality_metric).with_optimiser_parameters(optimiser_parameters)
    composer = builder.build()
    composer.compose_pipeline(data=dataset_to_compose, is_visualise=True)

    assert all([ind.graph.depth <= 3 for ind in composer.history.individuals[0]])
    assert composer.optimiser.max_depth == 5
def test_gp_composer_builder():
    task = Task(TaskTypesEnum.classification)

    available_model_types, _ = ModelTypesRepository().suitable_model(task_type=task.task_type)

    metric_function = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_arity=3, max_depth=3, pop_size=5, num_of_generations=4,
        crossover_prob=0.8, mutation_prob=1, max_lead_time=datetime.timedelta(minutes=5))

    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPChainOptimiserParameters(genetic_scheme_type=scheme_type)

    builder_with_custom_params = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function).with_optimiser_parameters(optimiser_parameters)

    composer_with_custom_params = builder_with_custom_params.build()

    assert composer_with_custom_params.optimiser.parameters.genetic_scheme_type == scheme_type
    assert composer_with_custom_params.metrics == metric_function
    assert composer_with_custom_params.composer_requirements.pop_size == 5
    assert composer_with_custom_params.composer_requirements.mutation_prob == 1

    builder_with_default_params = GPComposerBuilder(task=task)
    composer_with_default_params = builder_with_default_params.build()

    default_metric = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    assert composer_with_default_params.optimiser.parameters.genetic_scheme_type == GeneticSchemeTypesEnum.generational
    assert composer_with_default_params.metrics == default_metric
    assert composer_with_default_params.composer_requirements.pop_size == 20
    assert composer_with_default_params.composer_requirements.mutation_prob == 0.8
def test_evaluate_individuals():
    project_root_path = str(fedot_project_root())
    file_path_train = os.path.join(project_root_path, 'test/data/simple_classification.csv')
    full_path_train = os.path.join(str(fedot_project_root()), file_path_train)

    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(full_path_train, task=task)
    available_model_types, _ = OperationTypesRepository().suitable_operation(task_type=task.task_type)

    metric_function = ClassificationMetricsEnum.ROCAUC_penalty
    composer_requirements = GPComposerRequirements(primary=available_model_types,
                                                   secondary=available_model_types)

    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function)
    composer = builder.build()

    pipelines_to_evaluate = [pipeline_first(), pipeline_second(), pipeline_third(), pipeline_fourth()]

    train_data, test_data = train_test_data_setup(
        dataset_to_compose, sample_split_ratio_for_tasks[dataset_to_compose.task.task_type])
    metric_function_for_nodes = partial(composer.composer_metric, composer.metrics, train_data, test_data)
    adapter = PipelineAdapter()
    population = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
    timeout = datetime.timedelta(minutes=0.001)
    params = GraphGenerationParams(adapter=PipelineAdapter(), advisor=PipelineChangeAdvisor())
    with OptimisationTimer(timeout=timeout) as t:
        evaluate_individuals(individuals_set=population, objective_function=metric_function_for_nodes,
                             graph_generation_params=params, is_multi_objective=False, timer=t)
    assert len(population) == 1
    assert population[0].fitness is not None

    population = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
    timeout = datetime.timedelta(minutes=5)
    with OptimisationTimer(timeout=timeout) as t:
        evaluate_individuals(individuals_set=population, objective_function=metric_function_for_nodes,
                             graph_generation_params=params, is_multi_objective=False, timer=t)
    assert len(population) == 4
    assert all([ind.fitness is not None for ind in population])
def test_composition_time(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    task = Task(TaskTypesEnum.classification)
    models_impl = ['mlp', 'knn']
    metric_function = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC)

    req_terminated_evolution = GPComposerRequirements(
        primary=models_impl, secondary=models_impl,
        max_arity=2, max_depth=2, pop_size=2, num_of_generations=5,
        crossover_prob=0.9, mutation_prob=0.9,
        max_lead_time=datetime.timedelta(minutes=0.000001))

    builder = GPComposerBuilder(task).with_requirements(req_terminated_evolution).with_metrics(metric_function)
    gp_composer_terminated_evolution = builder.build()

    _ = gp_composer_terminated_evolution.compose_chain(data=data)

    req_completed_evolution = GPComposerRequirements(
        primary=models_impl, secondary=models_impl,
        max_arity=2, max_depth=2, pop_size=2, num_of_generations=2,
        crossover_prob=0.4, mutation_prob=0.5)

    builder = GPComposerBuilder(task).with_requirements(req_completed_evolution).with_metrics(metric_function)
    gp_composer_completed_evolution = builder.build()

    _ = gp_composer_completed_evolution.compose_chain(data=data)

    assert len(gp_composer_terminated_evolution.history.chains) == \
        len(gp_composer_completed_evolution.history.chains)
def test_gp_composer_builder_default_params_correct():
    task = Task(TaskTypesEnum.regression)
    builder = GPComposerBuilder(task=task)

    # Initialise default parameters
    builder.set_default_composer_params()
    composer_with_default_params = builder.build()

    # Get default available operations for regression task
    primary_operations = composer_with_default_params.composer_requirements.primary

    # Data operations and models must be in this default primary operations list
    assert 'ridge' in primary_operations
    assert 'scaling' in primary_operations
def test_evaluate_individuals():
    project_root_path = str(project_root())
    file_path_train = os.path.join(project_root_path, 'test/data/simple_classification.csv')
    full_path_train = os.path.join(str(project_root()), file_path_train)

    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(full_path_train, task=task)
    available_model_types, _ = OperationTypesRepository().suitable_operation(task_type=task.task_type)

    metric_function = ClassificationMetricsEnum.ROCAUC_penalty
    composer_requirements = GPComposerRequirements(primary=available_model_types,
                                                   secondary=available_model_types)

    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function)
    composer = builder.build()

    train_data, test_data = train_test_data_setup(
        dataset_to_compose, sample_split_ration_for_tasks[dataset_to_compose.task.task_type])
    metric_function_for_nodes = partial(composer.composer_metric, composer.metrics, train_data, test_data)

    population = [chain_first(), chain_second(), chain_third(), chain_fourth()]
    max_lead_time = datetime.timedelta(minutes=0.001)
    with CompositionTimer(max_lead_time=max_lead_time) as t:
        evaluate_individuals(individuals_set=population, objective_function=metric_function_for_nodes,
                             is_multi_objective=False, timer=t)
    assert len(population) == 1
    assert population[0].fitness is not None

    population = [chain_first(), chain_second(), chain_third(), chain_fourth()]
    max_lead_time = datetime.timedelta(minutes=5)
    with CompositionTimer(max_lead_time=max_lead_time) as t:
        evaluate_individuals(individuals_set=population, objective_function=metric_function_for_nodes,
                             is_multi_objective=False, timer=t)
    assert len(population) == 4
    assert all([ind.fitness is not None for ind in population])
def test_cv_ts_and_cluster_raise():
    task = Task(task_type=TaskTypesEnum.clustering)
    dataset_to_compose, dataset_to_validate = get_data(task)
    metric_function = ClusteringMetricsEnum.silhouette

    operations_repo = OperationTypesRepository()
    available_model_types, _ = operations_repo.suitable_operation(task_type=task.task_type)
    composer_requirements = GPComposerRequirements(primary=available_model_types,
                                                   secondary=available_model_types,
                                                   cv_folds=4)
    builder = GPComposerBuilder(task).with_requirements(composer_requirements).with_metrics(metric_function)
    composer = builder.build()

    with pytest.raises(NotImplementedError):
        composer.compose_pipeline(data=dataset_to_compose, is_visualise=False)
def _get_gp_composer_builder(task: Task, metric_function,
                             composer_requirements: GPComposerRequirements,
                             optimizer_parameters: GPChainOptimiserParameters,
                             logger: Log):
    """ Return a GPComposerBuilder with the given parameters and, if necessary, an initial chain set in it """

    builder = GPComposerBuilder(task=task). \
        with_requirements(composer_requirements). \
        with_optimiser_parameters(optimizer_parameters). \
        with_metrics(metric_function).with_logger(logger)

    init_chain = _obtain_initial_assumption(task)

    if init_chain is not None:
        builder = builder.with_initial_chain(init_chain)

    return builder
def test_multi_objective_composer(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    dataset_to_validate = data
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=TaskTypesEnum.classification)
    quality_metric = ClassificationMetricsEnum.ROCAUC
    complexity_metric = ComplexityMetricsEnum.node_num
    metrics = [quality_metric, complexity_metric]
    req = GPComposerRequirements(primary=available_model_types, secondary=available_model_types,
                                 max_arity=2, max_depth=2, pop_size=2, num_of_generations=1,
                                 crossover_prob=0.4, mutation_prob=0.5)
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPGraphOptimiserParameters(genetic_scheme_type=scheme_type,
                                                      selection_types=[SelectionTypesEnum.nsga2])
    builder = GPComposerBuilder(task=Task(TaskTypesEnum.classification)).with_requirements(req). \
        with_metrics(metrics).with_optimiser_parameters(optimiser_parameters)
    composer = builder.build()
    pipelines_evo_composed = composer.compose_pipeline(data=dataset_to_compose)
    pipelines_roc_auc = []
    for pipeline_evo_composed in pipelines_evo_composed:
        pipeline_evo_composed.fit_from_scratch(input_data=dataset_to_compose)
        predicted_gp_composed = pipeline_evo_composed.predict(dataset_to_validate)
        roc_on_valid_gp_composed = roc_auc(y_true=dataset_to_validate.target,
                                           y_score=predicted_gp_composed.predict)
        pipelines_roc_auc.append(roc_on_valid_gp_composed)

    assert type(composer.metrics) is list and len(composer.metrics) > 1
    assert type(pipelines_evo_composed) is list
    assert composer.optimiser.parameters.multi_objective
    assert all([roc_auc > 0.6 for roc_auc in pipelines_roc_auc])
def _get_gp_composer_builder(task: Task, metric_function,
                             composer_requirements: GPComposerRequirements,
                             optimizer_parameters: GPGraphOptimiserParameters,
                             data: Union[InputData, MultiModalData],
                             initial_pipeline: Pipeline,
                             logger: Log):
    """ Return a GPComposerBuilder with the given parameters and, if necessary, an initial pipeline set in it """

    builder = GPComposerBuilder(task=task). \
        with_requirements(composer_requirements). \
        with_optimiser_parameters(optimizer_parameters). \
        with_metrics(metric_function).with_logger(logger)

    init_pipeline = _obtain_initial_assumption(task, data) if not initial_pipeline else initial_pipeline

    if init_pipeline is not None:
        builder = builder.with_initial_pipeline(init_pipeline)

    return builder
def test_parameter_free_composer_build_pipeline_correct(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    dataset_to_validate = data
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=TaskTypesEnum.classification)
    metric_function = ClassificationMetricsEnum.ROCAUC
    req = GPComposerRequirements(primary=available_model_types, secondary=available_model_types,
                                 max_arity=2, max_depth=2, pop_size=2, num_of_generations=4,
                                 crossover_prob=0.4, mutation_prob=0.5)
    opt_params = GPGraphOptimiserParameters(genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free)
    builder = GPComposerBuilder(task=Task(TaskTypesEnum.classification)).with_requirements(req). \
        with_metrics(metric_function).with_optimiser_parameters(opt_params)
    gp_composer = builder.build()
    pipeline_gp_composed = gp_composer.compose_pipeline(data=dataset_to_compose)

    pipeline_gp_composed.fit_from_scratch(input_data=dataset_to_compose)
    predicted_gp_composed = pipeline_gp_composed.predict(dataset_to_validate)
    roc_on_valid_gp_composed = roc_auc(y_true=dataset_to_validate.target,
                                       y_score=predicted_gp_composed.predict)

    population_len = sum([len(history) for history in gp_composer.history.individuals]) / \
        len(gp_composer.history.individuals)
    assert population_len != len(gp_composer.history.individuals[0])
    assert roc_on_valid_gp_composed > 0.6
def test_gp_composer_saving_info_from_process(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    available_model_types = ['xgboost', 'knn']
    quality_metric = ClassificationMetricsEnum.ROCAUC
    req = GPComposerRequirements(primary=available_model_types, secondary=available_model_types,
                                 max_arity=2, max_depth=2, pop_size=2, num_of_generations=1,
                                 crossover_prob=0.4, mutation_prob=0.5, start_depth=2,
                                 max_chain_fit_time=datetime.timedelta(minutes=5),
                                 allow_single_operations=False)
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPChainOptimiserParameters(genetic_scheme_type=scheme_type)
    builder = GPComposerBuilder(task=Task(TaskTypesEnum.classification)).with_requirements(req). \
        with_metrics(quality_metric).with_optimiser_parameters(optimiser_parameters).with_cache()
    composer = builder.build()
    train_data, test_data = train_test_data_setup(
        data, sample_split_ration_for_tasks[data.task.task_type])

    composer.compose_chain(data=dataset_to_compose, is_visualise=True)

    with shelve.open(composer.cache.db_path) as cache:
        global_cache_len_before = len(cache.dict)

    new_chain = chain_first()
    composer.composer_metric([quality_metric], dataset_to_compose, test_data, new_chain)

    with shelve.open(composer.cache.db_path) as cache:
        global_cache_len_after = len(cache.dict)

    assert global_cache_len_before < global_cache_len_after
    assert new_chain.computation_time is not None
    assert new_chain.fitted_on_data is not None
def run_multi_modal_case(files_path, is_visualise=False, timeout=datetime.timedelta(minutes=2)):
    task = Task(TaskTypesEnum.classification)
    images_size = (128, 128)

    train_num, test_num, train_img, test_img, train_text, test_text = prepare_multi_modal_data(
        files_path, task, images_size)

    pipeline, fit_data, predict_data = generate_initial_pipeline_and_data(
        images_size, train_num, test_num, train_img, test_img, train_text, test_text)

    # the search of the models provided by the framework that can be used as nodes in a pipeline for the selected task
    available_model_types = get_operations_for_task(task=task, mode='model')

    # the choice of the metric for the pipeline quality assessment during composition
    metric_function = ClassificationMetricsEnum.ROCAUC_penalty

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_arity=3, max_depth=3, pop_size=5, num_of_generations=5,
        crossover_prob=0.8, mutation_prob=0.8, timeout=timeout)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.parameter_free
    optimiser_parameters = GPGraphOptimiserParameters(genetic_scheme_type=scheme_type)

    # Create builder for composer and set composer params
    logger = default_log('FEDOT logger', verbose_level=4)

    # the multimodal template (with data sources) is passed as the initial assumption for the composer
    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function).with_optimiser_parameters(optimiser_parameters).with_logger(logger=logger). \
        with_initial_pipeline(pipeline).with_cache('multi_modal_opt.cache')

    # Create GP-based composer
    composer = builder.build()

    # the optimal pipeline generation by composition - the most time-consuming task
    pipeline_evo_composed = composer.compose_pipeline(data=fit_data, is_visualise=True)
    pipeline_evo_composed.fit(input_data=fit_data)

    if is_visualise:
        pipeline_evo_composed.show()

    prediction = pipeline_evo_composed.predict(predict_data)
    err = calculate_validation_metric(prediction, test_num)

    print(f'ROC AUC for validation sample is {err}')

    return err
def run_credit_scoring_problem(train_file_path, test_file_path,
                               max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
                               is_visualise=False,
                               with_tuning=False):
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # the search of the models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types, _ = ModelTypesRepository().suitable_model(task_type=task.task_type)

    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_arity=3, max_depth=3, pop_size=20, num_of_generations=20,
        crossover_prob=0.8, mutation_prob=0.8, max_lead_time=max_lead_time)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPChainOptimiserParameters(genetic_scheme_type=scheme_type)

    # Create builder for composer and set composer params
    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function).with_optimiser_parameters(optimiser_parameters)

    # Create GP-based composer
    composer = builder.build()

    # the optimal chain generation by composition - the most time-consuming task
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose, is_visualise=True)

    if with_tuning:
        chain_evo_composed.fine_tune_primary_nodes(input_data=dataset_to_compose, iterations=50, verbose=True)

    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True)

    if is_visualise:
        visualiser = ChainVisualiser()

        composer.log.info('History visualization started')
        visualiser.visualise_history(composer.history)
        composer.log.info('History visualization finished')
        composer.history.write_composer_history_to_csv()

        composer.log.info('Best chain visualization started')
        visualiser.visualise(chain_evo_composed)
        composer.log.info('Best chain visualization finished')

    # the quality assessment for the obtained composite models
    roc_on_valid_evo_composed = calculate_validation_metric(chain_evo_composed, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed
def run_credit_scoring_problem(train_file_path, test_file_path,
                               max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
                               is_visualise=False):
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # the search of the models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types = get_operations_for_task(task=task, mode='models')

    # the choice of the metric for the chain quality assessment during composition
    quality_metric = ClassificationMetricsEnum.ROCAUC
    complexity_metric = ComplexityMetricsEnum.node_num
    metrics = [quality_metric, complexity_metric]

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_arity=3, max_depth=3, pop_size=20, num_of_generations=20,
        crossover_prob=0.8, mutation_prob=0.8, max_lead_time=max_lead_time,
        start_depth=2, allow_single_operations=False)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.parameter_free
    optimiser_parameters = GPChainOptimiserParameters(genetic_scheme_type=scheme_type,
                                                      selection_types=[SelectionTypesEnum.spea2])

    # Create builder for composer and set composer params
    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metrics).with_optimiser_parameters(optimiser_parameters)

    # Create GP-based composer
    composer = builder.build()

    # the optimal chain generation by composition - the most time-consuming task
    chains_evo_composed = composer.compose_chain(data=dataset_to_compose, is_visualise=True)

    composer.history.write_composer_history_to_csv()

    if is_visualise:
        results_visualization(composed_chains=chains_evo_composed, history=composer.history)

    chains_roc_auc = []
    for chain_num, chain_evo_composed in enumerate(chains_evo_composed):
        chain_evo_composed.fine_tune_primary_nodes(input_data=dataset_to_compose, iterations=50)
        chain_evo_composed.fit(input_data=dataset_to_compose)

        # the quality assessment for the obtained composite models
        roc_on_valid_evo_composed = calculate_validation_metric(chain_evo_composed, dataset_to_validate)

        chains_roc_auc.append(roc_on_valid_evo_composed)
        if len(chains_evo_composed) > 1:
            print(f'Composed ROC AUC of chain {chain_num + 1} is {round(roc_on_valid_evo_composed, 3)}')
        else:
            print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return max(chains_roc_auc)
def run_fedot(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task_type = params.task

    if task_type == TaskTypesEnum.classification:
        metric = ClassificationMetricsEnum.ROCAUC
    elif task_type == TaskTypesEnum.regression:
        metric = RegressionMetricsEnum.RMSE
    else:
        raise NotImplementedError()

    metric_func = MetricsRepository().metric_by_id(metric)

    task = Task(task_type)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    models_hyperparameters = get_models_hyperparameters()['FEDOT']
    cur_lead_time = models_hyperparameters['MAX_RUNTIME_MINS']

    saved_model_name = f'fedot_{case_label}_{task_type.name}_{cur_lead_time}_{metric.name}'
    loaded_model = load_fedot_model(saved_model_name)

    if not loaded_model:
        generations = models_hyperparameters['GENERATIONS']
        population_size = models_hyperparameters['POPULATION_SIZE']

        # the search of the models provided by the framework that can be used as nodes in a chain
        models_repo = ModelTypesRepository()
        available_model_types, _ = models_repo.suitable_model(task.task_type)

        heavy_models = ['svc', 'multinb', 'tfidf', 'qda']
        available_model_types = [model for model in available_model_types if model not in heavy_models]

        # the choice and initialisation of the GP search
        composer_requirements = GPComposerRequirements(
            primary=available_model_types, secondary=available_model_types,
            max_arity=3, max_depth=2, pop_size=population_size, num_of_generations=generations,
            crossover_prob=0.8, mutation_prob=0.8,
            max_lead_time=datetime.timedelta(minutes=cur_lead_time),
            add_single_model_chains=True)

        # Create GP-based composer
        builder = GPComposerBuilder(task).with_requirements(composer_requirements).with_metrics(metric_func)
        gp_composer = builder.build()
        chain_gp_composed = gp_composer.compose_chain(data=dataset_to_compose)

        chain_gp_composed.fit_from_scratch(input_data=dataset_to_compose)
        save_fedot_model(chain_gp_composed, saved_model_name)
    else:
        chain_gp_composed = loaded_model

    evo_predicted = chain_gp_composed.predict(dataset_to_validate)
    evo_predicted_labels = chain_gp_composed.predict(dataset_to_validate, output_mode='labels')

    return dataset_to_validate.target, evo_predicted.predict, evo_predicted_labels.predict
def make_forecast(df, len_forecast: int):
    """
    Function for making a time series forecast with a composed FEDOT chain

    :param df: dataframe to process
    :param len_forecast: forecast length

    :return predicted_values: forecast
    :return model_name: string description of the nodes in the obtained chain
    """

    time_series = np.array(df['value'])
    train_input, predict_input, task = prepare_input_data(len_forecast=len_forecast,
                                                          train_data_features=time_series,
                                                          train_data_target=time_series,
                                                          test_data_features=time_series)

    # Get chain with pre-defined structure
    init_chain = get_source_chain()

    # Init check
    preds = fit_predict_for_chain(chain=init_chain, train_input=train_input, predict_input=predict_input)

    # Get available_operations type
    primary_operations, secondary_operations = get_available_operations()

    # Composer parameters
    composer_requirements = GPComposerRequirements(
        primary=primary_operations, secondary=secondary_operations,
        max_arity=3, max_depth=7, pop_size=10, num_of_generations=10,
        crossover_prob=0.8, mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=5),
        allow_single_operations=False)

    mutation_types = [MutationTypesEnum.parameter_change, MutationTypesEnum.simple, MutationTypesEnum.reduce]
    optimiser_parameters = GPChainOptimiserParameters(mutation_types=mutation_types)

    metric_function = MetricsRepository().metric_by_id(RegressionMetricsEnum.MAE)
    builder = GPComposerBuilder(task=task). \
        with_optimiser_parameters(optimiser_parameters). \
        with_requirements(composer_requirements). \
        with_metrics(metric_function).with_initial_chain(init_chain)
    composer = builder.build()

    obtained_chain = composer.compose_chain(data=train_input, is_visualise=False)

    chain_tuner = ChainTuner(chain=obtained_chain, task=task, iterations=10)
    tuned_chain = chain_tuner.tune_chain(input_data=train_input,
                                         loss_function=mean_squared_error,
                                         loss_params={'squared': False})

    preds = fit_predict_for_chain(chain=tuned_chain, train_input=train_input, predict_input=predict_input)

    list_with_nodes = display_chain_info(obtained_chain)
    model_name = str(list_with_nodes)

    return preds, model_name
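# A minimal usage sketch for make_forecast (not part of the original file): the dataframe below
# is synthetic, and the helpers used inside make_forecast (prepare_input_data, get_source_chain,
# fit_predict_for_chain, get_available_operations, display_chain_info) are assumed to be
# importable from the same script.
import numpy as np
import pandas as pd

synthetic_df = pd.DataFrame({'value': np.sin(np.arange(300) / 10) + np.random.normal(0, 0.1, 300)})
forecast, model_name = make_forecast(synthetic_df, len_forecast=30)
print(model_name)
print(forecast[:5])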
def run_credit_scoring_problem(train_file_path, test_file_path,
                               max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
                               is_visualise=False,
                               with_tuning=False,
                               cache_path=None):
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # the search of the models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types = get_operations_for_task(task=task, mode='models')

    # the choice of the metric for the chain quality assessment during composition
    metric_function = ClassificationMetricsEnum.ROCAUC_penalty

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_arity=3, max_depth=3, pop_size=20, num_of_generations=20,
        crossover_prob=0.8, mutation_prob=0.8, max_lead_time=max_lead_time)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.parameter_free
    optimiser_parameters = GPChainOptimiserParameters(genetic_scheme_type=scheme_type)

    # Create builder for composer and set composer params
    logger = default_log('FEDOT logger', verbose_level=4)

    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function).with_optimiser_parameters(optimiser_parameters).with_logger(logger=logger)

    if cache_path:
        builder = builder.with_cache(cache_path)

    # Create GP-based composer
    composer = builder.build()

    # the optimal chain generation by composition - the most time-consuming task
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose, is_visualise=True)

    if with_tuning:
        # TODO Add tuning
        raise NotImplementedError('Tuning is not supported')

    chain_evo_composed.fit(input_data=dataset_to_compose)

    composer.history.write_composer_history_to_csv()

    if is_visualise:
        visualiser = ChainVisualiser()

        composer.log.debug('History visualization started')
        visualiser.visualise_history(composer.history)
        composer.log.debug('History visualization finished')

        composer.log.debug('Best chain visualization started')
        visualiser.visualise(chain_evo_composed)
        composer.log.debug('Best chain visualization finished')

    # the quality assessment for the obtained composite models
    roc_on_valid_evo_composed = calculate_validation_metric(chain_evo_composed, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed
def run_river_composer_experiment(file_path, init_pipeline, file_to_save, iterations=20, tuner=None):
    """ Function which launches the experiment for river level prediction.
    Both composing and tuning are available in this experiment.

    :param file_path: path to the file with river level data
    :param init_pipeline: pipeline to start the composing process from
    :param file_to_save: path to the file (with file name) to save the report
    :param iterations: amount of iterations to process
    :param tuner: tuner class to apply after composing (e.g. NodesTuner or PipelineTuner);
    if None, tuning is skipped
    """
    # Read dataframe and prepare train and test data
    data = InputData.from_csv(file_path, target_columns='level_station_2',
                              task=Task(TaskTypesEnum.regression),
                              columns_to_drop=['date'])
    train_input, predict_input = train_test_data_setup(data)
    y_data_test = np.array(predict_input.target)

    available_secondary_operations = ['ridge', 'lasso', 'dtreg', 'xgbreg', 'adareg', 'knnreg',
                                      'linear', 'svr', 'poly_features', 'scaling', 'ransac_lin_reg',
                                      'rfe_lin_reg', 'pca', 'ransac_non_lin_reg', 'rfe_non_lin_reg',
                                      'normalization']
    available_primary_operations = ['one_hot_encoding']

    # Report arrays
    obtained_pipelines = []
    depths = []
    maes = []
    for i in range(0, iterations):
        print(f'Iteration {i}\n')

        composer_requirements = GPComposerRequirements(
            primary=available_primary_operations, secondary=available_secondary_operations,
            max_arity=3, max_depth=8, pop_size=10, num_of_generations=5,
            crossover_prob=0.8, mutation_prob=0.8,
            timeout=datetime.timedelta(minutes=5))

        metric_function = MetricsRepository().metric_by_id(RegressionMetricsEnum.MAE)
        builder = GPComposerBuilder(task=data.task). \
            with_requirements(composer_requirements). \
            with_metrics(metric_function).with_initial_pipeline(init_pipeline)
        composer = builder.build()

        obtained_pipeline = composer.compose_pipeline(data=train_input, is_visualise=False)

        # Display info about obtained pipeline
        obtained_models, depth = get_pipeline_info(pipeline=obtained_pipeline)

        preds = fit_predict_for_pipeline(pipeline=obtained_pipeline,
                                         train_input=train_input,
                                         predict_input=predict_input)

        mse_value = mean_squared_error(y_data_test, preds, squared=False)
        mae_value = mean_absolute_error(y_data_test, preds)
        print(f'Obtained metrics for current iteration {i}:')
        print(f'RMSE - {mse_value:.2f}')
        print(f'MAE - {mae_value:.2f}\n')

        if tuner is not None:
            print('Start tuning process ...')
            pipeline_tuner = tuner(pipeline=obtained_pipeline, task=data.task, iterations=100)
            tuned_pipeline = pipeline_tuner.tune_pipeline(input_data=train_input,
                                                          loss_function=mean_absolute_error)

            preds_tuned = fit_predict_for_pipeline(pipeline=tuned_pipeline,
                                                   train_input=train_input,
                                                   predict_input=predict_input)

            mse_value = mean_squared_error(y_data_test, preds_tuned, squared=False)
            mae_value = mean_absolute_error(y_data_test, preds_tuned)

            print(f'Obtained metrics for current iteration {i} after tuning:')
            print(f'RMSE - {mse_value:.2f}')
            print(f'MAE - {mae_value:.2f}\n')

        obtained_pipelines.append(obtained_models)
        maes.append(mae_value)
        depths.append(depth)

    report = pd.DataFrame({'Pipeline': obtained_pipelines,
                           'Depth': depths,
                           'MAE': maes})
    report.to_csv(file_to_save, index=False)
def run(dataset, config):
    log.info("\n**** FEDOT ****\n")

    is_classification = config.type == 'classification'
    # Mapping of benchmark metrics to FEDOT metrics
    metrics_mapping = dict(acc='accuracy', auc='roc_auc', f1='f1', logloss='neg_log_loss',
                           mae='neg_mean_absolute_error', mse='neg_mean_squared_error',
                           msle='neg_mean_squared_log_error', r2='r2', rmse='neg_mean_squared_error')
    scoring_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None

    if scoring_metric is None:
        raise ValueError("Performance metric {} not supported.".format(config.metric))

    if is_classification:
        metric = ClassificationMetricsEnum.ROCAUC
        task_type = TaskTypesEnum.classification
    else:
        metric = RegressionMetricsEnum.RMSE
        task_type = TaskTypesEnum.regression

    task = Task(task_type)

    x_train = dataset.train.X_enc
    y_train = dataset.train.y_enc
    x_test = dataset.test.X_enc

    x_train, y_train = shuffle(dataset.train.X_enc, dataset.train.y_enc, random_state=0)

    if len(y_train.shape) > 1 and y_train.shape[1] == 1:
        y_train = np.squeeze(y_train, axis=1)

    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}

    dataset_to_compose = InputData(idx=[_ for _ in range(len(y_train))], features=x_train, target=y_train,
                                   task=task, data_type=DataTypesEnum.table)
    dataset_to_test = InputData(idx=[_ for _ in range(len(y_train))], features=x_test, target=None,
                                task=task, data_type=DataTypesEnum.table)

    # useful to disable multicore, regardless of the dataset config
    n_jobs = config.framework_params.get('_n_jobs', config.cores)

    log.info('Running FEDOT with a maximum time of %ss on %s cores, optimizing %s.',
             config.max_runtime_seconds, n_jobs, scoring_metric)
    runtime_min = (config.max_runtime_seconds / 60)

    available_model_types, _ = ModelTypesRepository().suitable_model(task_type=task.task_type)

    metric_function = MetricsRepository().metric_by_id(metric)

    # Create GP-based composer
    composer = GPComposer()

    if False:
        # the choice and initialisation of the GP search
        composer_requirements = GPComposerRequirements(
            primary=available_model_types, secondary=available_model_types,
            max_arity=3, max_depth=3,
            max_lead_time=datetime.timedelta(minutes=runtime_min * 0.8))

        # GP optimiser parameters choice
        scheme_type = GeneticSchemeTypesEnum.parameter_free
        optimiser_parameters = GPChainOptimiserParameters(genetic_scheme_type=scheme_type)

        # Create builder for composer and set composer params
        builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
            with_metrics(metric_function).with_optimiser_parameters(optimiser_parameters)

        composer = builder.build()

        # the optimal chain generation by composition - the most time-consuming task
        chain_evo_composed = composer.compose_chain(data=dataset_to_compose, is_visualise=False)
    else:
        if is_classification:
            chain_evo_composed = Chain(PrimaryNode('logit'))
        else:
            chain_evo_composed = Chain(PrimaryNode('lasso'))

    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=False)

    log.info('Predicting on the test set.')
    y_test = dataset.test.y_enc
    predictions = chain_evo_composed.predict(dataset_to_test, output_mode='labels').predict

    if not is_classification:
        probabilities = None
    else:
        probabilities = chain_evo_composed.predict(dataset_to_test, output_mode='full_probs').predict

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=1,
                  training_duration=1)
def run_ts_forecasting_problem(forecast_length=50, with_visualisation=True, cv_folds=None) -> None:
    """ Function which launches the time series forecasting task with composing

    :param forecast_length: length of the forecast
    :param with_visualisation: whether to show the plots
    :param cv_folds: number of folds for cross validation; if None, cross validation is not applied
    """
    file_path = '../cases/data/metocean/metocean_data_test.csv'

    df = pd.read_csv(file_path)
    time_series = np.array(df['sea_height'])

    # Train/test split
    train_part = time_series[:-forecast_length]
    test_part = time_series[-forecast_length:]

    # Prepare data for train and test
    train_input, predict_input, task = prepare_train_test_input(train_part, forecast_length)

    # Get pipeline with pre-defined structure
    init_pipeline = get_source_pipeline()

    # Init check
    preds = fit_predict_for_pipeline(pipeline=init_pipeline, train_input=train_input,
                                     predict_input=predict_input)
    display_validation_metric(predicted=preds, real=test_part,
                              actual_values=time_series, is_visualise=with_visualisation)

    # Get available_operations type
    primary_operations, secondary_operations = get_available_operations()

    # Composer parameters
    composer_requirements = GPComposerRequirements(
        primary=primary_operations, secondary=secondary_operations,
        max_arity=3, max_depth=8, pop_size=10, num_of_generations=10,
        crossover_prob=0.8, mutation_prob=0.8,
        timeout=datetime.timedelta(minutes=10),
        cv_folds=cv_folds, validation_blocks=3)

    mutation_types = [parameter_change_mutation, MutationTypesEnum.simple, MutationTypesEnum.reduce]
    optimiser_parameters = GPGraphOptimiserParameters(mutation_types=mutation_types)

    metric_function = MetricsRepository().metric_by_id(RegressionMetricsEnum.RMSE)
    builder = GPComposerBuilder(task=task). \
        with_optimiser_parameters(optimiser_parameters). \
        with_requirements(composer_requirements). \
        with_metrics(metric_function).with_initial_pipeline(init_pipeline)
    composer = builder.build()

    obtained_pipeline = composer.compose_pipeline(data=train_input, is_visualise=False)

    ###################################
    # Obtained pipeline visualisation #
    ###################################
    if with_visualisation:
        obtained_pipeline.show()

    preds = fit_predict_for_pipeline(pipeline=obtained_pipeline, train_input=train_input,
                                     predict_input=predict_input)
    display_validation_metric(predicted=preds, real=test_part,
                              actual_values=time_series, is_visualise=with_visualisation)

    obtained_pipeline.print_structure()
def run_metocean_forecasting_problem(train_file_path, test_file_path, forecast_length=1,
                                     max_window_size=64, with_visualisation=True):
    # specify the task to solve
    task_to_solve = Task(TaskTypesEnum.ts_forecasting,
                         TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size,
                                             return_all_steps=False))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train, task=task_to_solve, data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test, task=task_to_solve, data_type=DataTypesEnum.ts)

    metric_function = MetricsRepository().metric_by_id(RegressionMetricsEnum.RMSE)

    time_limit_min = 10
    available_model_types = ['linear', 'ridge', 'lasso', 'rfr', 'dtreg', 'knnreg', 'svr']

    if max_window_size == 1:
        # unit test model
        available_model_types = ['linear', 'ridge']
        time_limit_min = 0.001

    # each possible single-model chain
    for model in available_model_types:
        chain = TsForecastingChain(PrimaryNode(model))
        chain.fit(input_data=dataset_to_train, verbose=False)
        calculate_validation_metric(chain.predict(dataset_to_validate), dataset_to_validate,
                                    is_visualise=with_visualisation, label=model)

    # static multiscale chain
    multiscale_chain = get_composite_multiscale_chain()
    multiscale_chain.fit(input_data=dataset_to_train, verbose=False)
    calculate_validation_metric(multiscale_chain.predict(dataset_to_validate), dataset_to_validate,
                                is_visualise=with_visualisation, label='Fixed multiscale')

    # static all-in-one ensemble chain
    ens_chain = get_ensemble_chain()
    ens_chain.fit(input_data=dataset_to_train, verbose=False)
    calculate_validation_metric(ens_chain.predict(dataset_to_validate), dataset_to_validate,
                                is_visualise=with_visualisation, label='Ensemble composite')

    # optimized ensemble chain
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_arity=5, max_depth=2, pop_size=10, num_of_generations=10,
        crossover_prob=0.8, mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=time_limit_min),
        add_single_model_chains=False)

    builder = GPComposerBuilder(task=task_to_solve).with_requirements(composer_requirements). \
        with_metrics(metric_function)
    composer = builder.build()

    chain = composer.compose_chain(data=dataset_to_train, is_visualise=False)
    chain.fit_from_scratch(input_data=dataset_to_train, verbose=False)

    if with_visualisation:
        ComposerVisualiser.visualise(chain)

    calculate_validation_metric(chain.predict(dataset_to_validate), dataset_to_validate,
                                is_visualise=with_visualisation, label='Automated ensemble')

    # optimized multiscale chain
    available_model_types_primary = ['trend_data_model', 'residual_data_model']
    available_model_types_secondary = ['linear', 'ridge', 'lasso', 'rfr', 'dtreg', 'knnreg', 'svr']
    available_model_types_all = available_model_types_primary + available_model_types_secondary

    composer_requirements = GPComposerRequirements(
        primary=available_model_types_all, secondary=available_model_types_secondary,
        max_arity=5, max_depth=2, pop_size=10, num_of_generations=30,
        crossover_prob=0.8, mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=time_limit_min))

    builder = GPComposerBuilder(task=task_to_solve).with_requirements(composer_requirements). \
        with_metrics(metric_function).with_initial_chain(multiscale_chain)
    composer = builder.build()

    chain = composer.compose_chain(data=dataset_to_train, is_visualise=False)
    chain.fit_from_scratch(input_data=dataset_to_train, verbose=False)

    if with_visualisation:
        visualiser = ChainVisualiser()
        visualiser.visualise(chain)

    rmse_on_valid = calculate_validation_metric(chain.predict(dataset_to_validate), dataset_to_validate,
                                                is_visualise=with_visualisation,
                                                label='Automated multiscale')

    return rmse_on_valid
def run_ts_forecasting_problem(forecast_length=50, with_visualisation=True) -> None:
    """ Function which launches the time series forecasting task with composing

    :param forecast_length: length of the forecast
    :param with_visualisation: whether to show the plots
    """
    file_path = '../cases/data/metocean/metocean_data_test.csv'

    df = pd.read_csv(file_path)
    time_series = np.array(df['sea_height'])

    # Train/test split
    train_part = time_series[:-forecast_length]
    test_part = time_series[-forecast_length:]

    # Prepare data for train and test
    train_input, predict_input, task = prepare_train_test_input(train_part, forecast_length)

    # Get chain with pre-defined structure
    init_chain = get_source_chain()

    # Init check
    preds = fit_predict_for_chain(chain=init_chain, train_input=train_input, predict_input=predict_input)
    display_validation_metric(predicted=preds, real=test_part,
                              actual_values=time_series, is_visualise=with_visualisation)

    # Get available_operations type
    primary_operations, secondary_operations = get_available_operations()

    # Composer parameters
    composer_requirements = GPComposerRequirements(
        primary=primary_operations, secondary=secondary_operations,
        max_arity=3, max_depth=8, pop_size=10, num_of_generations=15,
        crossover_prob=0.8, mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=10),
        allow_single_operations=False)

    mutation_types = [MutationTypesEnum.parameter_change, MutationTypesEnum.simple, MutationTypesEnum.reduce]
    optimiser_parameters = GPChainOptimiserParameters(mutation_types=mutation_types)

    metric_function = MetricsRepository().metric_by_id(RegressionMetricsEnum.MAE)
    builder = GPComposerBuilder(task=task). \
        with_optimiser_parameters(optimiser_parameters). \
        with_requirements(composer_requirements). \
        with_metrics(metric_function).with_initial_chain(init_chain)
    composer = builder.build()

    obtained_chain = composer.compose_chain(data=train_input, is_visualise=False)

    ################################
    # Obtained chain visualisation #
    ################################
    if with_visualisation:
        visualiser = ChainVisualiser()
        visualiser.visualise(obtained_chain)

    preds = fit_predict_for_chain(chain=obtained_chain, train_input=train_input,
                                  predict_input=predict_input)
    display_validation_metric(predicted=preds, real=test_part,
                              actual_values=time_series, is_visualise=with_visualisation)

    display_chain_info(obtained_chain)
def run_experiment(file_path, init_chain):
    # Read dataframe and prepare train and test data
    df = pd.read_csv(file_path)
    features = np.array(df[['level_station_1', 'mean_temp', 'month', 'precip']])
    target = np.array(df['level_station_2'])

    x_data_train, x_data_test, y_data_train, y_data_test = train_test_split(features, target,
                                                                            test_size=0.2,
                                                                            shuffle=True,
                                                                            random_state=10)
    y_data_test = np.ravel(y_data_test)

    # Define regression task
    task = Task(TaskTypesEnum.regression)

    # Prepare data to train the model
    train_input = InputData(idx=np.arange(0, len(x_data_train)),
                            features=x_data_train, target=y_data_train,
                            task=task, data_type=DataTypesEnum.table)

    predict_input = InputData(idx=np.arange(0, len(x_data_test)),
                              features=x_data_test, target=None,
                              task=task, data_type=DataTypesEnum.table)

    available_operations_types = ['ridge', 'lasso', 'dtreg', 'xgbreg', 'adareg', 'knnreg',
                                  'linear', 'svr', 'poly_features', 'scaling', 'ransac_lin_reg',
                                  'rfe_lin_reg', 'pca', 'ransac_non_lin_reg', 'rfe_non_lin_reg',
                                  'normalization']

    composer_requirements = GPComposerRequirements(
        primary=['one_hot_encoding'], secondary=available_operations_types,
        max_arity=3, max_depth=8, pop_size=10, num_of_generations=5,
        crossover_prob=0.8, mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=5),
        allow_single_operations=True)

    metric_function = MetricsRepository().metric_by_id(RegressionMetricsEnum.MAE)
    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function).with_initial_chain(init_chain)
    composer = builder.build()

    obtained_chain = composer.compose_chain(data=train_input, is_visualise=False)

    # Display info about obtained chain
    obtained_models, depth = get_chain_info(chain=obtained_chain)

    preds = fit_predict_for_chain(chain=obtained_chain, train_input=train_input,
                                  predict_input=predict_input)

    mse_value = mean_squared_error(y_data_test, preds, squared=False)
    mae_value = mean_absolute_error(y_data_test, preds)

    print(f'RMSE - {mse_value:.2f}')
    print(f'MAE - {mae_value:.2f}\n')