Example #1
def get_model(train_file_path: str,
              cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)

    # Search for the models provided by the framework that can be
    # used as nodes in a chain for the selected task
    models_repo = ModelTypesRepository()
    available_model_types, _ = models_repo.suitable_model(
        task_type=task.task_type, tags=['simple'])

    metric_function = MetricsRepository(). \
        metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_lead_time=cur_lead_time)

    # Create the genetic programming-based composer, which allows finding
    # the optimal structure of the composite model
    builder = GPComposerBuilder(task).with_requirements(
        composer_requirements).with_metrics(metric_function)
    composer = builder.build()

    # Run the search for the best suitable model
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose)

    return chain_evo_composed
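For context, a minimal driver for get_model() might look like the sketch below. The file paths are placeholders, and the names used in the example above (InputData, Task, TaskTypesEnum, timedelta) are assumed to be in scope.

# Hypothetical usage sketch; 'train.csv' and 'test.csv' are placeholder paths
chain = get_model('train.csv', cur_lead_time=timedelta(minutes=2))

validation_data = InputData.from_csv(
    'test.csv', task=Task(task_type=TaskTypesEnum.classification))
prediction = chain.predict(validation_data)  # OutputData; scores in .predict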
Example #2
def test_parameter_free_composer_build_chain_correct(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    dataset_to_validate = data
    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=TaskTypesEnum.classification)

    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    req = GPComposerRequirements(primary=available_model_types,
                                 secondary=available_model_types,
                                 max_arity=2,
                                 max_depth=2,
                                 pop_size=2,
                                 num_of_generations=1,
                                 crossover_prob=0.4,
                                 mutation_prob=0.5)
    opt_params = GPChainOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free)
    builder = GPComposerBuilder(task=Task(
        TaskTypesEnum.classification)).with_requirements(req).with_metrics(
            metric_function).with_optimiser_parameters(opt_params)
    gp_composer = builder.build()
    chain_gp_composed = gp_composer.compose_chain(data=dataset_to_compose)

    chain_gp_composed.fit_from_scratch(input_data=dataset_to_compose)
    predicted_gp_composed = chain_gp_composed.predict(dataset_to_validate)

    roc_on_valid_gp_composed = roc_auc(y_true=dataset_to_validate.target,
                                       y_score=predicted_gp_composed.predict)

    assert roc_on_valid_gp_composed > 0.6
Example #3
def test_composer_cv_correct():
    """ Checks if the composer works correctly when using cross validation for
    time series """
    folds = 2
    _, forecast_len, validation_blocks, time_series = configure_experiment()

    primary_operations, secondary_operations = get_available_operations()

    # Composer parameters
    composer_requirements = GPComposerRequirements(
        primary=primary_operations,
        secondary=secondary_operations,
        max_arity=3,
        max_depth=3,
        pop_size=2,
        num_of_generations=2,
        crossover_prob=0.8,
        mutation_prob=0.8,
        timeout=datetime.timedelta(seconds=5),
        cv_folds=folds,
        validation_blocks=validation_blocks)

    init_pipeline = get_simple_ts_pipeline()
    metric_function = MetricsRepository().metric_by_id(
        RegressionMetricsEnum.RMSE)
    builder = GPComposerBuilder(task=time_series.task). \
        with_requirements(composer_requirements). \
        with_metrics(metric_function).with_initial_pipeline(init_pipeline)
    composer = builder.build()

    obtained_pipeline = composer.compose_pipeline(data=time_series,
                                                  is_visualise=False)
    assert isinstance(obtained_pipeline, Pipeline)
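The helpers configure_experiment(), get_available_operations() and get_simple_ts_pipeline() are defined elsewhere in the test module. Purely as an assumption about its shape, get_available_operations() could be sketched as returning two lists of operation names:

def get_available_operations():
    # Hypothetical sketch: primary (data/lag) and secondary (model)
    # operation names for a time series pipeline; the real lists live
    # in the test module
    primary_operations = ['lagged', 'smoothing', 'gaussian_filter', 'ar']
    secondary_operations = ['ridge', 'lasso', 'knnreg', 'linear']
    return primary_operations, secondary_operations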
Example #4
def test_composer_with_cv_optimization_correct():
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose, dataset_to_validate = get_data(task)

    models_repo = OperationTypesRepository()
    available_model_types, _ = models_repo.suitable_operation(
        task_type=task.task_type, tags=['simple'])

    metric_function = [
        ClassificationMetricsEnum.ROCAUC_penalty,
        ClassificationMetricsEnum.accuracy, ClassificationMetricsEnum.logloss
    ]

    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        timeout=timedelta(minutes=1),
        num_of_generations=2,
        cv_folds=3)

    builder = GPComposerBuilder(task).with_requirements(
        composer_requirements).with_metrics(metric_function)
    composer = builder.build()

    pipeline_evo_composed = composer.compose_pipeline(data=dataset_to_compose,
                                                      is_visualise=False)[0]

    assert isinstance(pipeline_evo_composed, Pipeline)

    pipeline_evo_composed.fit(input_data=dataset_to_compose)
    predicted = pipeline_evo_composed.predict(dataset_to_validate)
    roc_on_valid_evo_composed = roc_auc(y_score=predicted.predict,
                                        y_true=dataset_to_validate.target)

    assert roc_on_valid_evo_composed > 0
Example #5
def get_composed_chain(dataset_to_compose, task, metric_function):
    # Search for the models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types = get_operations_for_task(task=task, mode='models')

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=3,
        max_depth=3,
        pop_size=20,
        num_of_generations=20,
        crossover_prob=0.8,
        mutation_prob=0.8,
        allow_single_operations=False)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=scheme_type)

    # Create builder for composer and set composer params
    builder = GPComposerBuilder(
        task=task).with_requirements(composer_requirements).with_metrics(
            metric_function).with_optimiser_parameters(optimiser_parameters)

    # Create GP-based composer
    composer = builder.build()

    # Generate the optimal chain via composition - the most time-consuming step
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                is_visualise=True)

    return chain_evo_composed
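A plausible driver for get_composed_chain(), mirroring the data loading and metric selection of the other examples in this listing; the CSV path is a placeholder.

# Hypothetical usage sketch
task = Task(task_type=TaskTypesEnum.classification)
dataset_to_compose = InputData.from_csv('train.csv', task=task)
metric_function = MetricsRepository().metric_by_id(
    ClassificationMetricsEnum.ROCAUC_penalty)

chain = get_composed_chain(dataset_to_compose, task, metric_function)
chain.fit(input_data=dataset_to_compose)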
Example #6
def test_gp_composer_build_pipeline_correct(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    dataset_to_validate = data
    task = Task(TaskTypesEnum.classification)
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type)

    metric_function = ClassificationMetricsEnum.ROCAUC

    req = GPComposerRequirements(primary=available_model_types,
                                 secondary=available_model_types,
                                 max_arity=2,
                                 max_depth=2,
                                 pop_size=2,
                                 num_of_generations=1,
                                 crossover_prob=0.4,
                                 mutation_prob=0.5)

    builder = GPComposerBuilder(task).with_requirements(req).with_metrics(
        metric_function)
    gp_composer = builder.build()
    pipeline_gp_composed = gp_composer.compose_pipeline(
        data=dataset_to_compose)

    pipeline_gp_composed.fit_from_scratch(input_data=dataset_to_compose)
    predicted_gp_composed = pipeline_gp_composed.predict(dataset_to_validate)

    roc_on_valid_gp_composed = roc_auc(y_true=dataset_to_validate.target,
                                       y_score=predicted_gp_composed.predict)

    assert roc_on_valid_gp_composed > 0.6
Example #7
def test_gp_composer_with_start_depth(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    available_model_types = ['xgboost', 'knn']
    quality_metric = ClassificationMetricsEnum.ROCAUC
    req = GPComposerRequirements(primary=available_model_types,
                                 secondary=available_model_types,
                                 max_arity=2,
                                 max_depth=5,
                                 pop_size=5,
                                 num_of_generations=1,
                                 crossover_prob=0.4,
                                 mutation_prob=0.5,
                                 start_depth=2)
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=scheme_type)
    builder = GPComposerBuilder(task=Task(
        TaskTypesEnum.classification)).with_requirements(req).with_metrics(
            quality_metric).with_optimiser_parameters(optimiser_parameters)
    composer = builder.build()
    composer.compose_pipeline(data=dataset_to_compose, is_visualise=True)
    assert all(
        [ind.graph.depth <= 3 for ind in composer.history.individuals[0]])
    assert composer.optimiser.max_depth == 5
Example #8
def test_evaluate_individuals():
    project_root_path = str(fedot_project_root())
    full_path_train = os.path.join(project_root_path,
                                   'test/data/simple_classification.csv')

    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(full_path_train, task=task)
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type)

    metric_function = ClassificationMetricsEnum.ROCAUC_penalty
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types)

    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function)

    composer = builder.build()

    pipelines_to_evaluate = [
        pipeline_first(),
        pipeline_second(),
        pipeline_third(),
        pipeline_fourth()
    ]

    train_data, test_data = train_test_data_setup(
        dataset_to_compose,
        sample_split_ratio_for_tasks[dataset_to_compose.task.task_type])
    metric_function_for_nodes = partial(composer.composer_metric,
                                        composer.metrics, train_data,
                                        test_data)
    adapter = PipelineAdapter()
    population = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
    timeout = datetime.timedelta(minutes=0.001)
    params = GraphGenerationParams(adapter=PipelineAdapter(),
                                   advisor=PipelineChangeAdvisor())
    with OptimisationTimer(timeout=timeout) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             graph_generation_params=params,
                             is_multi_objective=False,
                             timer=t)
    assert len(population) == 1
    assert population[0].fitness is not None

    population = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
    timeout = datetime.timedelta(minutes=5)
    with OptimisationTimer(timeout=timeout) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             graph_generation_params=params,
                             is_multi_objective=False,
                             timer=t)
    assert len(population) == 4
    assert all([ind.fitness is not None for ind in population])
Example #9
def test_gp_composer_builder_default_params_correct():
    task = Task(TaskTypesEnum.regression)
    builder = GPComposerBuilder(task=task)

    # Initialise default parameters
    builder.set_default_composer_params()
    composer_with_default_params = builder.build()

    # Get default available operations for regression task
    primary_operations = composer_with_default_params.composer_requirements.primary

    # Data operations and models must be in this default primary operations list
    assert 'ridge' in primary_operations
    assert 'scaling' in primary_operations
Example #10
def test_evaluate_individuals():
    project_root_path = str(project_root())
    full_path_train = os.path.join(project_root_path,
                                   'test/data/simple_classification.csv')

    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(full_path_train, task=task)
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type)

    metric_function = ClassificationMetricsEnum.ROCAUC_penalty
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types)

    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function)

    composer = builder.build()

    train_data, test_data = train_test_data_setup(
        dataset_to_compose,
        sample_split_ration_for_tasks[dataset_to_compose.task.task_type])
    metric_function_for_nodes = partial(composer.composer_metric,
                                        composer.metrics, train_data,
                                        test_data)
    population = [chain_first(), chain_second(), chain_third(), chain_fourth()]
    max_lead_time = datetime.timedelta(minutes=0.001)
    with CompositionTimer(max_lead_time=max_lead_time) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             is_multi_objective=False,
                             timer=t)
    assert len(population) == 1
    assert population[0].fitness is not None

    population = [chain_first(), chain_second(), chain_third(), chain_fourth()]
    max_lead_time = datetime.timedelta(minutes=5)
    with CompositionTimer(max_lead_time=max_lead_time) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             is_multi_objective=False,
                             timer=t)
    assert len(population) == 4
    assert all([ind.fitness is not None for ind in population])
Example #11
def test_cv_ts_and_cluster_raise():
    task = Task(task_type=TaskTypesEnum.clustering)
    dataset_to_compose, dataset_to_validate = get_data(task)
    metric_function = ClusteringMetricsEnum.silhouette

    operations_repo = OperationTypesRepository()
    available_model_types, _ = operations_repo.suitable_operation(
        task_type=task.task_type)
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        cv_folds=4)
    builder = GPComposerBuilder(task).with_requirements(
        composer_requirements).with_metrics(metric_function)
    composer = builder.build()

    with pytest.raises(NotImplementedError):
        composer.compose_pipeline(data=dataset_to_compose, is_visualise=False)
Example #12
def _get_gp_composer_builder(task: Task, metric_function,
                             composer_requirements: GPComposerRequirements,
                             optimizer_parameters: GPChainOptimiserParameters,
                             logger: Log):
    """ Return GPComposerBuilder with parameters and if it is necessary
    init_chain in it """

    builder = GPComposerBuilder(task=task). \
        with_requirements(composer_requirements). \
        with_optimiser_parameters(optimizer_parameters). \
        with_metrics(metric_function).with_logger(logger)

    init_chain = _obtain_initial_assumption(task)

    if init_chain is not None:
        builder = builder.with_initial_chain(init_chain)

    return builder
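A hedged sketch of how the returned builder is typically consumed, following the composer calls used elsewhere in this listing; task, metric_function, composer_requirements, optimizer_parameters, logger and train_data are all assumed to be prepared beforehand.

# Hypothetical continuation (all argument names are placeholders)
builder = _get_gp_composer_builder(task, metric_function,
                                   composer_requirements,
                                   optimizer_parameters, logger)
composer = builder.build()
chain = composer.compose_chain(data=train_data, is_visualise=False)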
Example #13
def test_multi_objective_composer(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    dataset_to_validate = data
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=TaskTypesEnum.classification)
    quality_metric = ClassificationMetricsEnum.ROCAUC
    complexity_metric = ComplexityMetricsEnum.node_num
    metrics = [quality_metric, complexity_metric]
    req = GPComposerRequirements(primary=available_model_types,
                                 secondary=available_model_types,
                                 max_arity=2,
                                 max_depth=2,
                                 pop_size=2,
                                 num_of_generations=1,
                                 crossover_prob=0.4,
                                 mutation_prob=0.5)
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=scheme_type,
        selection_types=[SelectionTypesEnum.nsga2])
    builder = GPComposerBuilder(task=Task(
        TaskTypesEnum.classification)).with_requirements(req).with_metrics(
            metrics).with_optimiser_parameters(optimiser_parameters)
    composer = builder.build()
    pipelines_evo_composed = composer.compose_pipeline(data=dataset_to_compose)
    pipelines_roc_auc = []
    for pipeline_evo_composed in pipelines_evo_composed:
        pipeline_evo_composed.fit_from_scratch(input_data=dataset_to_compose)
        predicted_gp_composed = pipeline_evo_composed.predict(
            dataset_to_validate)

        roc_on_valid_gp_composed = roc_auc(
            y_true=dataset_to_validate.target,
            y_score=predicted_gp_composed.predict)

        pipelines_roc_auc.append(roc_on_valid_gp_composed)

    assert type(composer.metrics) is list and len(composer.metrics) > 1
    assert type(pipelines_evo_composed) is list
    assert composer.optimiser.parameters.multi_objective
    assert all([roc > 0.6 for roc in pipelines_roc_auc])
Example #14
def _get_gp_composer_builder(task: Task, metric_function,
                             composer_requirements: GPComposerRequirements,
                             optimizer_parameters: GPGraphOptimiserParameters,
                             data: Union[InputData, MultiModalData],
                             initial_pipeline: Pipeline,
                             logger: Log):
    """ Return GPComposerBuilder with parameters and if it is necessary
    init_pipeline in it """

    builder = GPComposerBuilder(task=task). \
        with_requirements(composer_requirements). \
        with_optimiser_parameters(optimizer_parameters). \
        with_metrics(metric_function).with_logger(logger)

    init_pipeline = _obtain_initial_assumption(task, data) if not initial_pipeline else initial_pipeline

    if init_pipeline is not None:
        builder = builder.with_initial_pipeline(init_pipeline)

    return builder
Example #15
def test_gp_composer_saving_info_from_process(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    available_model_types = ['xgboost', 'knn']
    quality_metric = ClassificationMetricsEnum.ROCAUC
    req = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=2,
        max_depth=2,
        pop_size=2,
        num_of_generations=1,
        crossover_prob=0.4,
        mutation_prob=0.5,
        start_depth=2,
        max_chain_fit_time=datetime.timedelta(minutes=5),
        allow_single_operations=False)
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=scheme_type)
    builder = GPComposerBuilder(
        task=Task(TaskTypesEnum.classification)).with_requirements(
            req).with_metrics(quality_metric).with_optimiser_parameters(
                optimiser_parameters).with_cache()
    composer = builder.build()
    train_data, test_data = train_test_data_setup(
        data, sample_split_ration_for_tasks[data.task.task_type])
    composer.compose_chain(data=dataset_to_compose, is_visualise=True)
    with shelve.open(composer.cache.db_path) as cache:
        global_cache_len_before = len(cache.dict)
    new_chain = chain_first()
    composer.composer_metric([quality_metric], dataset_to_compose, test_data,
                             new_chain)
    with shelve.open(composer.cache.db_path) as cache:
        global_cache_len_after = len(cache.dict)
    assert global_cache_len_before < global_cache_len_after
    assert new_chain.computation_time is not None
    assert new_chain.fitted_on_data is not None
Example #16
def test_parameter_free_composer_build_pipeline_correct(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    dataset_to_validate = data
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=TaskTypesEnum.classification)

    metric_function = ClassificationMetricsEnum.ROCAUC

    req = GPComposerRequirements(primary=available_model_types,
                                 secondary=available_model_types,
                                 max_arity=2,
                                 max_depth=2,
                                 pop_size=2,
                                 num_of_generations=4,
                                 crossover_prob=0.4,
                                 mutation_prob=0.5)
    opt_params = GPGraphOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free)
    builder = GPComposerBuilder(task=Task(
        TaskTypesEnum.classification)).with_requirements(req).with_metrics(
            metric_function).with_optimiser_parameters(opt_params)
    gp_composer = builder.build()
    pipeline_gp_composed = gp_composer.compose_pipeline(
        data=dataset_to_compose)

    pipeline_gp_composed.fit_from_scratch(input_data=dataset_to_compose)
    predicted_gp_composed = pipeline_gp_composed.predict(dataset_to_validate)

    roc_on_valid_gp_composed = roc_auc(y_true=dataset_to_validate.target,
                                       y_score=predicted_gp_composed.predict)
    population_len = sum([
        len(history) for history in gp_composer.history.individuals
    ]) / len(gp_composer.history.individuals)
    assert population_len != len(gp_composer.history.individuals[0])
    assert roc_on_valid_gp_composed > 0.6
Example #17
def test_gp_composer_builder():
    task = Task(TaskTypesEnum.classification)

    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=task.task_type)

    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=3,
        max_depth=3,
        pop_size=5,
        num_of_generations=4,
        crossover_prob=0.8,
        mutation_prob=1,
        max_lead_time=datetime.timedelta(minutes=5))

    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=scheme_type)

    builder_with_custom_params = GPComposerBuilder(
        task=task).with_requirements(composer_requirements).with_metrics(
            metric_function).with_optimiser_parameters(optimiser_parameters)

    composer_with_custom_params = builder_with_custom_params.build()

    assert composer_with_custom_params.optimiser.parameters.genetic_scheme_type == scheme_type
    assert composer_with_custom_params.metrics == metric_function
    assert composer_with_custom_params.composer_requirements.pop_size == 5
    assert composer_with_custom_params.composer_requirements.mutation_prob == 1

    builder_with_default_params = GPComposerBuilder(task=task)
    composer_with_default_params = builder_with_default_params.build()

    default_metric = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC_penalty)

    assert composer_with_default_params.optimiser.parameters.genetic_scheme_type == GeneticSchemeTypesEnum.generational
    assert composer_with_default_params.metrics == default_metric
    assert composer_with_default_params.composer_requirements.pop_size == 20
    assert composer_with_default_params.composer_requirements.mutation_prob == 0.8
Example #18
def test_composition_time(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    task = Task(TaskTypesEnum.classification)
    models_impl = ['mlp', 'knn']
    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    req_terminated_evolution = GPComposerRequirements(
        primary=models_impl,
        secondary=models_impl,
        max_arity=2,
        max_depth=2,
        pop_size=2,
        num_of_generations=5,
        crossover_prob=0.9,
        mutation_prob=0.9,
        max_lead_time=datetime.timedelta(minutes=0.000001))

    builder = GPComposerBuilder(task).with_requirements(
        req_terminated_evolution).with_metrics(metric_function)

    gp_composer_terminated_evolution = builder.build()

    _ = gp_composer_terminated_evolution.compose_chain(data=data)

    req_completed_evolution = GPComposerRequirements(primary=models_impl,
                                                     secondary=models_impl,
                                                     max_arity=2,
                                                     max_depth=2,
                                                     pop_size=2,
                                                     num_of_generations=2,
                                                     crossover_prob=0.4,
                                                     mutation_prob=0.5)

    builder = GPComposerBuilder(task).with_requirements(
        req_completed_evolution).with_metrics(metric_function)
    gp_composer_completed_evolution = builder.build()

    _ = gp_composer_completed_evolution.compose_chain(data=data)

    assert len(gp_composer_terminated_evolution.history.chains) == len(
        gp_composer_completed_evolution.history.chains)
Example #19
def run_multi_modal_case(files_path,
                         is_visualise=False,
                         timeout=datetime.timedelta(minutes=2)):
    task = Task(TaskTypesEnum.classification)
    images_size = (128, 128)

    train_num, test_num, train_img, test_img, train_text, test_text = prepare_multi_modal_data(
        files_path, task, images_size)

    pipeline, fit_data, predict_data = generate_initial_pipeline_and_data(
        images_size, train_num, test_num, train_img, test_img, train_text,
        test_text)

    # Search for the models provided by the framework that can be used as nodes in a pipeline for the selected task
    available_model_types = get_operations_for_task(task=task, mode='model')

    # the choice of the metric for the pipeline quality assessment during composition
    metric_function = ClassificationMetricsEnum.ROCAUC_penalty
    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=3,
        max_depth=3,
        pop_size=5,
        num_of_generations=5,
        crossover_prob=0.8,
        mutation_prob=0.8,
        timeout=timeout)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.parameter_free
    optimiser_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=scheme_type)

    # Create builder for composer and set composer params
    logger = default_log('FEDOT logger', verbose_level=4)

    # The multimodal template (with data sources) is passed as the initial assumption for the composer
    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function).with_optimiser_parameters(optimiser_parameters).with_logger(logger=logger). \
        with_initial_pipeline(pipeline).with_cache('multi_modal_opt.cache')

    # Create GP-based composer
    composer = builder.build()

    # Generate the optimal pipeline via composition - the most time-consuming step
    pipeline_evo_composed = composer.compose_pipeline(data=fit_data,
                                                      is_visualise=True)

    pipeline_evo_composed.fit(input_data=fit_data)

    if is_visualise:
        pipeline_evo_composed.show()

    prediction = pipeline_evo_composed.predict(predict_data)

    err = calculate_validation_metric(prediction, test_num)

    print(f'ROC AUC for validation sample is {err}')

    return err
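calculate_validation_metric() is defined in the case script; as the printed message suggests, it presumably computes ROC AUC. A minimal sketch under that assumption:

from sklearn.metrics import roc_auc_score

def calculate_validation_metric(prediction, test_data):
    # Hypothetical sketch: ROC AUC between true labels and predicted
    # probabilities (the real helper lives in the case script)
    return roc_auc_score(test_data.target, prediction.predict)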
Example #20
def run_credit_scoring_problem(
        train_file_path,
        test_file_path,
        max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
        is_visualise=False,
        with_tuning=False):
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # Search for the models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=task.task_type)

    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC_penalty)

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=3,
        max_depth=3,
        pop_size=20,
        num_of_generations=20,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=max_lead_time)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=scheme_type)

    # Create builder for composer and set composer params
    builder = GPComposerBuilder(
        task=task).with_requirements(composer_requirements).with_metrics(
            metric_function).with_optimiser_parameters(optimiser_parameters)

    # Create GP-based composer
    composer = builder.build()

    # Generate the optimal chain via composition - the most time-consuming step
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                is_visualise=True)

    if with_tuning:
        chain_evo_composed.fine_tune_primary_nodes(
            input_data=dataset_to_compose, iterations=50, verbose=True)

    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True)

    if is_visualise:
        visualiser = ChainVisualiser()

        composer.log.info('History visualization started')
        visualiser.visualise_history(composer.history)
        composer.log.info('History visualization finished')
        composer.history.write_composer_history_to_csv()

        composer.log.info('Best chain visualization started')
        visualiser.visualise(chain_evo_composed)
        composer.log.info('Best chain visualization finished')

    # the quality assessment for the obtained composite models
    roc_on_valid_evo_composed = calculate_validation_metric(
        chain_evo_composed, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed
Example #21
def run_credit_scoring_problem(
        train_file_path,
        test_file_path,
        max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
        is_visualise=False):
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # Search for the models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types = get_operations_for_task(task=task, mode='models')

    # the choice of the metric for the chain quality assessment during composition
    quality_metric = ClassificationMetricsEnum.ROCAUC
    complexity_metric = ComplexityMetricsEnum.node_num
    metrics = [quality_metric, complexity_metric]
    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=3,
        max_depth=3,
        pop_size=20,
        num_of_generations=20,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=max_lead_time,
        start_depth=2,
        allow_single_operations=False)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.parameter_free
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=scheme_type,
        selection_types=[SelectionTypesEnum.spea2])

    # Create builder for composer and set composer params
    builder = GPComposerBuilder(
        task=task).with_requirements(composer_requirements).with_metrics(
            metrics).with_optimiser_parameters(optimiser_parameters)

    # Create GP-based composer
    composer = builder.build()

    # Generate the optimal chain via composition - the most time-consuming step
    chains_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                 is_visualise=True)

    composer.history.write_composer_history_to_csv()

    if is_visualise:
        results_visualization(composed_chains=chains_evo_composed,
                              history=composer.history)

    chains_roc_auc = []
    for chain_num, chain_evo_composed in enumerate(chains_evo_composed):

        chain_evo_composed.fine_tune_primary_nodes(
            input_data=dataset_to_compose, iterations=50)

        chain_evo_composed.fit(input_data=dataset_to_compose)

        # the quality assessment for the obtained composite models
        roc_on_valid_evo_composed = calculate_validation_metric(
            chain_evo_composed, dataset_to_validate)

        chains_roc_auc.append(roc_on_valid_evo_composed)
        if len(chains_evo_composed) > 1:
            print(
                f'Composed ROC AUC of chain {chain_num + 1} is {round(roc_on_valid_evo_composed, 3)}'
            )

        else:
            print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return max(chains_roc_auc)
Example #22
def run(dataset, config):
    log.info("\n**** FEDOT ****\n")

    is_classification = config.type == 'classification'
    # Mapping of benchmark metrics to FEDOT metrics
    metrics_mapping = dict(acc='accuracy',
                           auc='roc_auc',
                           f1='f1',
                           logloss='neg_log_loss',
                           mae='neg_mean_absolute_error',
                           mse='neg_mean_squared_error',
                           msle='neg_mean_squared_log_error',
                           r2='r2',
                           rmse='neg_mean_squared_error')
    scoring_metric = metrics_mapping.get(config.metric)
    if scoring_metric is None:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))

    if is_classification:
        metric = ClassificationMetricsEnum.ROCAUC
        task_type = TaskTypesEnum.classification
    else:
        metric = RegressionMetricsEnum.RMSE
        task_type = TaskTypesEnum.regression

    task = Task(task_type)

    x_test = dataset.test.X_enc

    # Shuffle the training data deterministically
    x_train, y_train = shuffle(dataset.train.X_enc,
                               dataset.train.y_enc,
                               random_state=0)

    if len(y_train.shape) > 1 and y_train.shape[1] == 1:
        y_train = np.squeeze(y_train, axis=1)

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    dataset_to_compose = \
        InputData(idx=list(range(len(y_train))),
                  features=x_train,
                  target=y_train,
                  task=task,
                  data_type=DataTypesEnum.table)

    # Indices for the test set follow the length of x_test
    dataset_to_test = \
        InputData(idx=list(range(len(x_test))),
                  features=x_test,
                  target=None,
                  task=task,
                  data_type=DataTypesEnum.table)

    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config

    log.info(
        'Running FEDOT with a maximum time of %ss on %s cores, optimizing %s.',
        config.max_runtime_seconds, n_jobs, scoring_metric)
    runtime_min = (config.max_runtime_seconds / 60)

    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=task.task_type)

    metric_function = MetricsRepository().metric_by_id(metric)

    # Create GP-based composer
    composer = GPComposer()

    use_composer = False  # composition disabled; a fixed single-node chain is used
    if use_composer:
        # the choice and initialisation of the GP search
        composer_requirements = GPComposerRequirements(
            primary=available_model_types,
            secondary=available_model_types,
            max_arity=3,
            max_depth=3,
            max_lead_time=datetime.timedelta(minutes=runtime_min * 0.8))

        # GP optimiser parameters choice
        scheme_type = GeneticSchemeTypesEnum.parameter_free
        optimiser_parameters = GPChainOptimiserParameters(
            genetic_scheme_type=scheme_type)

        # Create builder for composer and set composer params
        builder = GPComposerBuilder(
            task=task).with_requirements(composer_requirements).with_metrics(
                metric_function).with_optimiser_parameters(
                    optimiser_parameters)

        composer = builder.build()

        # Generate the optimal chain via composition - the most time-consuming step
        chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                    is_visualise=False)

    else:
        if is_classification:
            chain_evo_composed = Chain(PrimaryNode('logit'))
        else:
            chain_evo_composed = Chain(PrimaryNode('lasso'))

    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=False)

    log.info('Predicting on the test set.')
    y_test = dataset.test.y_enc
    predictions = chain_evo_composed.predict(dataset_to_test,
                                             output_mode='labels').predict

    if not is_classification:
        probabilities = None
    else:
        probabilities = chain_evo_composed.predict(
            dataset_to_test, output_mode='full_probs').predict

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=1,
                  training_duration=1)
Example #23
def run_fedot(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task_type = params.task

    if task_type == TaskTypesEnum.classification:
        metric = ClassificationMetricsEnum.ROCAUC
    elif task_type == TaskTypesEnum.regression:
        metric = RegressionMetricsEnum.RMSE
    else:
        raise NotImplementedError()

    metric_func = MetricsRepository().metric_by_id(metric)

    task = Task(task_type)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    models_hyperparameters = get_models_hyperparameters()['FEDOT']
    cur_lead_time = models_hyperparameters['MAX_RUNTIME_MINS']

    saved_model_name = f'fedot_{case_label}_{task_type.name}_{cur_lead_time}_{metric.name}'
    loaded_model = load_fedot_model(saved_model_name)

    if not loaded_model:
        generations = models_hyperparameters['GENERATIONS']
        population_size = models_hyperparameters['POPULATION_SIZE']

        # Search for the models provided by the framework that can be used as nodes in a chain
        models_repo = ModelTypesRepository()
        available_model_types, _ = models_repo.suitable_model(task.task_type)

        heavy_models = ['svc', 'multinb', 'tfidf', 'qda']
        available_model_types = [
            model for model in available_model_types
            if model not in heavy_models
        ]

        # the choice and initialisation of the GP search
        composer_requirements = GPComposerRequirements(
            primary=available_model_types,
            secondary=available_model_types,
            max_arity=3,
            max_depth=2,
            pop_size=population_size,
            num_of_generations=generations,
            crossover_prob=0.8,
            mutation_prob=0.8,
            max_lead_time=datetime.timedelta(minutes=cur_lead_time),
            add_single_model_chains=True)

        # Create GP-based composer
        builder = GPComposerBuilder(task).with_requirements(
            composer_requirements).with_metrics(metric_func)
        gp_composer = builder.build()

        chain_gp_composed = gp_composer.compose_chain(data=dataset_to_compose)

        chain_gp_composed.fit_from_scratch(input_data=dataset_to_compose)
        save_fedot_model(chain_gp_composed, saved_model_name)
    else:
        chain_gp_composed = loaded_model

    evo_predicted = chain_gp_composed.predict(dataset_to_validate)
    evo_predicted_labels = chain_gp_composed.predict(dataset_to_validate,
                                                     output_mode='labels')

    return dataset_to_validate.target, evo_predicted.predict, evo_predicted_labels.predict
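A hypothetical consumer of run_fedot(), scoring the returned probabilities against the validation target; params is assumed to be a prepared ExecutionParams and sklearn to be available.

from sklearn.metrics import roc_auc_score

# Placeholder driver: params must be prepared elsewhere
target, predicted_probs, predicted_labels = run_fedot(params)
print(f'ROC AUC: {roc_auc_score(target, predicted_probs):.3f}')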
Example #24
def make_forecast(df, len_forecast: int):
    """
    Function for making time series forecasting with AutoTS library

    :param df: dataframe to process
    :param len_forecast: forecast length

    :return predicted_values: forecast
    :return model_name: name of the model (always 'AutoTS')
    """

    time_series = np.array(df['value'])
    train_input, predict_input, task = prepare_input_data(len_forecast=len_forecast,
                                                          train_data_features=time_series,
                                                          train_data_target=time_series,
                                                          test_data_features=time_series)

    # Get chain with pre-defined structure
    init_chain = get_source_chain()

    # Init check
    preds = fit_predict_for_chain(chain=init_chain,
                                  train_input=train_input,
                                  predict_input=predict_input)

    # Get available operation types
    primary_operations, secondary_operations = get_available_operations()

    # Composer parameters
    composer_requirements = GPComposerRequirements(
        primary=primary_operations,
        secondary=secondary_operations, max_arity=3,
        max_depth=7, pop_size=10, num_of_generations=10,
        crossover_prob=0.8, mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=5),
        allow_single_operations=False)

    mutation_types = [MutationTypesEnum.parameter_change,
                      MutationTypesEnum.simple,
                      MutationTypesEnum.reduce]
    optimiser_parameters = GPChainOptimiserParameters(
        mutation_types=mutation_types)

    metric_function = MetricsRepository().metric_by_id(
        RegressionMetricsEnum.MAE)
    builder = GPComposerBuilder(task=task). \
        with_optimiser_parameters(optimiser_parameters). \
        with_requirements(composer_requirements). \
        with_metrics(metric_function).with_initial_chain(init_chain)
    composer = builder.build()

    obtained_chain = composer.compose_chain(data=train_input,
                                            is_visualise=False)

    chain_tuner = ChainTuner(chain=obtained_chain,
                             task=task,
                             iterations=10)
    tuned_chain = chain_tuner.tune_chain(input_data=train_input,
                                         loss_function=mean_squared_error,
                                         loss_params={'squared': False})

    preds = fit_predict_for_chain(chain=tuned_chain,
                                  train_input=train_input,
                                  predict_input=predict_input)

    list_with_nodes = display_chain_info(obtained_chain)

    model_name = str(list_with_nodes)
    return preds, model_name
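A minimal driver for make_forecast(), using a synthetic series so the sketch is self-contained; the column name 'value' matches what the function reads.

import numpy as np
import pandas as pd

# Hypothetical input: a smooth synthetic series with 400 points
df = pd.DataFrame({'value': np.sin(np.linspace(0, 20, 400))})
predicted_values, model_name = make_forecast(df, len_forecast=50)
print(f'Forecasting chain: {model_name}')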
Example #25
def run_metocean_forecasting_problem(train_file_path,
                                     test_file_path,
                                     forecast_length=1,
                                     max_window_size=64,
                                     with_visualisation=True):
    # specify the task to solve
    task_to_solve = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size,
                            return_all_steps=False))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    metric_function = MetricsRepository().metric_by_id(
        RegressionMetricsEnum.RMSE)

    time_limit_min = 10
    available_model_types = [
        'linear', 'ridge', 'lasso', 'rfr', 'dtreg', 'knnreg', 'svr'
    ]

    if max_window_size == 1:
        # unit test model
        available_model_types = ['linear', 'ridge']
        time_limit_min = 0.001

    # each possible single-model chain
    for model in available_model_types:
        chain = TsForecastingChain(PrimaryNode(model))

        chain.fit(input_data=dataset_to_train, verbose=False)
        calculate_validation_metric(chain.predict(dataset_to_validate),
                                    dataset_to_validate,
                                    is_visualise=with_visualisation,
                                    label=model)

    # static multiscale chain
    multiscale_chain = get_composite_multiscale_chain()

    multiscale_chain.fit(input_data=dataset_to_train, verbose=False)
    calculate_validation_metric(multiscale_chain.predict(dataset_to_validate),
                                dataset_to_validate,
                                is_visualise=with_visualisation,
                                label='Fixed multiscale')

    # static all-in-one ensemble chain
    ens_chain = get_ensemble_chain()
    ens_chain.fit(input_data=dataset_to_train, verbose=False)
    calculate_validation_metric(ens_chain.predict(dataset_to_validate),
                                dataset_to_validate,
                                is_visualise=with_visualisation,
                                label='Ensemble composite')

    # optimized ensemble chain
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=5,
        max_depth=2,
        pop_size=10,
        num_of_generations=10,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=time_limit_min),
        add_single_model_chains=False)

    builder = GPComposerBuilder(task=task_to_solve).with_requirements(
        composer_requirements).with_metrics(metric_function)
    composer = builder.build()

    chain = composer.compose_chain(data=dataset_to_train, is_visualise=False)
    chain.fit_from_scratch(input_data=dataset_to_train, verbose=False)

    if with_visualisation:
        ComposerVisualiser.visualise(chain)

    calculate_validation_metric(chain.predict(dataset_to_validate),
                                dataset_to_validate,
                                is_visualise=with_visualisation,
                                label='Automated ensemble')

    # optimized multiscale chain

    available_model_types_primary = ['trend_data_model', 'residual_data_model']

    available_model_types_secondary = [
        'linear', 'ridge', 'lasso', 'rfr', 'dtreg', 'knnreg', 'svr'
    ]

    available_model_types_all = available_model_types_primary + available_model_types_secondary

    composer_requirements = GPComposerRequirements(
        primary=available_model_types_all,
        secondary=available_model_types_secondary,
        max_arity=5,
        max_depth=2,
        pop_size=10,
        num_of_generations=30,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=time_limit_min))

    builder = GPComposerBuilder(task=task_to_solve).with_requirements(
        composer_requirements).with_metrics(
            metric_function).with_initial_chain(multiscale_chain)
    composer = builder.build()

    chain = composer.compose_chain(data=dataset_to_train, is_visualise=False)
    chain.fit_from_scratch(input_data=dataset_to_train, verbose=False)

    if with_visualisation:
        visualiser = ChainVisualiser()
        visualiser.visualise(chain)

    rmse_on_valid = calculate_validation_metric(
        chain.predict(dataset_to_validate),
        dataset_to_validate,
        is_visualise=with_visualisation,
        label='Automated multiscale')

    return rmse_on_valid
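Here calculate_validation_metric() evidently returns an RMSE (the result is assigned to rmse_on_valid); a rough sketch under that assumption, leaving aside the plotting the real helper does.

import numpy as np

def calculate_validation_metric(predicted, dataset_to_validate,
                                is_visualise=False, label=''):
    # Hypothetical sketch: RMSE between forecast and the actual series;
    # the real helper also plots when is_visualise is True
    rmse = np.sqrt(np.mean((np.ravel(dataset_to_validate.target) -
                            np.ravel(predicted.predict)) ** 2))
    print(f'{label}: RMSE = {rmse:.3f}')
    return rmse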
Example #26
def run_river_composer_experiment(file_path, init_pipeline, file_to_save,
                                  iterations=20, tuner=None):
    """ Function launch experiment for river level prediction. Composing and
    tuner processes are available for such experiment.

    :param file_path: path to the file with river level data
    :param init_pipeline: pipeline to start composing process
    :param file_to_save: path to the file and file name to save report
    :param iterations: amount of iterations to process
    :param tuner: if tuning after composing process is required or not. tuner -
    NodesTuner or PipelineTuner.
    """

    # Read dataframe and prepare train and test data
    data = InputData.from_csv(file_path, target_columns='level_station_2',
                              task=Task(TaskTypesEnum.regression),
                              columns_to_drop=['date'])
    train_input, predict_input = train_test_data_setup(data)
    y_data_test = np.array(predict_input.target)

    available_secondary_operations = ['ridge', 'lasso', 'dtreg',
                                      'xgbreg', 'adareg', 'knnreg',
                                      'linear', 'svr', 'poly_features',
                                      'scaling', 'ransac_lin_reg', 'rfe_lin_reg',
                                      'pca', 'ransac_non_lin_reg',
                                      'rfe_non_lin_reg', 'normalization']
    available_primary_operations = ['one_hot_encoding']

    # Report arrays
    obtained_pipelines = []
    depths = []
    maes = []
    for i in range(0, iterations):
        print(f'Iteration {i}\n')

        composer_requirements = GPComposerRequirements(
            primary=available_primary_operations,
            secondary=available_secondary_operations, max_arity=3,
            max_depth=8, pop_size=10, num_of_generations=5,
            crossover_prob=0.8, mutation_prob=0.8,
            timeout=datetime.timedelta(minutes=5))

        metric_function = MetricsRepository().metric_by_id(
            RegressionMetricsEnum.MAE)
        builder = GPComposerBuilder(task=data.task). \
            with_requirements(composer_requirements). \
            with_metrics(metric_function).with_initial_pipeline(init_pipeline)
        composer = builder.build()

        obtained_pipeline = composer.compose_pipeline(data=train_input, is_visualise=False)

        # Display info about obtained pipeline
        obtained_models, depth = get_pipeline_info(pipeline=obtained_pipeline)

        preds = fit_predict_for_pipeline(pipeline=obtained_pipeline,
                                         train_input=train_input,
                                         predict_input=predict_input)

        rmse_value = mean_squared_error(y_data_test, preds, squared=False)
        mae_value = mean_absolute_error(y_data_test, preds)
        print(f'Obtained metrics for current iteration {i}:')
        print(f'RMSE - {rmse_value:.2f}')
        print(f'MAE - {mae_value:.2f}\n')

        if tuner is not None:
            print('Start tuning process ...')
            pipeline_tuner = tuner(pipeline=obtained_pipeline, task=data.task,
                                   iterations=100)
            tuned_pipeline = pipeline_tuner.tune_pipeline(input_data=train_input,
                                                          loss_function=mean_absolute_error)

            preds_tuned = fit_predict_for_pipeline(pipeline=tuned_pipeline,
                                                   train_input=train_input,
                                                   predict_input=predict_input)

            rmse_value = mean_squared_error(y_data_test, preds_tuned, squared=False)
            mae_value = mean_absolute_error(y_data_test, preds_tuned)

            print(f'Obtained metrics for current iteration {i} after tuning:')
            print(f'RMSE - {rmse_value:.2f}')
            print(f'MAE - {mae_value:.2f}\n')

        obtained_pipelines.append(obtained_models)
        maes.append(mae_value)
        depths.append(depth)

    report = pd.DataFrame({'Pipeline': obtained_pipelines,
                           'Depth': depths,
                           'MAE': maes})
    report.to_csv(file_to_save, index=False)
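A hypothetical invocation with tuning enabled; the CSV path is a placeholder, PipelineTuner is assumed to be imported from FEDOT's tuning module, and init_pipeline is assumed to be built beforehand.

# Placeholder path; init_pipeline and PipelineTuner prepared elsewhere
run_river_composer_experiment(file_path='data/river_levels.csv',
                              init_pipeline=init_pipeline,
                              file_to_save='river_report.csv',
                              iterations=5,
                              tuner=PipelineTuner)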
Example #27
def run_ts_forecasting_problem(forecast_length=50,
                               with_visualisation=True,
                               cv_folds=None) -> None:
    """ Function launch time series task with composing

    :param forecast_length: length of the forecast
    :param with_visualisation: is it needed to show the plots
    :param cv_folds: is it needed apply cross validation and what number
    of folds to use
    """
    file_path = '../cases/data/metocean/metocean_data_test.csv'

    df = pd.read_csv(file_path)
    time_series = np.array(df['sea_height'])

    # Train/test split
    train_part = time_series[:-forecast_length]
    test_part = time_series[-forecast_length:]

    # Prepare data for train and test
    train_input, predict_input, task = prepare_train_test_input(
        train_part, forecast_length)

    # Get pipeline with pre-defined structure
    init_pipeline = get_source_pipeline()

    # Init check
    preds = fit_predict_for_pipeline(pipeline=init_pipeline,
                                     train_input=train_input,
                                     predict_input=predict_input)
    display_validation_metric(predicted=preds,
                              real=test_part,
                              actual_values=time_series,
                              is_visualise=with_visualisation)

    # Get available operation types
    primary_operations, secondary_operations = get_available_operations()

    # Composer parameters
    composer_requirements = GPComposerRequirements(
        primary=primary_operations,
        secondary=secondary_operations,
        max_arity=3,
        max_depth=8,
        pop_size=10,
        num_of_generations=10,
        crossover_prob=0.8,
        mutation_prob=0.8,
        timeout=datetime.timedelta(minutes=10),
        cv_folds=cv_folds,
        validation_blocks=3)

    mutation_types = [
        parameter_change_mutation, MutationTypesEnum.simple,
        MutationTypesEnum.reduce
    ]
    optimiser_parameters = GPGraphOptimiserParameters(
        mutation_types=mutation_types)

    metric_function = MetricsRepository().metric_by_id(
        RegressionMetricsEnum.RMSE)
    builder = GPComposerBuilder(task=task). \
        with_optimiser_parameters(optimiser_parameters).\
        with_requirements(composer_requirements).\
        with_metrics(metric_function).with_initial_pipeline(init_pipeline)
    composer = builder.build()

    obtained_pipeline = composer.compose_pipeline(data=train_input,
                                                  is_visualise=False)

    ###################################
    # Obtained pipeline visualisation #
    ###################################
    if with_visualisation:
        obtained_pipeline.show()

    preds = fit_predict_for_pipeline(pipeline=obtained_pipeline,
                                     train_input=train_input,
                                     predict_input=predict_input)

    display_validation_metric(predicted=preds,
                              real=test_part,
                              actual_values=time_series,
                              is_visualise=with_visualisation)

    obtained_pipeline.print_structure()
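
The data preparation helper prepare_train_test_input is not shown in these examples. The following is one plausible implementation, assuming FEDOT's time series API (TsForecastingParams, DataTypesEnum.ts); the import paths match the FEDOT versions these examples appear to target and may differ in newer releases.

import numpy as np
from fedot.core.data.data import InputData
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum, TsForecastingParams

def prepare_train_test_input(train_part, len_forecast):
    """ Prepare InputData for fitting and forecasting
    (a sketch, not the original helper) """
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    # For fitting, the series serves as both features and target
    train_input = InputData(idx=np.arange(0, len(train_part)),
                            features=train_part,
                            target=train_part,
                            task=task,
                            data_type=DataTypesEnum.ts)

    # For forecasting, indices continue beyond the end of the training part
    start_forecast = len(train_part)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=train_part,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input, task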
Example #28
def run_credit_scoring_problem(
        train_file_path,
        test_file_path,
        max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
        is_visualise=False,
        with_tuning=False,
        cache_path=None):
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # search for the models provided by the framework
    # that can be used as nodes in a chain for the selected task
    available_model_types = get_operations_for_task(task=task, mode='models')

    # choose the metric for chain quality assessment during composition
    metric_function = ClassificationMetricsEnum.ROCAUC_penalty
    # choose and initialise the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=3,
        max_depth=3,
        pop_size=20,
        num_of_generations=20,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=max_lead_time)

    # Choose the GP optimiser parameters
    scheme_type = GeneticSchemeTypesEnum.parameter_free
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=scheme_type)

    # Create builder for composer and set composer params
    logger = default_log('FEDOT logger', verbose_level=4)
    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function).with_optimiser_parameters(optimiser_parameters).with_logger(logger=logger)

    if cache_path:
        builder = builder.with_cache(cache_path)

    # Create GP-based composer
    composer = builder.build()

    # generate the optimal chain via composition - the most time-consuming step
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                is_visualise=True)

    if with_tuning:
        # TODO Add tuning
        raise NotImplementedError('Tuning is not supported')

    chain_evo_composed.fit(input_data=dataset_to_compose)

    composer.history.write_composer_history_to_csv()

    if is_visualise:
        visualiser = ChainVisualiser()

        composer.log.debug('History visualization started')
        visualiser.visualise_history(composer.history)
        composer.log.debug('History visualization finished')

        composer.log.debug('Best chain visualization started')
        visualiser.visualise(chain_evo_composed)
        composer.log.debug('Best chain visualization finished')

    # quality assessment of the obtained composite model
    roc_on_valid_evo_composed = calculate_validation_metric(
        chain_evo_composed, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed
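
calculate_validation_metric is assumed here to evaluate the composed chain with ROC AUC, matching how its result is reported above. A minimal sketch, using sklearn's roc_auc_score aliased as roc_auc, as in the other examples:

from sklearn.metrics import roc_auc_score as roc_auc

def calculate_validation_metric(chain, dataset_to_validate):
    """ ROC AUC of a fitted chain on the validation data (a sketch) """
    # predict returns an OutputData object; .predict holds the class scores
    predicted = chain.predict(dataset_to_validate)
    roc_auc_value = roc_auc(y_true=dataset_to_validate.target,
                            y_score=predicted.predict)
    return roc_auc_value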
Example #29
def run_ts_forecasting_problem(forecast_length=50,
                               with_visualisation=True) -> None:
    """ Function launch time series task with composing

    :param forecast_length: length of the forecast
    :param with_visualisation: is it needed to show the plots
    """
    file_path = '../cases/data/metocean/metocean_data_test.csv'

    df = pd.read_csv(file_path)
    time_series = np.array(df['sea_height'])

    # Train/test split
    train_part = time_series[:-forecast_length]
    test_part = time_series[-forecast_length:]

    # Prepare data for train and test
    train_input, predict_input, task = prepare_train_test_input(
        train_part, forecast_length)

    # Get chain with pre-defined structure
    init_chain = get_source_chain()

    # Init check
    preds = fit_predict_for_chain(chain=init_chain,
                                  train_input=train_input,
                                  predict_input=predict_input)
    display_validation_metric(predicted=preds,
                              real=test_part,
                              actual_values=time_series,
                              is_visualise=with_visualisation)

    # Get available operation types
    primary_operations, secondary_operations = get_available_operations()

    # Composer parameters
    composer_requirements = GPComposerRequirements(
        primary=primary_operations,
        secondary=secondary_operations,
        max_arity=3,
        max_depth=8,
        pop_size=10,
        num_of_generations=15,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=10),
        allow_single_operations=False)

    mutation_types = [
        MutationTypesEnum.parameter_change, MutationTypesEnum.simple,
        MutationTypesEnum.reduce
    ]
    optimiser_parameters = GPChainOptimiserParameters(
        mutation_types=mutation_types)

    metric_function = MetricsRepository().metric_by_id(
        RegressionMetricsEnum.MAE)
    builder = GPComposerBuilder(task=task). \
        with_optimiser_parameters(optimiser_parameters).\
        with_requirements(composer_requirements).\
        with_metrics(metric_function).with_initial_chain(init_chain)
    composer = builder.build()

    obtained_chain = composer.compose_chain(data=train_input,
                                            is_visualise=False)

    ################################
    # Obtained chain visualisation #
    ################################
    if with_visualisation:
        visualiser = ChainVisualiser()
        visualiser.visualise(obtained_chain)

    preds = fit_predict_for_chain(chain=obtained_chain,
                                  train_input=train_input,
                                  predict_input=predict_input)

    display_validation_metric(predicted=preds,
                              real=test_part,
                              actual_values=time_series,
                              is_visualise=with_visualisation)

    display_chain_info(obtained_chain)
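
get_source_chain returns the pre-defined initial chain, whose structure is not shown in this snippet. Below is a hypothetical sketch for the metocean forecasting case, assuming the PrimaryNode/SecondaryNode/Chain API of older FEDOT releases; the lagged-to-ridge structure is an assumption, not the original chain, and import paths may vary across versions.

from fedot.core.chains.chain import Chain
from fedot.core.chains.node import PrimaryNode, SecondaryNode

def get_source_chain():
    """ A hypothetical initial chain: lagged transformation feeding ridge models """
    node_lagged = PrimaryNode('lagged')
    node_ridge = SecondaryNode('ridge', nodes_from=[node_lagged])
    node_final = SecondaryNode('ridge', nodes_from=[node_ridge])
    return Chain(node_final)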
Example #30
def run_experiment(file_path, init_chain):
    # Read dataframe and prepare train and test data
    df = pd.read_csv(file_path)
    features = np.array(df[['level_station_1', 'mean_temp', 'month',
                            'precip']])
    target = np.array(df['level_station_2'])
    x_data_train, x_data_test, y_data_train, y_data_test = train_test_split(
        features, target, test_size=0.2, shuffle=True, random_state=10)
    y_data_test = np.ravel(y_data_test)

    # Define regression task
    task = Task(TaskTypesEnum.regression)

    # Prepare data to train the model
    train_input = InputData(idx=np.arange(0, len(x_data_train)),
                            features=x_data_train,
                            target=y_data_train,
                            task=task,
                            data_type=DataTypesEnum.table)

    predict_input = InputData(idx=np.arange(0, len(x_data_test)),
                              features=x_data_test,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.table)

    available_operations_types = [
        'ridge', 'lasso', 'dtreg', 'xgbreg', 'adareg', 'knnreg', 'linear',
        'svr', 'poly_features', 'scaling', 'ransac_lin_reg', 'rfe_lin_reg',
        'pca', 'ransac_non_lin_reg', 'rfe_non_lin_reg', 'normalization'
    ]

    composer_requirements = GPComposerRequirements(
        primary=['one_hot_encoding'],
        secondary=available_operations_types,
        max_arity=3,
        max_depth=8,
        pop_size=10,
        num_of_generations=5,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=5),
        allow_single_operations=True)

    metric_function = MetricsRepository().metric_by_id(
        RegressionMetricsEnum.MAE)
    builder = GPComposerBuilder(
        task=task).with_requirements(composer_requirements).with_metrics(
            metric_function).with_initial_chain(init_chain)
    composer = builder.build()

    obtained_chain = composer.compose_chain(data=train_input,
                                            is_visualise=False)

    # Display info about obtained chain
    obtained_models, depth = get_chain_info(chain=obtained_chain)

    preds = fit_predict_for_chain(chain=obtained_chain,
                                  train_input=train_input,
                                  predict_input=predict_input)

    # squared=False makes mean_squared_error return the RMSE
    rmse_value = mean_squared_error(y_data_test, preds, squared=False)
    mae_value = mean_absolute_error(y_data_test, preds)
    print(f'RMSE - {rmse_value:.2f}')
    print(f'MAE - {mae_value:.2f}\n')
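
get_chain_info is used above to inspect the obtained chain; a minimal sketch under the assumption that it returns the list of operations in the chain together with the chain depth:

def get_chain_info(chain):
    """ Operations in the chain and its depth (a sketch based on the call site) """
    # str(node) yields the operation name for each node in the chain
    obtained_models = [str(node) for node in chain.nodes]
    depth = int(chain.depth)
    return obtained_models, depth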