示例#1
0
File: model.py — Project: STATAN/FEDOT
def _eval_strategy_for_task(model_type: str,
                            task_type_for_data: TaskTypesEnum):
    """Resolve the evaluation strategy for ``model_type`` under the data's task.

    If the model does not support the data's task type directly, fall back to
    a globally-compatible task type that the model does accept (so the model
    can still participate as part of a chain solving the original task).

    :param model_type: repository id of the model
    :param task_type_for_data: task type derived from the input data
    :return: strategy resolved via ``current_strategy`` for the chosen task
    :raises ValueError: if no compatible task type exists for the model
    """
    models_repo = ModelTypesRepository()
    model_info = models_repo.model_info_by_id(model_type)

    task_type_for_model = task_type_for_data
    task_types_acceptable_for_model = model_info.task_type

    # if the model can't be used directly for the task type from data
    if task_type_for_model not in task_types_acceptable_for_model:
        # search the supplementary task types, that can be included in chain
        # which solves original task
        globally_compatible_task_types = compatible_task_types(
            task_type_for_model)
        compatible_acceptable = (set(task_types_acceptable_for_model) &
                                 set(globally_compatible_task_types))
        if not compatible_acceptable:
            raise ValueError(
                f'Model {model_type} can not be used as a part of {task_type_for_model}.'
            )
        task_type_for_model = list(compatible_acceptable)[0]

    # Fix: reuse the model_info fetched above instead of performing a second
    # identical repository lookup.
    return model_info.current_strategy(task_type_for_model)
示例#2
0
def get_model(train_file_path: str,
              cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    """Compose and fit a classification chain for the given training CSV.

    :param train_file_path: path to the CSV file with training data
    :param cur_lead_time: time budget for the composition search
    :return: the composed and fitted chain
    """
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)

    # models provided by the framework that can serve as chain nodes
    # for the selected task (restricted to the 'simple' ones)
    repo = ModelTypesRepository()
    available_model_types, _ = repo.suitable_model(task_type=task.task_type,
                                                   tags=['simple'])

    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC_penalty)

    requirements = GPComposerRequirements(primary=available_model_types,
                                          secondary=available_model_types,
                                          max_lead_time=cur_lead_time)

    # GP-based composer that searches for the optimal composite-model structure
    composer = (GPComposerBuilder(task)
                .with_requirements(requirements)
                .with_metrics(metric_function)
                .build())

    # run the search for the best suitable model and fit it
    chain = composer.compose_chain(data=dataset_to_compose,
                                   is_visualise=False)
    chain.fit(input_data=dataset_to_compose)
    return chain
示例#3
0
def _eval_strategy_for_task(model_type: ModelTypesIdsEnum,
                            task_type_for_data: TaskTypesEnum):
    """Instantiate an evaluation strategy for the given model and task.

    Falls back to a compatible task type when the model does not support the
    data's task directly. Returns ``None`` when no registered strategy
    handles the model.
    """
    # Which strategies may serve each task type.
    task_to_strategies = {
        MachineLearningTasksEnum.classification:
            [SkLearnClassificationStrategy, AutoMLEvaluationStrategy],
        MachineLearningTasksEnum.regression: [SkLearnRegressionStrategy],
        MachineLearningTasksEnum.auto_regression:
            [StatsModelsAutoRegressionStrategy],
        MachineLearningTasksEnum.clustering: [SkLearnClusteringStrategy],
    }

    # Which models each strategy is able to evaluate.
    strategy_to_models = {
        SkLearnClassificationStrategy: [
            ModelTypesIdsEnum.xgboost, ModelTypesIdsEnum.knn,
            ModelTypesIdsEnum.logit, ModelTypesIdsEnum.dt,
            ModelTypesIdsEnum.rf, ModelTypesIdsEnum.mlp,
            ModelTypesIdsEnum.lda, ModelTypesIdsEnum.qda,
        ],
        AutoMLEvaluationStrategy:
            [ModelTypesIdsEnum.tpot, ModelTypesIdsEnum.h2o],
        SkLearnClusteringStrategy: [ModelTypesIdsEnum.kmeans],
        SkLearnRegressionStrategy: [
            ModelTypesIdsEnum.linear, ModelTypesIdsEnum.ridge,
            ModelTypesIdsEnum.lasso,
        ],
        StatsModelsAutoRegressionStrategy:
            [ModelTypesIdsEnum.ar, ModelTypesIdsEnum.arima],
    }

    repo = ModelTypesRepository()
    _, found = repo.search_models(desired_ids=[model_type])

    task_type_for_model = task_type_for_data
    acceptable_task_types = found[0].task_type

    # if the model can't be used directly for the task type from data
    if task_type_for_model not in acceptable_task_types:
        # look for supplementary task types that can be included in a chain
        # which still solves the original task
        compatible = set(acceptable_task_types).intersection(
            compatible_task_types(task_type_for_model))
        if not compatible:
            raise ValueError(
                f'Model {model_type} can not be used as a part of {task_type_for_model}.'
            )
        task_type_for_model = list(compatible)[0]

    # return the first strategy registered for the task that accepts the model
    for candidate in task_to_strategies[task_type_for_model]:
        if model_type in strategy_to_models[candidate]:
            return candidate(model_type)

    return None
示例#4
0
def test_parameter_free_composer_build_chain_correct(data_fixture, request):
    """Parameter-free GP composer builds a chain scoring > 0.6 ROC AUC."""
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = dataset_to_validate = data

    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=TaskTypesEnum.classification)
    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    # deliberately tiny search so the test stays fast
    requirements = GPComposerRequirements(primary=available_model_types,
                                          secondary=available_model_types,
                                          max_arity=2,
                                          max_depth=2,
                                          pop_size=2,
                                          num_of_generations=1,
                                          crossover_prob=0.4,
                                          mutation_prob=0.5)
    optimiser_params = GPChainOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free)

    composer = (GPComposerBuilder(task=Task(TaskTypesEnum.classification))
                .with_requirements(requirements)
                .with_metrics(metric_function)
                .with_optimiser_parameters(optimiser_params)
                .build())
    chain = composer.compose_chain(data=dataset_to_compose)

    chain.fit_from_scratch(input_data=dataset_to_compose)
    prediction = chain.predict(dataset_to_validate)

    roc = roc_auc(y_true=dataset_to_validate.target,
                  y_score=prediction.predict)

    assert roc > 0.6
示例#5
0
def test_random_composer(data_fixture, request):
    """Random-search composer produces a chain with ROC AUC above 0.6."""
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = dataset_to_validate = data

    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=TaskTypesEnum.classification)
    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    # a single random-search iteration keeps the test quick
    composer = RandomSearchComposer(iter_num=1)
    requirements = ComposerRequirements(primary=available_model_types,
                                        secondary=available_model_types)
    chain = composer.compose_chain(data=dataset_to_compose,
                                   initial_chain=None,
                                   composer_requirements=requirements,
                                   metrics=metric_function)
    chain.fit_from_scratch(input_data=dataset_to_compose)

    prediction = chain.predict(dataset_to_validate)

    roc = roc_auc(y_true=dataset_to_validate.target,
                  y_score=prediction.predict)

    assert roc > 0.6
示例#6
0
def test_search_in_repository_by_tag_and_metainfo_correct():
    """Regression models tagged 'ml' include 'linear' and number exactly 3."""
    with ModelTypesRepository(mocked_path()) as repo:
        found_names, _ = repo.suitable_model(task_type=TaskTypesEnum.regression,
                                             tags=['ml'])

        assert 'linear' in found_names
        assert len(found_names) == 3
示例#7
0
 def set_default_composer_params(self):
     """Fill in composer requirements and metrics when the caller set none.

     Uses every model suitable for the task as both primary and secondary
     candidates, and picks a default quality metric by task type:
     penalised ROC AUC by default, RMSE for regression and time-series
     forecasting tasks.
     """
     if not self._composer.composer_requirements:
         models, _ = ModelTypesRepository().suitable_model(task_type=self.task.task_type)
         self._composer.composer_requirements = GPComposerRequirements(primary=models, secondary=models)
     if not self._composer.metrics:
         metric_function = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)
         if self.task.task_type in (TaskTypesEnum.regression, TaskTypesEnum.ts_forecasting):
             # regression-like tasks are scored with RMSE instead of ROC AUC
             metric_function = MetricsRepository().metric_by_id(RegressionMetricsEnum.RMSE)
         self._composer.metrics = metric_function
示例#8
0
def test_search_in_repository_by_tag_correct():
    """Tag search: full match is strict, partial match is inclusive."""
    with ModelTypesRepository(mocked_path()) as repo:
        # full match: a model must carry every requested tag
        names, _ = repo.models_with_tag(tags=['simple', 'linear'],
                                        is_full_match=True)
        assert {'linear', 'logit', 'lasso', 'ridge'}.issubset(names)
        assert len(names) == 4

        # partial match: any of the requested tags suffices
        names, _ = repo.models_with_tag(tags=['simple', 'linear'])
        assert {'linear', 'logit', 'knn', 'lda', 'lasso',
                'ridge'}.issubset(names)
        assert len(names) == 6

        # an unknown tag yields no models
        names, _ = repo.models_with_tag(tags=['non_real_tag'])
        assert len(names) == 0
示例#9
0
def test_chain_from_automl_example():
    """The AutoML example chain scores AUC above 0.5 on the sample data."""
    root = str(project_root())
    repo_path = os.path.join(
        root, 'fedot/core/repository/data/model_repository_with_automl.json')
    data_path = os.path.join(root, 'test/data/simple_classification.csv')

    # temporarily switch the repository to the AutoML-enabled one
    with ModelTypesRepository(repo_path) as _:
        # train and test on the same sample file
        auc = run_chain_from_automl(data_path,
                                    data_path,
                                    max_run_time=timedelta(seconds=1))

    assert auc > 0.5
def run_credit_scoring_problem(train_file_path, test_file_path,
                               max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
                               gp_optimiser_params: Optional[GPChainOptimiserParameters] = None, pop_size=None,
                               generations=None):
    """Compose, fit and validate a chain for the credit-scoring problem.

    :param train_file_path: CSV with training data
    :param test_file_path: CSV with validation data
    :param max_lead_time: time budget for the composition search
    :param gp_optimiser_params: optional custom GP optimiser parameters
    :param pop_size: population size for the GP search
    :param generations: number of GP generations
    :return: (validation ROC AUC, composed chain, composer)
    """
    dataset_to_compose = InputData.from_csv(train_file_path)
    dataset_to_validate = InputData.from_csv(test_file_path)

    # models provided by the framework usable as chain nodes for classification
    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=TaskTypesEnum.classification)

    # the metric used for chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC)

    if gp_optimiser_params:
        optimiser_parameters = gp_optimiser_params
    else:
        # default GP operators: tournament selection, subtree crossover,
        # three mutation kinds and decremental regularization
        optimiser_parameters = GPChainOptimiserParameters(
            selection_types=[SelectionTypesEnum.tournament],
            crossover_types=[CrossoverTypesEnum.subtree],
            mutation_types=[MutationTypesEnum.simple, MutationTypesEnum.growth,
                            MutationTypesEnum.reduce],
            regularization_type=RegularizationTypesEnum.decremental)

    requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_arity=4, max_depth=3, pop_size=pop_size,
        num_of_generations=generations, crossover_prob=0.8,
        mutation_prob=0.8, max_lead_time=max_lead_time)

    # GP-based composer
    composer = GPComposer()

    chain = composer.compose_chain(data=dataset_to_compose,
                                   initial_chain=None,
                                   composer_requirements=requirements,
                                   metrics=metric_function,
                                   optimiser_parameters=optimiser_parameters,
                                   is_visualise=False)
    chain.fit(input_data=dataset_to_compose, verbose=True)

    roc_on_valid = calculate_validation_metric(chain, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid, 3)}')

    return roc_on_valid, chain, composer
示例#11
0
def test_gp_composer_builder():
    """GPComposerBuilder applies custom parameters and sensible defaults."""
    task = Task(TaskTypesEnum.classification)

    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=task.task_type)

    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=3,
        max_depth=3,
        pop_size=5,
        num_of_generations=4,
        crossover_prob=0.8,
        mutation_prob=1,
        max_lead_time=datetime.timedelta(minutes=5))

    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=scheme_type)

    builder_with_custom_params = GPComposerBuilder(
        task=task).with_requirements(composer_requirements).with_metrics(
            metric_function).with_optimiser_parameters(optimiser_parameters)

    composer_with_custom_params = builder_with_custom_params.build()

    # custom settings must survive the build
    assert composer_with_custom_params.optimiser.parameters.genetic_scheme_type == scheme_type
    assert composer_with_custom_params.metrics == metric_function
    assert composer_with_custom_params.composer_requirements.pop_size == 5
    assert composer_with_custom_params.composer_requirements.mutation_prob == 1

    builder_with_default_params = GPComposerBuilder(task=task)
    composer_with_default_params = builder_with_default_params.build()

    # Fix: reference the enum member directly. The original wrote
    # ClassificationMetricsEnum.ROCAUC.ROCAUC_penalty, i.e. accessed one enum
    # member through another — deprecated in Python 3.11 and removed in 3.12.
    default_metric = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC_penalty)

    assert composer_with_default_params.optimiser.parameters.genetic_scheme_type == GeneticSchemeTypesEnum.generational
    assert composer_with_default_params.metrics == default_metric
    assert composer_with_default_params.composer_requirements.pop_size == 20
    assert composer_with_default_params.composer_requirements.mutation_prob == 0.8
示例#12
0
def run_credit_scoring_problem(
        train_file_path,
        test_file_path,
        max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
        is_visualise=False,
        with_tuning=False):
    """Compose, optionally tune/visualise, and validate a classification chain.

    :param train_file_path: CSV with training data
    :param test_file_path: CSV with validation data
    :param max_lead_time: time budget for the composition search
    :param is_visualise: when True, render composition history and best chain
    :param with_tuning: when True, fine-tune primary nodes before fitting
    :return: ROC AUC of the composed chain on the validation data
    """
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # the search of the models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=task.task_type)

    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC_penalty)

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=3,
        max_depth=3,
        pop_size=20,
        num_of_generations=20,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=max_lead_time)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=scheme_type)

    # Create builder for composer and set composer params
    builder = GPComposerBuilder(
        task=task).with_requirements(composer_requirements).with_metrics(
            metric_function).with_optimiser_parameters(optimiser_parameters)

    # Create GP-based composer
    composer = builder.build()

    # the optimal chain generation by composition - the most time-consuming task
    # Fix: honour the caller's `is_visualise` flag; it was hard-coded to True
    # even though the function accepts the parameter.
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                is_visualise=is_visualise)

    if with_tuning:
        chain_evo_composed.fine_tune_primary_nodes(
            input_data=dataset_to_compose, iterations=50, verbose=True)

    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True)

    if is_visualise:
        visualiser = ChainVisualiser()

        composer.log.info('History visualization started')
        visualiser.visualise_history(composer.history)
        composer.log.info('History visualization finished')
        composer.history.write_composer_history_to_csv()

        composer.log.info('Best chain visualization started')
        visualiser.visualise(chain_evo_composed)
        composer.log.info('Best chain visualization finished')

    # the quality assessment for the obtained composite models
    roc_on_valid_evo_composed = calculate_validation_metric(
        chain_evo_composed, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed
示例#13
0
def run_fedot(params: 'ExecutionParams'):
    """Train (or load a cached) FEDOT chain for the case and predict.

    :param params: execution parameters (file paths, case label, task)
    :return: (validation targets, raw predictions, label predictions)
    :raises NotImplementedError: for tasks other than classification/regression
    """
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task_type = params.task

    # supported tasks and their quality metrics
    metric_by_task = {
        TaskTypesEnum.classification: ClassificationMetricsEnum.ROCAUC,
        TaskTypesEnum.regression: RegressionMetricsEnum.RMSE,
    }
    if task_type not in metric_by_task:
        raise NotImplementedError()
    metric = metric_by_task[task_type]

    metric_func = MetricsRepository().metric_by_id(metric)

    task = Task(task_type)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    models_hyperparameters = get_models_hyperparameters()['FEDOT']
    cur_lead_time = models_hyperparameters['MAX_RUNTIME_MINS']

    saved_model_name = f'fedot_{case_label}_{task_type.name}_{cur_lead_time}_{metric.name}'
    loaded_model = load_fedot_model(saved_model_name)

    if loaded_model:
        chain = loaded_model
    else:
        generations = models_hyperparameters['GENERATIONS']
        population_size = models_hyperparameters['POPULATION_SIZE']

        # models usable as chain nodes, minus the computationally heavy ones
        available_model_types, _ = ModelTypesRepository().suitable_model(
            task.task_type)
        heavy_models = {'svc', 'multinb', 'tfidf', 'qda'}
        available_model_types = [m for m in available_model_types
                                 if m not in heavy_models]

        # the choice and initialisation of the GP search
        composer_requirements = GPComposerRequirements(
            primary=available_model_types,
            secondary=available_model_types,
            max_arity=3,
            max_depth=2,
            pop_size=population_size,
            num_of_generations=generations,
            crossover_prob=0.8,
            mutation_prob=0.8,
            max_lead_time=datetime.timedelta(minutes=cur_lead_time),
            add_single_model_chains=True)

        # build the GP-based composer and run the search
        gp_composer = (GPComposerBuilder(task)
                       .with_requirements(composer_requirements)
                       .with_metrics(metric_func)
                       .build())
        chain = gp_composer.compose_chain(data=dataset_to_compose)

        chain.fit_from_scratch(input_data=dataset_to_compose)
        save_fedot_model(chain, saved_model_name)

    evo_predicted = chain.predict(dataset_to_validate)
    evo_predicted_labels = chain.predict(dataset_to_validate,
                                         output_mode='labels')

    return dataset_to_validate.target, evo_predicted.predict, evo_predicted_labels.predict
示例#14
0
File: model.py — Project: STATAN/FEDOT
 def acceptable_task_types(self):
     """Return the task types this model supports, per the repository."""
     return ModelTypesRepository().model_info_by_id(self.model_type).task_type
示例#15
0
def test_lazy_load():
    """A second repository instance reuses the already-loaded repo object."""
    with ModelTypesRepository(mocked_path()) as first:
        second = ModelTypesRepository()

        # lazy loading: both instances share the same underlying repo
        assert first._repo == second._repo
示例#16
0
File: model.py — Project: STATAN/FEDOT
 def metadata(self) -> ModelMetaInfo:
     """Return repository metadata for this model.

     :raises ValueError: if the model id is unknown to the repository
     """
     info = ModelTypesRepository().model_info_by_id(self.model_type)
     if not info:
         raise ValueError(f'Model {self.model_type} not found')
     return info
示例#17
0
                                         return_df: bool = False):
    df = pd.read_excel(file_path)
    train, test = split_data(df)
    file_dir_name = file_path.replace('.', '/').split('/')[-2]
    file_csv_name = f'{file_dir_name}.csv'
    directory_names = ['examples', 'data', file_dir_name]
    ensure_directory_exists(directory_names)
    if return_df:
        path = os.path.join(directory_names[0], directory_names[1],
                            directory_names[2], file_csv_name)
        full_file_path = os.path.join(str(project_root()), path)
        save_file_to_csv(df, full_file_path)
        return df, full_file_path
    else:
        full_train_file_path, full_test_file_path = get_split_data_paths(
            directory_names)
        save_file_to_csv(train, full_train_file_path)
        save_file_to_csv(train, full_test_file_path)
        return full_train_file_path, full_test_file_path


def print_models_info(repository: ModelTypesRepository,
                      task=TaskTypesEnum.classification):
    """Print id, strategy and implementation info for every repository model.

    :param repository: repository whose models are listed
    :param task: task type used to resolve each model's current strategy
    """
    for model in repository.models:
        # Fix: resolve the strategy once per model instead of calling
        # current_strategy(task) twice.
        strategy = model.current_strategy(task)
        print(f'{model.id}, {strategy}, '
              f'{strategy(model.id).implementation_info}')


if __name__ == '__main__':
    # Script entry point: dump info for every model in the default repository.
    print_models_info(ModelTypesRepository())
示例#18
0
def run(dataset, config):
    """Benchmark entry point: fit a single-node FEDOT chain and predict.

    Fits a one-node chain ('logit' for classification, 'lasso' for
    regression) on the benchmark dataset and returns the harness's
    ``result`` record with label predictions (and class probabilities for
    classification).

    :param dataset: benchmark dataset with ``train``/``test`` splits
    :param config: benchmark config (task type, metric, framework params)
    :raises ValueError: if the benchmark metric has no FEDOT mapping
    """
    log.info("\n**** FEDOT ****\n")

    is_classification = config.type == 'classification'
    # Mapping of benchmark metrics to FEDOT metrics
    metrics_mapping = dict(acc='accuracy',
                           auc='roc_auc',
                           f1='f1',
                           logloss='neg_log_loss',
                           mae='neg_mean_absolute_error',
                           mse='neg_mean_squared_error',
                           msle='neg_mean_squared_log_error',
                           r2='r2',
                           rmse='neg_mean_squared_error')
    scoring_metric = metrics_mapping.get(config.metric)
    if scoring_metric is None:
        raise ValueError("Performance metric {} not supported.".format(
            config.metric))

    task_type = (TaskTypesEnum.classification if is_classification
                 else TaskTypesEnum.regression)
    task = Task(task_type)

    x_test = dataset.test.X_enc

    # Fix: the original first assigned x_train/y_train directly and then
    # immediately overwrote them with this shuffled copy; keep only the
    # shuffle (deterministic via random_state=0).
    x_train, y_train = shuffle(dataset.train.X_enc,
                               dataset.train.y_enc,
                               random_state=0)

    # flatten a single-column 2-D target into 1-D
    if len(y_train.shape) > 1 and y_train.shape[1] == 1:
        y_train = np.squeeze(y_train, axis=1)

    dataset_to_compose = \
        InputData(idx=[_ for _ in range(len(y_train))],
                  features=x_train,
                  target=y_train,
                  task=task,
                  data_type=DataTypesEnum.table)

    # NOTE(review): idx is sized by len(y_train) even for the test features —
    # confirm whether it should be len(x_test) instead.
    dataset_to_test = \
        InputData(idx=[_ for _ in range(len(y_train))],
                  features=x_test,
                  target=None,
                  task=task,
                  data_type=DataTypesEnum.table)

    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config

    log.info(
        'Running FEDOT with a maximum time of %ss on %s cores, optimizing %s.',
        config.max_runtime_seconds, n_jobs, scoring_metric)

    # Fix: the GP-composer search path was permanently disabled behind
    # `if False:` in the original; the dead branch and the setup used only
    # by it (model repository lookup, metric function, GPComposer instance,
    # runtime_min, unused training_params) have been removed. A fixed
    # single-model chain is fitted instead, as before.
    if is_classification:
        chain_evo_composed = Chain(PrimaryNode('logit'))
    else:
        chain_evo_composed = Chain(PrimaryNode('lasso'))

    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=False)

    log.info('Predicting on the test set.')
    y_test = dataset.test.y_enc
    predictions = chain_evo_composed.predict(dataset_to_test,
                                             output_mode='labels').predict

    probabilities = None
    if is_classification:
        probabilities = chain_evo_composed.predict(
            dataset_to_test, output_mode='full_probs').predict

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=1,
                  training_duration=1)